In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghckd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train = pd.read_csv(filepath_or_buffer="../data/train.csv")
test = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [3]:
def preprocess(data):
    def text_cleaner(text:str):
        if text is not np.nan:
            text = text.lower()
            text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
            stop = stopwords.words("english")
            text = " ".join([word for word in text.split() if word not in (stop)])
        
        return text

    def keyword_preprocessor(x:str):
        if "hellfire" in x:
            x = "%20".join(("hell", "fire"))
        if "wildfire" in x:
            x = "%20".join(("wild", "fire"))
        
        if "%20" in x:
            x = x.split("%20")
        else:
            x = [x]
    
        return x

    def lemmatizer(word_list):
        lemmatizer = nltk.wordnet.WordNetLemmatizer()
        for i, word in enumerate(word_list):
            word_list[i] = lemmatizer.lemmatize(word)
            word_list[i] = lemmatizer.lemmatize(word_list[i], "v")
            word_list[i] = lemmatizer.lemmatize(word_list[i], pos="a")
        return word_list

    for col in ["location", "text"]:
        data[col] = data[col].apply(text_cleaner)   
         
    data["keyword"] = data["keyword"].fillna("")
    data["keyword_split"] = data["keyword"].apply(keyword_preprocessor)  
    data["keyword_split"] = data["keyword_split"].apply(lemmatizer)      

    # data["location"] = data["location"].fillna("")
    
    # data["text"] = data["text"].fillna("")
    # data["text_tokenized"] = data["text"].apply(nltk.word_tokenize)
    # data["text_tokenized"] = data["text_tokenized"].apply(lemmatizer)
    
    data = data.drop(columns=["id", "keyword", "location"]) ##, "keyword", "location"
    
    return data

In [4]:
train = preprocess(train)
test = preprocess(test)

### modeling

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from sklearn.metrics import accuracy_score

In [6]:
lemmatizer = nltk.wordnet.WordNetLemmatizer()

def lemmatize_tokenizer(text):
    tokens = nltk.word_tokenize(text)    
    for i, token in enumerate(tokens):
        tokens[i] = lemmatizer.lemmatize(token)
        tokens[i] = lemmatizer.lemmatize(tokens[i], "v")
        tokens[i] = lemmatizer.lemmatize(tokens[i], pos="a")
    return tokens

In [14]:
count_vectorizer = CountVectorizer(tokenizer=lemmatize_tokenizer)
X_train_count = count_vectorizer.fit_transform(train["text"])
X_test_count = count_vectorizer.transform(test["text"])

tfidf_vectorizer = TfidfVectorizer(tokenizer=lemmatize_tokenizer)
X_train_tf = tfidf_vectorizer.fit_transform(train["text"])
X_test_tf = tfidf_vectorizer.transform(test["text"])

y_train = train['target']



#### RF + MNB + LR, tfidf
#### 훈련 셋에 대한 정확도 0.9125180612110863,  0.80294

#### RF + MNB + LR, count + tfidf
#### 훈련 셋에 대한 정확도 

In [1]:
rf_model = RandomForestClassifier(
    n_estimators=1000, 
    random_state=0, 
)
mnb_model = MultinomialNB()
lr_model = LogisticRegression()

ct_voting_classifier = VotingClassifier(
    estimators=[('random_forest', rf_model), 
                ('multinomial_naive_bayes', mnb_model), 
                ('logistic_regression', lr_model), 
                ],
    voting='hard'
    )

ct_voting_classifier.fit(X_train_count, y_train)
ct_ensemble_train_pred = ct_voting_classifier.predict(X_train_count)
ct_ensemble_pred = ct_voting_classifier.predict(X_test_count)

tf_voting_classifier = VotingClassifier(
    estimators=[('random_forest', rf_model), 
                ('multinomial_naive_bayes', mnb_model), 
                ('logistic_regression', lr_model), 
                ],
    voting='hard'
    )

tf_voting_classifier.fit(X_train_count, y_train)
tf_ensemble_train_pred = tf_voting_classifier.predict(X_train_tf)
tf_ensemble_pred = tf_voting_classifier.predict(X_test_tf)

NameError: name 'RandomForestClassifier' is not defined

#### RF + MNB + LR + BNB, lemma

In [8]:
# rf_model = RandomForestClassifier(
#     n_estimators=1000, 
#     random_state=0, 
# )
# mnb_model = MultinomialNB()
# lr_model = LogisticRegression()
# bnb_model = BernoulliNB()

# voting_classifier = VotingClassifier(
#     estimators=[('random_forest', rf_model), 
#                 ('multinomial_naive_bayes', mnb_model), 
#                 ('logistic_regression', lr_model), 
#                 ('bernoulli_naive_bayes', bnb_model),
#                 ],
#     voting='hard'
#     )

# voting_classifier.fit(X_train, y_train)
# ensemble_train_pred = voting_classifier.predict(X_train)
# ensemble_pred = voting_classifier.predict(X_test)

## 훈련 셋에 대한 정확도 

In [9]:
print("훈련 셋에 대한 정확도", accuracy_score(y_train, ensemble_train_pred))

훈련 셋에 대한 정확도 0.9125180612110863


In [10]:
ensemble_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [12]:
submit = pd.read_csv("../submit/sample_submission.csv")
submit["target"] = ensemble_pred
# submit.to_csv("../submit/submit_ensemble_tfidf.csv", index=False)

### Cross Validation

In [82]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
cross_val_predict(voting_classifier, X_train, y_train)

In [None]:
sns.heatmap(confusion_matrix(y_true, y_pred), annot=True, cmap="Blues")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

### Optimization