In [14]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghckd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
train = pd.read_csv(filepath_or_buffer="../data/train.csv")
test = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [16]:
def preprocess(data):
    def text_cleaner(text:str):
        if text is not np.nan:
            text = text.lower()
            text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
            stop = stopwords.words("english")
            text = " ".join([word for word in text.split() if word not in (stop)])
        
        return text

    def keyword_preprocessor(x:str):
        if "hellfire" in x:
            x = "%20".join(("hell", "fire"))
        if "wildfire" in x:
            x = "%20".join(("wild", "fire"))
        
        if "%20" in x:
            x = x.split("%20")
        else:
            x = [x]
    
        return x

    def lemmatizer(word_list):
        lemmatizer = nltk.wordnet.WordNetLemmatizer()
        for i, word in enumerate(word_list):
            word_list[i] = lemmatizer.lemmatize(word)
            word_list[i] = lemmatizer.lemmatize(word_list[i], "v")
            word_list[i] = lemmatizer.lemmatize(word_list[i], pos="a")
        return word_list

    for col in ["location", "text"]:
        data[col] = data[col].apply(text_cleaner)   
         
    data["keyword"] = data["keyword"].fillna("")
    data["keyword_split"] = data["keyword"].apply(keyword_preprocessor)  
    data["keyword_split"] = data["keyword_split"].apply(lemmatizer)      

    # data["location"] = data["location"].fillna("")
    
    # data["text"] = data["text"].fillna("")
    # data["text_tokenized"] = data["text"].apply(nltk.word_tokenize)
    # data["text_tokenized"] = data["text_tokenized"].apply(lemmatizer)
    
    data = data.drop(columns=["id", "keyword", "location"]) ##, "keyword", "location"
    
    return data

In [17]:
train = preprocess(train)
test = preprocess(test)

### modeling

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

from sklearn.metrics import accuracy_score

In [19]:
lemmatizer = nltk.wordnet.WordNetLemmatizer()

def lemmatize_tokenizer(text):
    tokens = nltk.word_tokenize(text)    
    for i, token in enumerate(tokens):
        tokens[i] = lemmatizer.lemmatize(token)
        tokens[i] = lemmatizer.lemmatize(tokens[i], "v")
        tokens[i] = lemmatizer.lemmatize(tokens[i], pos="a")
    return tokens

In [20]:
vectorizer = CountVectorizer(tokenizer=lemmatize_tokenizer)
X_train = vectorizer.fit_transform(train["text"])
X_test = vectorizer.transform(test["text"])
y_train = train['target']



#### RF + MNB + LR, lemma

In [21]:
# rf_model = RandomForestClassifier(
#     n_estimators=1000, 
#     random_state=0, 
# )
# mnb_model = MultinomialNB()
# lr_model = LogisticRegression(
#     max_iter=100,
#     random_state=0
# )

# voting_classifier = VotingClassifier(
#     estimators=[('random_forest', rf_model), 
#                 ('multinomial_naive_bayes', mnb_model), 
#                 ('logistic_regression', lr_model), 
#                 ],
#     voting='soft'
#     )

# voting_classifier.fit(X_train, y_train)
# ensemble_train_pred = voting_classifier.predict(X_train)
# ensemble_pred = voting_classifier.predict(X_test)
# print("훈련 셋에 대한 정확도", accuracy_score(y_train, ensemble_train_pred))

#### RF + MNB + LR + BNB, lemma

In [22]:
rf_model = RandomForestClassifier(
    n_estimators=1000, 
    random_state=0, 
)
mnb_model = MultinomialNB()
lr_model = LogisticRegression(
    random_state=0
)
bnb_model = BernoulliNB()

voting_classifier = VotingClassifier(
    estimators=[('random_forest', rf_model), 
                ('multinomial_naive_bayes', mnb_model), 
                ('logistic_regression', lr_model), 
                ('bernoulli_naive_bayes', bnb_model),
                ],
    voting='soft'
    )
voting_classifier.fit(X_train, y_train)
ensemble_train_pred = voting_classifier.predict(X_train)
ensemble_pred = voting_classifier.predict(X_test)
print("훈련 셋에 대한 정확도", accuracy_score(y_train, ensemble_train_pred))
# ## 훈련 셋에 대한 정확도 0.9255221331932221,  0.80355

훈련 셋에 대한 정확도 0.926310258767897


#### RF + BNB + LR, lemma

In [None]:
# rf_model = RandomForestClassifier(
#     n_estimators=1000, 
#     random_state=0, 
# )
# bnb_model = BernoulliNB()
# lr_model = LogisticRegression()

# voting_classifier = VotingClassifier(
#     estimators=[('random_forest', rf_model), 
#                 ('bernoulli_naive_bayes', bnb_model),
#                 ('logistic_regression', lr_model), 
#                 ],
#     voting='soft'
#     )
# voting_classifier.fit(X_train, y_train)
# ensemble_train_pred = voting_classifier.predict(X_train)
# ensemble_pred = voting_classifier.predict(X_test)
# print("훈련 셋에 대한 정확도", accuracy_score(y_train, ensemble_train_pred))
## 훈련 셋에 대한 정확도 0.9557336135557599,  0.8106

In [10]:
print("훈련 셋에 대한 정확도", accuracy_score(y_train, ensemble_train_pred))

훈련 셋에 대한 정확도 0.9461447523972153


In [12]:
ensemble_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [24]:
submit = pd.read_csv("../submit/sample_submission.csv")
submit["target"] = ensemble_pred
# submit.to_csv("../submit/submit_ensemble_4_soft.csv", index=False)

1. random_forest, multinomial_naive_bayes, logistic_regression -> hard
   * 0.9573098647051097
   * 0.81244

2. random_forest, multinomial_naive_bayes, logistic_regression(max_iter=1000) -> hard
   * 0.9573098647051097
   * 0.81244

3. random_forest, multinomial_naive_bayes, logistic_regression -> soft
   * 0.9461447523972153
   * 0.80968

4. random_forest, multinomial_naive_bayes, logistic_regression, bernoulli_naive_bayes -> soft
   * 0.926310258767897
   * 0.80631
