In [1]:
import pandas as pd
# Show full cell contents (no truncation) when displaying text columns.
# `None` is the supported "no limit" value; the old `-1` sentinel is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import string

In [2]:
# Read the three CSV files from the working directory, in order:
# training set, test set, and the sample submission file.
traindata, testdata, submissiondata = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [3]:
# Compiled once so the pattern is not re-parsed for every row. Alternatives, in
# match order: @mentions, any character that is not alphanumeric/space/tab,
# scheme://... URLs, a leading "rt", and "http" plus the one character after it.
_NOISE_PATTERN = re.compile(
    r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
)


def clean_text(df, text_field):
    """Lower-case and strip noise from a text column of ``df``.

    The column is overwritten in place on ``df`` (callers that need the raw
    text should pass a copy), and the same ``df`` object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to clean; it is modified.
    text_field : str
        Name of the text column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``df[text_field]`` cleaned.

    Notes
    -----
    Missing values are left as missing rather than being passed to the regex,
    which would raise ``TypeError``.
    """
    lowered = df[text_field].str.lower()
    # na_action="ignore" skips NaN/None entries so only real strings reach the regex.
    df[text_field] = lowered.map(
        lambda elem: _NOISE_PATTERN.sub("", elem), na_action="ignore"
    )
    return df
# clean_text overwrites the text column of the frame it is given, so pass a copy:
# this keeps the raw `traindata` intact and makes the cell safe to re-run.
data_clean = clean_text(traindata.copy(), "text")
data_clean.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake may allah forgive us all,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected,1
3,6,,,13000 people receive wildfires evacuation orders in california,1
4,7,,,just got sent this photo from ruby alaska as smoke from wildfires pours into a school,1


In [5]:
# Additional tokens to strip from the cleaned tweets.
new_stopwords = ['new','amp','people','one','via']


def _strip_new_stopwords(tweet):
    """Split `tweet` on whitespace, drop tokens listed in `new_stopwords`, re-join with single spaces."""
    kept_tokens = [token for token in tweet.split() if token not in new_stopwords]
    return ' '.join(kept_tokens)


data_clean['text'] = data_clean['text'].apply(_strip_new_stopwords)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [7]:
# Split the cleaned tweets and their labels into train and hold-out sets.
# random_state is fixed so the same split is produced on every run.
tweet_texts = data_clean['text']
tweet_labels = data_clean['target']
X_train, X_test, y_train, y_test = train_test_split(
    tweet_texts,
    tweet_labels,
    random_state=0,
)

In [8]:
from sklearn.linear_model import SGDClassifier

# Token counts -> TF-IDF weights -> linear classifier trained with SGD.
# SGDClassifier shuffles the training data between epochs, so a fixed
# random_state is needed for the fitted model (and the metrics and submission
# derived from it) to be reproducible across kernel restarts.
pipeline_sgd = Pipeline([
    ('bow', CountVectorizer()),       # strings to token integer counts
    ('tfidf',  TfidfTransformer()),   # integer counts to weighted TF-IDF scores
    ('classifier', SGDClassifier(random_state=0)),
])
# Pipeline.fit returns the fitted pipeline itself, so model1 is pipeline_sgd.
model1 = pipeline_sgd.fit(X_train, y_train)


# Predict on the hold-out split and print per-class precision / recall / F1.
y_predict = model1.predict(X_test)
sgd_report = classification_report(y_test, y_predict)
print(sgd_report)

              precision    recall  f1-score   support

           0       0.81      0.86      0.83      1107
           1       0.79      0.71      0.75       797

    accuracy                           0.80      1904
   macro avg       0.80      0.79      0.79      1904
weighted avg       0.80      0.80      0.80      1904



In [10]:
# Apply the same preprocessing used for training to a copy of the test set
# (regex cleaning, then stopword removal) and keep only the resulting text Series.
submission_test_clean = (
    clean_text(testdata.copy(), "text")["text"]
    .apply(
        lambda tweet: ' '.join(
            token for token in tweet.split() if token not in new_stopwords
        )
    )
)
submission_test_clean.head()

0    just happened a terrible car crash                                                            
1    heard about earthquake is different cities stay safe everyone                                 
2    there is a forest fire at spot pond geese are fleeing across the street i cannot save them all
3    apocalypse lighting spokane wildfires                                                         
4    typhoon soudelor kills 28 in china and taiwan                                                 
Name: text, dtype: object

In [11]:
# Predict targets for the cleaned test-set tweets with the fitted SGD pipeline.
submission_test_pred1 = model1.predict(submission_test_clean)

In [12]:
# Pair each test-set id with the SGD pipeline's predicted target and preview the result.
id_col = testdata['id']
submission_df_1 = pd.DataFrame({"id": id_col, "target": submission_test_pred1})
submission_df_1.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [13]:
# Write the SGD predictions to submission_1.csv; index=False leaves the DataFrame index out of the file.
submission_df_1.to_csv('submission_1.csv', index=False)

In [14]:
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weights -> multinomial Naive Bayes classifier.
nb_steps = [
    ('bow', CountVectorizer()),        # raw strings to a token-count matrix
    ('tfidf', TfidfTransformer()),     # counts re-weighted as TF-IDF scores
    ('classifier', MultinomialNB()),   # Naive Bayes fitted on the TF-IDF features
]
pipeline_nb = Pipeline(nb_steps)
# fit() returns the fitted pipeline itself, so model2 is pipeline_nb.
model2 = pipeline_nb.fit(X_train, y_train)

In [15]:
# Predict on the hold-out split with the Naive Bayes pipeline and print the per-class metrics.
y_predict2 = model2.predict(X_test)
nb_report = classification_report(y_test, y_predict2)
print(nb_report)

              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1107
           1       0.84      0.63      0.72       797

    accuracy                           0.80      1904
   macro avg       0.81      0.77      0.78      1904
weighted avg       0.80      0.80      0.79      1904



In [16]:
# Predict targets for the cleaned test-set tweets with the fitted Naive Bayes pipeline.
submission_test_pred2 = model2.predict(submission_test_clean)

In [17]:
# Pair each test-set id with the Naive Bayes pipeline's predicted target and preview the result.
id_col = testdata['id']
submission_df_2 = pd.DataFrame({"id": id_col, "target": submission_test_pred2})
submission_df_2.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [18]:
# Write the Naive Bayes predictions to submission_2.csv; index=False leaves the DataFrame index out of the file.
submission_df_2.to_csv('submission_2.csv', index=False)

In [19]:
from sklearn.linear_model import LogisticRegression

# Token counts -> TF-IDF weights -> logistic regression classifier.
lr_steps = [
    ('bow', CountVectorizer()),             # raw strings to a token-count matrix
    ('tfidf', TfidfTransformer()),          # counts re-weighted as TF-IDF scores
    ('classifier', LogisticRegression()),   # logistic regression fitted on the TF-IDF features
]
pipeline_LR = Pipeline(lr_steps)
# fit() returns the fitted pipeline itself, so model3 is pipeline_LR.
model3 = pipeline_LR.fit(X_train, y_train)



In [20]:
# Predict on the hold-out split with the logistic regression pipeline and print the per-class metrics.
y_predict3 = model3.predict(X_test)
lr_report = classification_report(y_test, y_predict3)
print(lr_report)

              precision    recall  f1-score   support

           0       0.80      0.90      0.84      1107
           1       0.83      0.68      0.75       797

    accuracy                           0.81      1904
   macro avg       0.81      0.79      0.80      1904
weighted avg       0.81      0.81      0.80      1904



In [21]:
# Predict targets for the cleaned test-set tweets with the fitted logistic regression pipeline.
submission_test_pred3 = model3.predict(submission_test_clean)

In [22]:
# Pair each test-set id with the logistic regression pipeline's predicted target and preview the result.
id_col = testdata['id']
submission_df_3 = pd.DataFrame({"id": id_col, "target": submission_test_pred3})
submission_df_3.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [23]:
# Write the logistic regression predictions to submission_3.csv; index=False leaves the DataFrame index out of the file.
submission_df_3.to_csv('submission_3.csv', index=False)

In [24]:
import xgboost as xgb
from xgboost import XGBClassifier

# Token counts -> TF-IDF weights -> gradient-boosted tree classifier (XGBoost).
xgb_steps = [
    ('bow', CountVectorizer()),         # raw strings to a token-count matrix
    ('tfidf', TfidfTransformer()),      # counts re-weighted as TF-IDF scores
    ('classifier', XGBClassifier()),    # XGBoost classifier fitted on the TF-IDF features
]
pipeline_xgb = Pipeline(xgb_steps)
# fit() returns the fitted pipeline itself, so model4 is pipeline_xgb.
model4 = pipeline_xgb.fit(X_train, y_train)

In [25]:
# Predict on the hold-out split with the XGBoost pipeline and print the per-class metrics.
y_predict4 = model4.predict(X_test)
xgb_report = classification_report(y_test, y_predict4)
print(xgb_report)

              precision    recall  f1-score   support

           0       0.76      0.88      0.82      1107
           1       0.79      0.62      0.70       797

    accuracy                           0.77      1904
   macro avg       0.78      0.75      0.76      1904
weighted avg       0.78      0.77      0.77      1904



In [26]:
# Predict targets for the cleaned test-set tweets with the fitted XGBoost pipeline.
submission_test_pred4 = model4.predict(submission_test_clean)

In [27]:
# Pair each test-set id with the XGBoost pipeline's predicted target and preview the result.
id_col = testdata['id']
submission_df_4 = pd.DataFrame({"id": id_col, "target": submission_test_pred4})
submission_df_4.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [28]:
# Write the XGBoost predictions to submission_4.csv; index=False leaves the DataFrame index out of the file.
submission_df_4.to_csv('submission_4.csv', index=False)