## Data and Model Import

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the training set from the CSV file in the working directory.
df = pd.read_csv('train.csv')

In [4]:
# Preview the first five rows to check the columns and a few sample texts.
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Collect the index labels of rows whose text is blank. A string counts as
# blank when nothing is left after stripping whitespace, which covers both
# whitespace-only strings and the empty string ('' fails str.isspace(), so a
# plain isspace() check would miss it). Non-string values such as NaN are not
# treated as blanks here.
blanks = df[df['text'].apply(lambda x: isinstance(x, str) and not x.strip())].index.tolist()

# Print the results
print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


In [6]:
# Count the missing values in each column.
df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
# Total number of rows in the training data.
df.shape[0]

7613

In [8]:
# Class balance: how many rows carry each `target` label.
label_counts = df['target'].value_counts()
label_counts

target
0    4342
1    3271
Name: count, dtype: int64

In [9]:
# Custom stop-word list, passed as `stop_words` to the TF-IDF vectorizers.
# Built by splitting one whitespace-separated string; the result is a plain
# list of lowercase tokens in the order written below.
stopwords = (
    "a about an and are as at be been but by can "
    "even ever for from get had has have he her hers his "
    "how i if in into is it its just me my of on or "
    "see seen she so than that the their there they this "
    "to was we were what when which who will with you"
).split()

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC


def _make_text_clf(classifier):
    """Build a two-step text pipeline: TF-IDF features followed by `classifier`.

    A fresh TfidfVectorizer (using the notebook's custom `stopwords` list) is
    created on every call, so the returned pipelines never share a vectorizer.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier used as the final 'clf' step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps named 'tfidf' and 'clf'.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stopwords)),
        ('clf', classifier),
    ])


# Naïve Bayes:
text_clf_nb = _make_text_clf(MultinomialNB())

# Linear SVC: its solver can shuffle the data using a random number generator,
# so the seed is pinned to keep results reproducible from run to run.
text_clf_lsvc = _make_text_clf(LinearSVC(random_state=42))

In [11]:
from sklearn.model_selection import train_test_split

# Features are the raw `text` column; labels are the `target` column.
X, y = df['text'], df['target']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Test: run predictions and analyze the results (Naïve Bayes)

In [12]:
# Fit the Naïve Bayes pipeline on the training split.
text_clf_nb.fit(X=X_train, y=y_train)

In [13]:
# Predict labels for the held-out test texts with the Naïve Bayes pipeline
predictions = text_clf_nb.predict(X=X_test)

In [14]:
# Confusion matrix for the current predictions
# (rows: true class, columns: predicted class)
from sklearn import metrics
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1316  130]
 [ 375  692]]


In [15]:
# Per-class precision, recall and F1 for the current predictions
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84      1446
           1       0.84      0.65      0.73      1067

    accuracy                           0.80      2513
   macro avg       0.81      0.78      0.79      2513
weighted avg       0.81      0.80      0.79      2513



## Test: run predictions and analyze the results (Linear SVC)

In [16]:
# Fit the Linear SVC pipeline on the training split.
text_clf_lsvc.fit(X=X_train, y=y_train)

In [17]:
# Predict labels for the held-out test texts with the Linear SVC pipeline
# (this replaces the earlier contents of `predictions`)
predictions = text_clf_lsvc.predict(X=X_test)

In [18]:
# Confusion matrix for the current predictions
# (rows: true class, columns: predicted class)
from sklearn import metrics
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1199  247]
 [ 282  785]]


In [19]:
# Per-class precision, recall and F1 for the current predictions
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82      1446
           1       0.76      0.74      0.75      1067

    accuracy                           0.79      2513
   macro avg       0.79      0.78      0.78      2513
weighted avg       0.79      0.79      0.79      2513

