In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the labelled training set from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Data/train.csv')
df.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Show the full, untruncated text of the row labelled 0 (single row/column lookup).
df.loc[0, 'text']

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [4]:
from io import StringIO

# Narrow the frame to the label column and the tweet text column, in that order,
# then look at the last rows of the result.
col = ['target', 'text']
df = df.loc[:, col]
df.tail()

Unnamed: 0,target,text
7608,1,Two giant cranes holding a bridge collapse int...
7609,1,@aria_ahrary @TheTawniest The out of control w...
7610,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611,1,Police investigating after an e-bike collided ...
7612,1,The Latest: More Homes Razed by Northern Calif...


In [5]:
# Set the column labels explicitly.
df.columns = ['target', 'text']

# De-duplicated (text, target) pairs ordered by label ...
unique_pairs = df[['text', 'target']].drop_duplicates()
target_df = unique_pairs.sort_values('target')
# ... and a dict built from those pairs (first column -> second column, i.e. text -> target).
target_dict = dict(target_df.values)

df.head()

Unnamed: 0,target,text
0,1,Our Deeds are the Reason of this #earthquake M...
1,1,Forest fire near La Ronge Sask. Canada
2,1,All residents asked to 'shelter in place' are ...
3,1,"13,000 people receive #wildfires evacuation or..."
4,1,Just got sent this photo from Ruby #Alaska as ...


In [6]:
import matplotlib.pyplot as plt

# Bar chart of the number of tweets per target label, with the y axis anchored at zero.
fig = plt.figure(figsize=(8, 6))
tweets_per_target = df.groupby('target')['text'].count()
tweets_per_target.plot.bar(ylim=0)
plt.show()

<Figure size 800x600 with 1 Axes>

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams: sub-linear term frequency, L2-normalised rows,
# English stop words removed, and terms seen in fewer than 5 documents dropped.
# NOTE(review): `encoding` is only consulted when bytes/files are passed in;
# df.text presumably already holds str values - confirm whether it is needed.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Dense document-term matrix plus the matching label vector.
tfidf_matrix = tfidf.fit_transform(df['text'])
features = tfidf_matrix.toarray()
labels = df['target']
features.shape

(7613, 3623)

In [8]:
from sklearn.feature_selection import chi2
N = 2

# Resolve the vocabulary once, outside the loop (it does not depend on the target).
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed get_feature_names(),
# so prefer the new accessor and fall back to the old one on older installs.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.array(tfidf.get_feature_names())

# For each label value, rank the terms by their chi-squared statistic against the
# indicator "label == target" and print the N highest-scoring unigrams and bigrams.
# The recorded output lists the same terms for both labels.
for target in [0, 1]:
    features_chi2 = chi2(features, labels == target)
    indices = np.argsort(features_chi2[0])  # ascending, so the strongest terms are last
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(target))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# '0':
  . Most correlated unigrams:
. hiroshima
. california
  . Most correlated bigrams:
. suicide bombing
. suicide bomber
# '1':
  . Most correlated unigrams:
. hiroshima
. california
  . Most correlated bigrams:
. suicide bombing
. suicide bomber


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
import timeit

# Candidate classifiers, all cross-validated on the same `features` / `labels`.
# Every estimator that accepts a seed gets random_state=0 so a re-run of this
# cell reproduces the same fold accuracies.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
    DecisionTreeClassifier(random_state=0)
]
CV = 5  # number of cross-validation folds

# Short plot labels keyed by estimator class name. An estimator that is not
# listed falls back to its full class name instead of reusing a stale label.
short_names = {
    'RandomForestClassifier': 'RF',
    'LinearSVC': 'SVC',
    'MultinomialNB': 'MNB',
    'LogisticRegression': 'LR',
    'DecisionTreeClassifier': 'DT',
}

# One (model label, fold index, accuracy) record per model and fold.
entries = []
for model in models:
    class_name = model.__class__.__name__
    model_name = short_names.get(class_name, class_name)
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['Model', 'fold_idx', 'Accuracy'])

# Box plot of the per-fold accuracies with the individual folds overlaid as points.
import seaborn as sns
sns.boxplot(x='Model', y='Accuracy', data=cv_df)
sns.stripplot(x='Model', y='Accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.title('Model Accuracy')
plt.show()

# Unfitted count and TF-IDF transformers; they are fitted and used by later cells.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()



In [None]:
# Mean cross-validated accuracy per model. cv_df is built with the columns
# 'Model', 'fold_idx' and 'Accuracy', so group and aggregate on those names.
cv_df.groupby('Model')['Accuracy'].mean()

In [None]:
# Read the test set that the predictions below are generated for.
test_df = pd.read_csv(filepath_or_buffer='Data/test.csv')

In [None]:
# Learn the vocabulary on the training tweets and turn them into raw term counts.
trainCounts = count_vect.fit_transform(df.text)
# Learn IDF weights from those counts, then re-weight the same counts into TF-IDF features.
X_train_tfidf = tfidf_transformer.fit(trainCounts).transform(trainCounts)

In [None]:
import timeit

N_TIMING_REPEATS = 50  # full passes over the test set per model for the repeated timing

# [class name, elapsed seconds] for every repeated one-tweet-at-a-time pass.
iterated_run_times = []
# (class name, elapsed seconds) for the single pass that produces the saved predictions.
run_times = []

for model in models:
    model_name = model.__class__.__name__
    clf = model.fit(X_train_tfidf, labels)

    # The classifier is fitted on TF-IDF weighted counts, so every tweet has to go
    # through the same count_vect -> tfidf_transformer pipeline before predict().
    # Passing raw counts would score inputs the model was never trained on.
    def predict_one(text, clf=clf):
        """Return the predicted label for a single tweet text."""
        return clf.predict(tfidf_transformer.transform(count_vect.transform([text])))[0]

    # Explicit copy so adding the 'target' column works on an independent frame
    # rather than on a selection derived from test_df.
    solution_df = test_df[['id']].copy()

    if model_name != 'RandomForestClassifier': # takes too long!
        for i in range(N_TIMING_REPEATS):
            start = timeit.default_timer()
            # Iterate over the values directly so this does not depend on the index labels.
            for text in test_df['text']:
                predict_one(text)
            iterated_run_times.append([model_name, timeit.default_timer()-start])

    # Timed pass whose predictions are written to disk.
    start = timeit.default_timer()
    solution_df['target'] = test_df['text'].apply(predict_one)
    run_times.append((model_name, timeit.default_timer()-start))

    solution_df.to_csv('Data/preds_'+model_name +'.csv', index= False)

In [None]:
# Short display labels for the estimator class names stored in run_times.
# A class name that is not listed is shown as-is rather than being mislabelled.
runtime_labels = {
    'RandomForestClassifier': 'RF',
    'LinearSVC': 'SVC',
    'MultinomialNB': 'MNB',
    'LogisticRegression': 'LR',
    'DecisionTreeClassifier': 'DT',
}

# Build the table in a single construction: rows stay in run_times order with a
# plain 0..n-1 index, and dtypes are inferred once from the complete records.
rt_df = pd.DataFrame(
    [(runtime_labels.get(name, name), seconds) for name, seconds in run_times],
    columns=['Model', 'Runtime'],
)
rt_df

In [None]:
# Bar chart of the single-pass prediction time recorded for each model.
runtime_ax = sns.barplot(data=rt_df, x='Model', y='Runtime')
runtime_ax.set_title('Model Runtimes')

In [None]:
# Raw repeated-pass timings: one [estimator class name, elapsed time] entry per timed pass.
iterated_run_times

In [None]:
# Short display labels for the estimator class names stored in iterated_run_times.
# A class name that is not listed is shown as-is rather than being mislabelled.
iterated_labels = {
    'RandomForestClassifier': 'RF',
    'LinearSVC': 'SVC',
    'MultinomialNB': 'MNB',
    'LogisticRegression': 'LR',
    'DecisionTreeClassifier': 'DT',
}

# Build the table in a single construction: one row per timed pass, in recording
# order, with a plain 0..n-1 index and dtypes inferred from the complete records.
irt_df = pd.DataFrame(
    [(iterated_labels.get(name, name), seconds) for name, seconds in iterated_run_times],
    columns=['Model', 'Runtime'],
)

In [None]:
# Display the relabelled repeated-pass timing table (columns 'Model' and 'Runtime').
irt_df

In [None]:
# Categorical plot of every repeated-pass timing, grouped by model.
runtime_grid = sns.catplot(data=irt_df, y='Runtime', x='Model')
plt.title('Model Runtimes')
plt.show()