In [1]:
import pandas as pd    
import numpy as np      


In [2]:
# Read the train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train_set.csv", "./data/test_set.csv")
)


# Data set exploration


In [3]:
# Preview the first five rows to see which columns a record carries
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Number of rows whose text repeats an earlier row
# (the first occurrence of each text is not counted as a duplicate)
train["text"].duplicated(keep="first").sum()


110

In [5]:
# Class balance: number of rows per target label
train["target"].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [6]:
# Count the missing (NaN/None) entries in each column
train.isna().sum()


id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

# Data Clean-up


In [7]:
# TODO: decide whether to drop the rows whose text is duplicated
# TODO: decide whether to fill the null keyword / location values


# Create model 
### CountVectorizer | RandomForest

In [8]:
# required imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

In [12]:
# Bag-of-words vectorizer, plus its analyzer: the callable the vectorizer
# uses to turn a raw string into the tokens it counts
vectorizer = CountVectorizer()
analyze = vectorizer.build_analyzer()

# Learn the vocabulary from every training text and encode the texts as a
# document-term count matrix
dvec = vectorizer.fit_transform(train["text"])

In [19]:
# Derive a test set from the train set.
# A fixed random_state makes the shuffle -- and therefore the split and every
# score computed from it -- reproducible after a kernel restart.
dshuf = train.sample(frac=1, random_state=42)

# The first 5000 shuffled rows are used for training, the remainder for testing
d_train = dshuf.iloc[:5000]
d_test = dshuf.iloc[5000:]

# Fit the bag-of-words vocabulary on the training rows only, then reuse that
# vocabulary (transform without re-fitting) to encode the testing rows
d_train_att = vectorizer.fit_transform(d_train['text'])
d_test_att = vectorizer.transform(d_test['text'])
d_train_label = d_train['target']
d_test_label = d_test['target']

Note: At this stage, we call `fit_transform` on the training set, which learns the 
vocabulary and produces the document-term matrix in a single step. For the testing 
set we call only `transform`: we do not fit again, because the testing data must be 
encoded with the same vocabulary that was learned from the training set. Words that 
appear only in the testing set are therefore ignored. 

In [20]:
# Create a Random Forest Classifier
# random_state pins the randomness used while building the trees, so the
# accuracy below can be reproduced on a re-run
clf = RandomForestClassifier(n_estimators=80, random_state=42)
clf.fit(d_train_att, d_train_label)

# Mean accuracy on the held-out rows
clf.score(d_test_att, d_test_label)

0.7822426329889016

In [21]:
# Confusion matrix on the held-out rows
# (rows are the true labels, columns are the predicted labels)
pred_labels = clf.predict(d_test_att)
confusion_matrix(y_true=d_test_label, y_pred=pred_labels)

array([[1418,   82],
       [ 487,  626]], dtype=int64)

In [26]:
# 5-fold cross-validation of the classifier on the training portion
scores = cross_val_score(clf, X=d_train_att, y=d_train_label, cv=5)

# Mean fold accuracy, with twice the standard deviation across folds as the spread
"Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores))

'Accuracy: 0.77 (+/- 0.02)'

In [30]:
# Create a pipeline
# Shuffle with a fixed seed so the row order, and every score derived from it,
# is reproducible after a kernel restart
dshuf = train.sample(frac=1, random_state=42)
d_content = dshuf['text']
d_label = dshuf['target']

# Chain the vectorizer and the classifier into a single estimator, so the
# vocabulary is learned from whatever data the pipeline is fitted on.
# random_state makes the forest itself reproducible.
pipeline = make_pipeline(CountVectorizer(),
                         RandomForestClassifier(random_state=42))



In [32]:
# Fit the pipeline on the first 1500 shuffled rows and score it on the rest
fit_rows = slice(None, 1500)
eval_rows = slice(1500, None)
pipeline.fit(d_content.iloc[fit_rows], d_label.iloc[fit_rows])

# Mean accuracy on the rows that were not used for fitting
pipeline.score(d_content.iloc[eval_rows], d_label.iloc[eval_rows])

0.7416980206118109

In [33]:
# 5-fold cross-validation of the whole pipeline on the raw texts, so the
# vectorizer is re-fitted inside every fold
scores = cross_val_score(pipeline, X=d_content, y=d_label, cv=5)

# Mean fold accuracy, with twice the standard deviation across folds as the spread
"Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores))

'Accuracy: 0.77 (+/- 0.01)'

In [34]:
# Add TF-IDF (term frequency–inverse document frequency)
# norm=None leaves the re-weighted rows un-normalised.
# random_state pins the forest so the cross-validation scores (and any search
# run on this pipeline) are reproducible.
pipeline2 = make_pipeline(CountVectorizer(),
                          TfidfTransformer(norm=None),
                          RandomForestClassifier(random_state=42))

# 5-fold cross-validation on the raw texts
scores = cross_val_score(pipeline2, d_content, d_label, cv=5)

# Mean fold accuracy, with twice the standard deviation across folds as the spread
"Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

'Accuracy: 0.77 (+/- 0.02)'

In [35]:
# Parameter search
# Keys follow the '<pipeline step>__<parameter>' convention; make_pipeline
# names each step after its lower-cased class name.
parameters = {
    'countvectorizer__max_features': (None, 1000, 2000),
    # unigrams or bigrams
    'countvectorizer__ngram_range': ((1, 1), (1, 2)),
    'countvectorizer__stop_words': ('english', None),
    # effectively turn on/off tfidf
    'tfidftransformer__use_idf': (True, False),
    'randomforestclassifier__n_estimators': (20, 50, 100)
}

# Exhaustive search over every combination above, using all available cores
grid_search = GridSearchCV(pipeline2, parameters, n_jobs=-1, verbose=1)

# The trailing ';' keeps the fitted estimator's repr out of the cell output.
# The best score and parameters are reported in the next cell.
grid_search.fit(d_content, d_label);

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   32.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:  3.2min finished


In [36]:
# Report the best cross-validated score, then the winning value of each
# searched hyper-parameter in alphabetical order
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for name in sorted(parameters):
    print("\t%s: %r" % (name, best_parameters[name]))

Best score: 0.789
Best parameters set:
	countvectorizer__max_features: None
	countvectorizer__ngram_range: (1, 1)
	countvectorizer__stop_words: 'english'
	randomforestclassifier__n_estimators: 50
	tfidftransformer__use_idf: False
