In [2]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
data = pd.read_csv('./data/train_data/final_train_data.csv')
data.head()

Unnamed: 0,dates&time,user,tweet,class
0,2020-02-06 22:03:12+00:00,DallasPD,DallasPD and dfrincidents are currently on loc...,1
1,2019-08-30 21:16:20+00:00,DallasPD,Monday Sept on LaborDay Jack Evans Police Hd...,1
2,2019-08-14 22:31:39+00:00,DallasPD,PIODPD is at the scene of a possible barricade...,1
3,2019-07-13 22:04:27+00:00,DallasPD,PIODPD is on scene of an Officer Involved Shoo...,1
4,2019-07-13 01:38:19+00:00,DallasPD,Major police incident in downtown Dallas Griff...,1


In [4]:
data.isna().sum() # check for missing values

dates&time    0
user          0
tweet         0
class         0
dtype: int64

In [6]:
data["class"].value_counts(normalize=True)
# baseline accuracy score=94.9%

0    0.948833
1    0.051167
Name: class, dtype: float64

In [7]:
X, y = data["tweet"], data["class"] # create feature matrix and target vector

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
# split data into training and test sets with similar distribution of class values to original dataset

In [11]:
tfidf_params = {
    "tfidf__min_df": [None, 25, 50],
    "ada__n_estimators": [None, 100, 150, 200],
    "ada__learning_rate": [0.1, 0.5, 1.0]
}
# specify parameters of TFIDF and AdaBoost parameters with which to tune the model

In [12]:
tfidf_pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("ada", AdaBoostClassifier())
])
# combine vectorizing and modeling in a pipeline

In [18]:
tfidf_gs = GridSearchCV(estimator=tfidf_pipe, param_grid=tfidf_params, n_jobs=6)
# set up gridsearch

In [19]:
tfidf_gs.fit(X_train, y_train);
# fit the pipeline to the data using all combinations of the above parameters to obtain the best cross-val score

In [20]:
tfidf_gs.best_params_
# show the combination of parameters which gave the best cross-val score

{'ada__learning_rate': 1.0, 'ada__n_estimators': 200, 'tfidf__min_df': 25}

In [37]:
tfidf_gs.best_estimator_.score(X_train, y_train)
# show the accuracy score of the model with those parameters on the training set

0.9993301211431523

In [38]:
tfidf_gs.best_estimator_.score(X_test, y_test)
# and on the testing set

0.9978450884966465

In [41]:
cvec_params = {
    "cvec__min_df": [None, 25, 50],
    "ada__n_estimators": [None, 100, 150, 200],
    "ada__learning_rate": [0.1, 0.5, 1.0]
}
# create a new dictionary of parameters for gridsearch using CountVectorizer rather than TFIDF

In [42]:
cvec_pipe = Pipeline([
    ("cvec", CountVectorizer()),
    ("ada", AdaBoostClassifier())
])
# set up a new pipeline with CountVectorizer and AdaBoost

In [43]:
cvec_gs = GridSearchCV(estimator=cvec_pipe, param_grid=cvec_params, n_jobs=6)
# set up gridsearch

In [44]:
cvec_gs.fit(X_train, y_train);

In [45]:
tfidf_gs.best_params_
# best parameters for CountVectorizer and AdaBoost

{'ada__learning_rate': 1.0, 'ada__n_estimators': 200, 'tfidf__min_df': 25}

In [46]:
tfidf_gs.best_estimator_.score(X_train, y_train)
# best accuracy score on training data

0.9993301211431523

In [47]:
tfidf_gs.best_estimator_.score(X_test, y_test)

0.9978450884966465

In [None]:
# best accuracy score on test data