In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import r2_score,mean_squared_error,accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

In [3]:
df = pd.read_csv('data/final_train_data.csv')

In [4]:
df.head()

Unnamed: 0,dates&time,user,tweet,class
0,2020-02-06 22:03:12+00:00,DallasPD,DallasPD and dfrincidents are currently on loc...,1
1,2019-08-30 21:16:20+00:00,DallasPD,Monday Sept on LaborDay Jack Evans Police Hd...,1
2,2019-08-14 22:31:39+00:00,DallasPD,PIODPD is at the scene of a possible barricade...,1
3,2019-07-13 22:04:27+00:00,DallasPD,PIODPD is on scene of an Officer Involved Shoo...,1
4,2019-07-13 01:38:19+00:00,DallasPD,Major police incident in downtown Dallas Griff...,1


In [5]:
df.shape

(165204, 4)

In [6]:
df.isna().sum()

dates&time    0
user          0
tweet         0
class         0
dtype: int64

In [7]:
df.dropna(inplace = True)

In [8]:
X = df['tweet']
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [9]:
lemmatizer = WordNetLemmatizer()
def word_lemmatizer(text):
    lemm_text = lemmatizer.lemmatize(text)
    return lemm_text

In [10]:
df["tweet"].apply(lambda x: word_lemmatizer(x))

0         DallasPD and dfrincidents are currently on loc...
1         Monday Sept   on LaborDay Jack Evans Police Hd...
2         PIODPD is at the scene of a possible barricade...
3         PIODPD is on scene of an Officer Involved Shoo...
4         Major police incident in downtown Dallas Griff...
                                ...                        
165199                            cheermomntx Not until Feb
165200    Wow Really This is not safe for many reasons N...
165201                     Happy New Year Be safe out there
165202    As we close in on the New Year be sure to have...
165203    Start the NewYear out right PlanWhileYouCan to...
Name: tweet, Length: 165204, dtype: object

### Basic Model

In [11]:
cvec = CountVectorizer(stop_words = 'english',max_df=0.90,
    min_df=3)

In [12]:
X_train_cvec = cvec.fit_transform(X_train)

In [13]:
df_cvec_train  = pd.DataFrame(X_train_cvec.toarray(),
             columns=cvec.get_feature_names())

In [14]:
df_cvec_train.head()

Unnamed: 0,aa,aacenter,aah,aaron,aashtospeaks,ab,abandon,abandoned,abandoning,abandonment,...,zipper,zoe,zombie,zona,zonas,zone,zones,zonesfor,zoo,zumba
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
X_test_cvec = cvec.transform(X_test)

In [121]:
df_cvec_test  = pd.DataFrame(X_test_cvec.toarray(),
             columns=cvec.get_feature_names())

In [122]:
random_forest_class = RandomForestClassifier()
random_forest_class.fit(df_cvec_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [123]:
random_forest_class.score(df_cvec_train, y_train)

0.9997146365507341

In [124]:
random_forest_class.score(df_cvec_test, y_test)

0.9917880634356967

### Modeling with Advanced Tuning.

In [15]:
pipe = Pipeline([('cvec',CountVectorizer()),
                     ('rf', RandomForestClassifier())
                ])
# Pipeline parameters
pipe_params = {
    'cvec__ngram_range': [(1,2)],
    'rf__n_estimators': [100,150,300,600],
    'rf__max_depth': [None,25,50,100]
}
# Instantiating a grid search
gs = GridSearchCV(pipe, 
                  param_grid=pipe_params, n_jobs = 6, cv = 5, verbose = 5) 
# Fitting my model
gs.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed: 16.7min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed: 118.0min
[Parallel(n_jobs=6)]: Done  80 out of  80 | elapsed: 160.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=6,
       param_grid={'cvec__ngram_range': [(1, 2)], 'rf__n_estimators': [100, 150, 300, 600], 'rf__max_depth': [None, 25, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [16]:
gs.best_params_

{'cvec__ngram_range': (1, 2), 'rf__max_depth': None, 'rf__n_estimators': 300}

In [17]:
gs.score(X_train, y_train)

1.0

In [18]:
gs.score(X_test, y_test)

0.9882773092288447