Modeling
Imports

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

Reading in Data

In [3]:
tr = pd.read_csv("data/train.csv")

Trying to do predictions only based on text. Maybe later bring look at keywords, location, and id

In [6]:
tr.drop(columns=['keyword', 'location','id'], inplace = True)


In [4]:
import nltk
nltk.download('punkt')

from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Function for tokenizing and lowercasing text

In [5]:
def transform(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    text = y[:]
    y.clear()
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    return " ".join(y)

Main data set I am working on

In [6]:
tr['transform_text'] = tr['text'].apply(transform)

In [7]:
tr

Unnamed: 0,id,keyword,location,text,target,transform_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv us
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resid ask place notifi offic evacu shelter pla...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,two giant crane hold bridg collaps nearbi home...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,thetawniest control wild fire california even ...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,utc 5km volcano hawaii http
7611,10872,,,Police investigating after an e-bike collided ...,1,polic investig collid car littl portug rider s...


Dropping the original text for the altered text

In [8]:
tr.drop(columns=['text'], inplace = True)

In [9]:
tr

Unnamed: 0,id,keyword,location,target,transform_text
0,1,,,1,deed reason earthquak may allah forgiv us
1,4,,,1,forest fire near la rong sask canada
2,5,,,1,resid ask place notifi offic evacu shelter pla...
3,6,,,1,peopl receiv wildfir evacu order california
4,7,,,1,got sent photo rubi alaska smoke wildfir pour ...
...,...,...,...,...,...
7608,10869,,,1,two giant crane hold bridg collaps nearbi home...
7609,10870,,,1,thetawniest control wild fire california even ...
7610,10871,,,1,utc 5km volcano hawaii http
7611,10872,,,1,polic investig collid car littl portug rider s...


Setting up for modeling

In [10]:
X = tr["transform_text"]

In [11]:
y = tr["target"]

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

Vectorizing the text

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
tf = TfidfVectorizer()

In [14]:
tf_vec = tf.fit_transform(X)

Setting up the pipeline.

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [17]:
import pandas as pd
import nltk
import regex as re
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [18]:
mnb_tf = Pipeline([('Vectorizer',  TfidfVectorizer(lowercase=False)),
               ('mnb', MultinomialNB())])

lr_tf = Pipeline([('Vectorizer', TfidfVectorizer(lowercase=False)),
               ('LogisticReg', LogisticRegression(max_iter=200, random_state=1))])

dtc_tf = Pipeline([('Vectorizer', TfidfVectorizer(lowercase=False)),
               ('DecisionTree', DecisionTreeClassifier(random_state=1))])

rf_tf = Pipeline([('Vectorizer', TfidfVectorizer(lowercase=False)),
               ('RandomFor', RandomForestClassifier(random_state=1))]) 

gbc_tf = Pipeline([('Vectorizer', TfidfVectorizer(lowercase=False)),
               ('gradiendboosting', GradientBoostingClassifier(random_state=1))])

svc_tf = Pipeline([('Vectorizer', TfidfVectorizer(lowercase=False)),
                ('SupportVec', SVC(random_state=1))])


In [19]:
models2 = [('MultiNomBa', mnb_tf),
          ('LogisticReg', lr_tf),
          ('DecTreeClass', dtc_tf),           
          ('RandomFor', rf_tf),
          ('GradBoost', gbc_tf),
          ('SupportVec', svc_tf)]


In [20]:
num_mba=0
num_lreg=1
num_dtc=2
num_rfc=3
num_gbc=4
num_svc=5


In [21]:
tuned_params = {}

In [27]:

def gridsearch_tf(params, name, models, num):
    for model, grid in params.items():
        print(model, 'Grid Search:')
        print(model)
        pipe = Pipeline(steps=[('Vectorizer', TfidfVectorizer(lowercase=False)),
                                ('classifier', models[num][1][1])]) 
        print(pipe["Vectorizer"])
        gridsearch = GridSearchCV(estimator=pipe, param_grid=grid[0], scoring='accuracy', cv=5)
        gridsearch.fit(X, y)
        print("Scoring method: Recall")
        print(f'Avg of cross validation scores: {gridsearch.cv_results_["mean_test_score"]}')
        print(f'Best cross validation score: {gridsearch.best_score_ :.2%}')
        print(f'Optimal parameters: {gridsearch.best_params_}')
        tuned_params[name] = gridsearch.best_params_, gridsearch.best_score_

In [23]:
tuned_params1 = {}

In [39]:
names = [
         "Logistic Regression",
         "Random Forest",
        ]

classifiers = [
    LogisticRegression(),
    RandomForestClassifier(),
]

parameters = [
              {'clf__C': (np.logspace(-5, 1, 5))},
    
              {
               'clf__n_estimators': (100, 200)                  
               }
             ]

for name, classifier, params in zip(names, classifiers, parameters):
    clf_pipe = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', classifier),
    ])
    gs_clf = GridSearchCV(clf_pipe, param_grid=params, scoring='accuracy', cv=5)
    clf = gs_clf.fit(X, y)
    score = clf.score
    tuned_params1[name] = clf.best_params_, clf.best_score_
    print("{} score: {}".format(name, score))


Logistic Regression score: <bound method BaseSearchCV.score of GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(stop_words='english')),
                                       ('clf', LogisticRegression())]),
             param_grid={'clf__C': array([1.00000000e-05, 3.16227766e-04, 1.00000000e-02, 3.16227766e-01,
       1.00000000e+01])},
             scoring='accuracy')>
Random Forest score: <bound method BaseSearchCV.score of GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(stop_words='english')),
                                       ('clf', RandomForestClassifier())]),
             param_grid={'clf__n_estimators': (100, 200)}, scoring='accuracy')>


In [38]:
tuned_params1

{'Logistic Regression': ({'clf__C': 0.31622776601683794}, 0.6936903528291126),
 'Random Forest': ({'clf__n_estimators': 200}, 0.6494215286759395)}

Logistic Regression

In [26]:
params_lr_cv1 = {'LogisticReg': [{
    "classifier__penalty":["l1", "l2", "elasticnet"],
    'classifier__max_iter':[100, 200],
    'classifier__C':[0.001, 0.1, 1],
    'classifier__solver':['lbfgs', 'saga'],
    'classifier__fit_intercept':[True, False]

}]}

gridsearch_tf(params_lr_cv1, name="LogisticReg", models=models2, num=num_lreg)

LogisticReg Grid Search:
LogisticReg
TfidfVectorizer(lowercase=False)
Scoring method: Accuracy
Avg of cross validation scores: [       nan 0.4        0.         0.                nan        nan
        nan 0.4        0.         0.                nan        nan
        nan 0.         0.67779303 0.67779303        nan        nan
        nan 0.         0.67779303 0.67779303        nan        nan
        nan 0.20023017 0.29044051 0.29074632        nan        nan
        nan 0.20023017 0.29044051 0.29074632        nan        nan
        nan 0.22762378 0.64905946 0.64905946        nan        nan
        nan 0.22762378 0.64905946 0.64905946        nan        nan
        nan 0.44085954 0.55948736 0.56009898        nan        nan
        nan 0.44085954 0.55948736 0.56009898        nan        nan
        nan 0.54632257 0.65059551 0.65059551        nan        nan
        nan 0.54632257 0.65059551 0.65059551        nan        nan]
Best cross validation score: 67.78%
Optimal parameters: {'classifier

Decision tree

In [27]:
params_dtc1 = {'DecisionTree': [{
    'classifier__criterion':['gini', 'entropy'],
    'classifier__max_depth':[1, 3, 5, 10, 15, 25],
    'classifier__min_samples_split':[2, 5, 6, 8],
    'classifier__ccp_alpha':[0.0, 0.01, 0.1]
}]}

gridsearch_tf(params_dtc1, name='DecisionTree', models=models2, num=num_dtc)


DecisionTree Grid Search:
DecisionTree
TfidfVectorizer(lowercase=False)
Scoring method: Accuracy
Avg of cross validation scores: [0.6194726  0.6194726  0.6194726  0.6194726  0.62459502 0.62459502
 0.62459502 0.62459502 0.61290411 0.61290411 0.61290411 0.61290411
 0.60764614 0.60764614 0.6093533  0.60869661 0.59858525 0.59727231
 0.5984541  0.59806023 0.61080109 0.60948772 0.6113262  0.61263974
 0.6194726  0.6194726  0.6194726  0.6194726  0.6244637  0.6244637
 0.6244637  0.6244637  0.61382369 0.61382369 0.61382369 0.61382369
 0.60357531 0.60357531 0.60357531 0.59950199 0.59648379 0.59819146
 0.5970095  0.59595885 0.61277011 0.61329582 0.6111941  0.61171964
 0.6194726  0.6194726  0.6194726  0.6194726  0.6194726  0.6194726
 0.6194726  0.6194726  0.6194726  0.6194726  0.6194726  0.6194726
 0.6194726  0.6194726  0.6194726  0.6194726  0.6194726  0.6194726
 0.6194726  0.6194726  0.6194726  0.6194726  0.6194726  0.6194726
 0.6194726  0.6194726  0.6194726  0.6194726  0.62012937 0.62012937
 0.62

Random Forest 

In [26]:
params_rf1 = {'RandomForest': [{
    "classifier__n_estimators": [50,100, 150, 200, 250],
    'classifier__criterion':['gini', 'entropy'],
#    "classifier__max_depth": [2, 3, 4, 5, 8, 12, 20],
#    "classifier__min_samples_leaf": [2, 4, 6]
#    'classifier__max_depth':[5, 10, 15, 20],
#    "classifier__min_weight_fraction_leaf": [0.1, 0.3]
}]}

gridsearch_tf(params_rf1, name='RandomForest', models=models2, num=num_rfc)


RandomForest Grid Search:
RandomForest
TfidfVectorizer(lowercase=False)
Scoring method: Recall
Avg of cross validation scores: [0.48090436 0.49374746 0.49741765 0.50261643 0.50353386 0.48121017
 0.49436002 0.49894437 0.49619348 0.49435955]
Best cross validation score: 50.35%
Optimal parameters: {'classifier__criterion': 'gini', 'classifier__n_estimators': 250}


In [28]:
params_rf1 = {'RandomForest': [{
    "classifier__n_estimators": [50,100, 150, 200, 250],
    'classifier__criterion':['gini', 'entropy'],
#    "classifier__max_depth": [2, 3, 4, 5, 8, 12, 20],
#    "classifier__min_samples_leaf": [2, 4, 6]
#    'classifier__max_depth':[5, 10, 15, 20],
#    "classifier__min_weight_fraction_leaf": [0.1, 0.3]
}]}

gridsearch_tf(params_rf1, name='RandomForest', models=models2, num=num_rfc)


RandomForest Grid Search:
RandomForest
TfidfVectorizer(lowercase=False)
Scoring method: Recall
Avg of cross validation scores: [0.65204758 0.65125785 0.65388623 0.65559494 0.65559459 0.65651452
 0.65664662 0.65809053 0.65822159 0.65730227]
Best cross validation score: 65.82%
Optimal parameters: {'classifier__criterion': 'entropy', 'classifier__n_estimators': 200}


Logistic Regression

Multinomial Naive Bayes

In [29]:
params_nb_cv1 = {'MultinomialNB': [{
    'classifier__alpha':[.001, .01, .05, .1, .2, .4, .6, .8, 1],

}]}

gridsearch_tf(params_nb_cv1, name="MultinomialNB", models=models2, num=num_mba)

MultinomialNB Grid Search:
MultinomialNB
TfidfVectorizer(lowercase=False)
Scoring method: Accuracy
Avg of cross validation scores: [0.67450826 0.67897477 0.68501816 0.69316335 0.70183304 0.70590508
 0.7118161  0.71221058 0.71431247]
Best cross validation score: 71.43%
Optimal parameters: {'classifier__alpha': 1}


Gradient Boost Classifier

In [30]:
params_gbc1 = {'GradBoostClassifier': [{
    'classifier__learning_rate':[.001, .01],
    'classifier__n_estimators':[100, 200],
    'classifier__max_depth':[5, 10]
}]}

gridsearch_count(params_gbc1, name='GradBoostClassifier1', models=models2, num=num_gbc)

GradBoostClassifier Grid Search:
GradBoostClassifier
CountVectorizer(lowercase=False)
Scoring method: Accuracy
Avg of cross validation scores: [0.5703402  0.59542719 0.5703402  0.59333091 0.62603902 0.63943795
 0.62380641 0.62525274]
Best cross validation score: 63.94%
Optimal parameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}


Support Vec

In [31]:
params_svc1 = {'SVC': [{
    'classifier__C':[1],
    'classifier__kernel':['linear'],
    'classifier__gamma':['scale'],
}]}

gridsearch_count(params_svc1, name='SVC1', models=models2, num=num_svc)

SVC Grid Search:
SVC
CountVectorizer(lowercase=False)
Scoring method: Accuracy
Avg of cross validation scores: [0.65244231]
Best cross validation score: 65.24%
Optimal parameters: {'classifier__C': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}


These grid searches were significantly lower than the defaults run in the exploratory models notebook so there is something wrong with these grid searches. Random forest is because the max depth was probably significantly higher but the others don't make sense. It is possible this is because there was no shuffle split. It is possible the data would be better if shuffled. This can be done in grid search by train test split.