In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## NOTEBOOK WORKPLAN
### - TRY OUT DIFFERENT MODELS WITH DEFAULT PARAMETERS
### - TRY OUT TFIDFVECTOIRZER
### - TUNE THE TOP MODELS
### - USE VOTING CLASSIFIER TO VOTE THE TUNED TOP MODELS

In [3]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [4]:
test_text = test["text"]

In [5]:
X = train["text"]
y = train["target"]

In [5]:
X.head(10)

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
5    #RockyFire Update => California Hwy. 20 closed...
6    #flood #disaster Heavy rain causes flash flood...
7    I'm on top of the hill and I can see a fire in...
8    There's an emergency evacuation happening now ...
9    I'm afraid that the tornado is coming to our a...
Name: text, dtype: object

### LET US TRY OUT OUR MODELS

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [6]:
vect = CountVectorizer()

In [13]:
lg = LogisticRegression()
knn = KNeighborsClassifier()
knn_200 = KNeighborsClassifier(n_neighbors= 200)
nb = MultinomialNB()
rf = RandomForestClassifier()
xgb = XGBClassifier()

In [11]:
pipe = make_pipeline(vect, lg)

In [12]:
def run_models(model):
    pipe = make_pipeline(vect, model)
    return cross_val_score(pipe, X, y, cv = 5, scoring = "accuracy").mean()

In [29]:
def output_csv(model, file_name):
    pipe = make_pipeline(vect, model)
    pipe.fit(X, y)
    pred = pipe.predict(test_text)
    return pd.DataFrame({"id": test["id"], "target": pred}).to_csv("../submission-file/"+file_name, index = False)
    

In [20]:
%time run_models(lg)

Wall time: 2.3 s


0.7110221858413593

In [21]:
%time run_models(knn)

Wall time: 6.03 s


0.6006873306121466

In [22]:
%time run_models(knn_200)

Wall time: 6.83 s


0.5734931779982688

In [23]:
%time run_models(nb)

Wall time: 2.35 s


0.7284953073625868

In [24]:
%time run_models(rf)

Wall time: 11.7 s


0.6842256146241612

In [25]:
%time run_models(xgb)

Wall time: 51.5 s


0.6926302547757452

In [30]:
%time output_csv(nb, "nb_default.csv")

Wall time: 431 ms


#### In conclusion models to work with: lg, nb, xgb, rf

### TUNING THE COUNTVECTORIZER

In [34]:
vect_tune = CountVectorizer()
vect_tune

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [35]:
vect.fit_transform(X)

<7613x21637 sparse matrix of type '<class 'numpy.int64'>'
	with 111497 stored elements in Compressed Sparse Row format>

In [39]:
print(vect.get_feature_names()[0:100])

['00', '000', '0000', '007npen6lg', '00cy9vxeff', '00end', '00pm', '01', '02', '0215', '02elqlopfk', '02pm', '03', '030', '033', '034', '039', '03l7nwqdje', '04', '05', '05th', '06', '060', '061', '06jst', '07', '073izwx0lb', '08', '0840728', '0853', '087809233445', '0880', '08lngclzsj', '09', '0abgfglh7x', '0ajisa5531', '0blkwcupzq', '0btniwagt1', '0bvk5tub4j', '0c1y8g7e9p', '0cr74m1uxm', '0cxm5tkz8y', '0dqjeretxu', '0drqlrsgy5', '0dxvz7fdh3', '0erisq25kt', '0f8xa4ih1u', '0fekgyby5f', '0fs9ksv5xk', '0ghk693egj', '0gidg9u45j', '0gknpy4lua', '0h7oua1pns', '0iw6drf5x9', '0iyuntxduv', '0jfnvaxfph', '0jmkdtcymj', '0kccg1bt06', '0keh2treny', '0krw1zyahm', '0l', '0la1aw9uud', '0llwuqn8vg', '0lmheaex9k', '0lpu0gr2j0', '0m1tw3datd', '0mcxc68gzd', '0migwcmtje', '0mnpcer9no', '0npzp', '0nr4dpjgyl', '0oms8ri3l1', '0pamznyyuw', '0q040stkcv', '0r03c6njli', '0rny349unt', '0rokdutyun', '0rsverlztm', '0s6ydfrwdq', '0sa6xx1oq7', '0scnwe8xbv', '0szwlwl9qu', '0t8vqkeari', '0t9yd557ry', '0tslqjokvh', '0tz

In [81]:
vect_tune = CountVectorizer(ngram_range = (1,2))

In [83]:
models = [lg, nb, rf]

In [50]:
def run_models_v2(model):
    pipe = make_pipeline(vect_tune, model)
    return cross_val_score(pipe, X, y, cv = 5, scoring = "accuracy").mean()

In [82]:
for i in models:
    print(i)
    print(run_models_v2(i))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
0.727181333536549
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.7292847788900689
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.685407836149009
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1

### TRYING OUT TFIDF VECTORIZER AND TUNING IT

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
tfidf = TfidfVectorizer()
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
tfidf= TfidfVectorizer(binary = True)

In [130]:
models = [lg, nb, rf, xgb]

In [131]:
def run_models_v2(model):
    pipe = make_pipeline(tfidf, model)
    return cross_val_score(pipe, X, y, cv = 5, scoring = "accuracy").mean()

In [9]:
def output_csv_v2(model, file_name):
    pipe = make_pipeline(tfidf, model)
    pipe.fit(X, y)
    pred = pipe.predict(test_text)
    return pd.DataFrame({"id": test["id"], "target": pred}).to_csv("../submission-file/"+file_name, index = False)

In [132]:
for i in models:
    print(i)
    print(run_models_v2(i))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
0.7358508548243303
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.7379563679791643
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.6769975906729122
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel

In [135]:
output_csv_v2(nb, "nb_tfidf.csv")

In [14]:
output_csv_v2(lg, "lg_tfidf.csv")

#### Conclusion: we will go with Tfidf Vectorizer

### TUNE THE PARAMETERS OF LOGISTIC REGRESSION MODEL (0.735)

In [136]:
lg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [187]:
lg_tune = LogisticRegression()

In [9]:
def run_models_v2(model):
    pipe = make_pipeline(tfidf, model)
    return cross_val_score(pipe, X, y, cv = 5, scoring = "accuracy").mean()

In [188]:
run_models_v2(lg_tune)

0.7358508548243303

#### Conclusion: no head way with Logistic Regression

### TUNE THE PARAMETERS OF NAIVES BAYES (0.737)

In [191]:
nb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [253]:
#nb_tune = MultinomialNB()
#nb_tune = BernoulliNB()
nb_tune = BernoulliNB(alpha = 1.2)

In [254]:
run_models_v2(nb_tune)

0.7514860158691511

In [255]:
def output_csv_v2(model, file_name):
    pipe = make_pipeline(tfidf, model)
    pipe.fit(X, y)
    pred = pipe.predict(test_text)
    return pd.DataFrame({"id": test["id"], "target": pred}).to_csv("../submission-file/"+file_name, index = False)

In [257]:
output_csv_v2(nb_tune, "bernoulli_nb_tfidf.csv")

#### Conclusion: BernolliNB seems better than MultinomialNB 

### TUNE THE PARAMETERS OF RANDOM FOREST (0.67)

In [8]:
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [46]:
rf_tune = RandomForestClassifier(n_jobs = -1, class_weight = "balanced_subsample", n_estimators = 200, max_features = "log2")

In [16]:
tfidf = TfidfVectorizer()

In [114]:
def run_models_v2(model):
    pipe = make_pipeline(tfidf, model)
    return cross_val_score(pipe, X, y, cv = 5, scoring = "accuracy").mean()

In [47]:
%time run_models_v2(rf_tune)

Wall time: 1min 11s


0.725739143800223

#### Conclusion: I was able to tune the RandomForestClassfier model up to 0.725...not bad

### TUNE THE PARAMETERS OF XGBClassifier (0.69)

In [49]:
xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [90]:
xgb_tune = XGBClassifier()

In [91]:
%time run_models_v2(xgb_tune)

Wall time: 30.5 s


0.6935488034388358

#### Conclusion: The XGBClassifier proves untunable

### USE THE VOTINGCLASSIFIER WITH TUNED MODELS

In [103]:
from sklearn.ensemble import VotingClassifier

In [100]:
## models
lg = LogisticRegression()
xgb = XGBClassifier()
rf = RandomForestClassifier(n_jobs = -1, class_weight = "balanced_subsample", n_estimators = 200, max_features = "log2")
nb = BernoulliNB(alpha = 1.2)

In [108]:
X_dtm = tfidf.fit_transform(X)

def run_models_v3(model):
    return cross_val_score(model, X_dtm, y, cv = 5, scoring = "accuracy").mean()

In [113]:
stack = VotingClassifier(estimators = [("lr" ,lg), ("xgb", xgb), ("rf", rf), ("nb", nb)], voting = "soft")
stack

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsam...
                                                     max_

In [115]:
%time run_models_v2(stack)

Wall time: 1min 53s


0.7505633284329998

In [116]:
def output_csv_v2(model, file_name):
    pipe = make_pipeline(tfidf, model)
    pipe.fit(X, y)
    pred = pipe.predict(test_text)
    return pd.DataFrame({"id": test["id"], "target": pred}).to_csv("../submission-file/"+file_name, index = False)

In [117]:
output_csv_v2(stack, "voting_tuned_model.csv")

#### Conclusion: Here i will conclude on this notebook