## Import modules

In [50]:
from sklearn import svm, naive_bayes, ensemble, tree
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

import pandas as pd

## Import Dataset

In [21]:
df = pd.read_csv('train_processed.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resid ask place notifi offic evacu shelter pla...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...


In [22]:
df.shape

(7613, 6)

In [23]:
len(df)

7613

On drop ensuite les colonnes inutiles

In [24]:
df.drop(['id','keyword', 'location'], axis = 1, inplace = True) 

On regarde ensuite les valeurs manquantes

In [25]:
df.isnull().sum()

text              0
target            0
processed_text    4
dtype: int64

In [26]:
df.dropna(inplace=True)

In [27]:
df.isnull().sum()

text              0
target            0
processed_text    0
dtype: int64

In [28]:
blanks = []  # start with an empty list

for i,lb,rv,pv in df.itertuples():  # iterate over the DataFrame
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list

print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


On remarque qu'aucune donnée n'est manquante et pas d'espaces marquants manquants

In [29]:
len(df)

7609

## PREPROCESS DATA

Vu que les données sont déjà bien préparées, on peut passer directement à la modélisation

## Division du dataset (target/feature)

In [30]:
# define the features and the target
features = 'processed_text'
target = 'target'

# create the X and y
X = df[features]
y = df[target]

# split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### BAG OF WORDS

On commence sans cross-validation ni hyperparameters tuning avec bag of words

In [31]:
# create a pipeline
text_clf = Pipeline([('bag', CountVectorizer()), ('clf', tree.DecisionTreeClassifier())])
text_clf.fit(X_train, y_train)

# make predictions
predictions = text_clf.predict(X_test)

In [32]:
# evaluate the model
print(confusion_matrix(y_test, predictions))

# Print a classification report
print(classification_report(y_test, predictions))

# Print the overall accuracy
print(accuracy_score(y_test,predictions))

[[705 159]
 [245 413]]
              precision    recall  f1-score   support

           0       0.74      0.82      0.78       864
           1       0.72      0.63      0.67       658

    accuracy                           0.73      1522
   macro avg       0.73      0.72      0.72      1522
weighted avg       0.73      0.73      0.73      1522

0.7345597897503285


In [33]:
# create a pipeline for the other models
text_clf = Pipeline([('bag', CountVectorizer()), ('clf', svm.LinearSVC())])
text_clf.fit(X_train, y_train)

# make predictions
predictions = text_clf.predict(X_test)

In [34]:
# evaluate the model
print(confusion_matrix(y_test, predictions))

# Print a classification report
print(classification_report(y_test, predictions))

# Print the overall accuracy
print(accuracy_score(y_test,predictions))

[[715 149]
 [205 453]]
              precision    recall  f1-score   support

           0       0.78      0.83      0.80       864
           1       0.75      0.69      0.72       658

    accuracy                           0.77      1522
   macro avg       0.76      0.76      0.76      1522
weighted avg       0.77      0.77      0.77      1522

0.7674113009198423


In [35]:
# create a pipeline for the other models
text_clf = Pipeline([('bag', CountVectorizer()), ('clf', naive_bayes.MultinomialNB())])
text_clf.fit(X_train, y_train)

# make predictions
predictions = text_clf.predict(X_test)

In [36]:
# evaluate the model
print(confusion_matrix(y_test, predictions))

# Print a classification report
print(classification_report(y_test, predictions))

# Print the overall accuracy
print(accuracy_score(y_test,predictions))

[[733 131]
 [182 476]]
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       864
           1       0.78      0.72      0.75       658

    accuracy                           0.79      1522
   macro avg       0.79      0.79      0.79      1522
weighted avg       0.79      0.79      0.79      1522

0.7943495400788436


In [37]:
# create a pipeline for the other models
text_clf = Pipeline([('bag', CountVectorizer()), ('clf', ensemble.RandomForestClassifier())])
text_clf.fit(X_train, y_train)

# make predictions
predictions = text_clf.predict(X_test)

In [38]:
# evaluate the model
print(confusion_matrix(y_test, predictions))

# Print a classification report
print(classification_report(y_test, predictions))

# Print the overall accuracy
print(accuracy_score(y_test,predictions))

[[778  86]
 [247 411]]
              precision    recall  f1-score   support

           0       0.76      0.90      0.82       864
           1       0.83      0.62      0.71       658

    accuracy                           0.78      1522
   macro avg       0.79      0.76      0.77      1522
weighted avg       0.79      0.78      0.78      1522

0.7812089356110381


On peut voir :

- LinearSVC il y a une accuracy de 0.767
- DecisionTreeClassifier il y a une accuracy de 0.720
- MultinomialNB il y a une accuracy de 0.794
- RandomForestClassifier il y a une accuracy de 0.787

On peut voir qu'avec bag of words le plus performant sans cross-validation et hyperparameters tunning est le MultinomialNB

Maintenant on va essayer avec le tuning des hyperparameters

In [51]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', naive_bayes.MultinomialNB()),
])

parameters = {
  'clf__alpha': np.linspace(0.5, 1.5, 6),
  'clf__fit_prior': [True, False]
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)
print(gs_clf.best_estimator_)
print(gs_clf.score(X_test, y_test))

0.7974368315314113
{'clf__alpha': 1.1, 'clf__fit_prior': True}
Pipeline(steps=[('vect', CountVectorizer()), ('clf', MultinomialNB(alpha=1.1))])
0.7943495400788436


In [52]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', svm.LinearSVC()),
])

parameters = {
    'clf__C': np.linspace(0.1, 1, 10),
    'clf__loss': ['hinge', 'squared_hinge']
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)
print(gs_clf.best_estimator_)
print(gs_clf.score(X_test, y_test))

0.7966147340697536
{'clf__C': 0.1, 'clf__loss': 'hinge'}
Pipeline(steps=[('vect', CountVectorizer()),
                ('clf', LinearSVC(C=0.1, loss='hinge'))])
0.7950065703022339


In [53]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', ensemble.RandomForestClassifier()),
])

parameters = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [10, 20, 30, 40, 50]
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)
print(gs_clf.best_estimator_)
print(gs_clf.score(X_test, y_test))

0.7698368622942902
{'clf__max_depth': 50, 'clf__n_estimators': 100}
Pipeline(steps=[('vect', CountVectorizer()),
                ('clf', RandomForestClassifier(max_depth=50))])
0.7674113009198423


In [54]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', tree.DecisionTreeClassifier()),
])

parameters = {
    'clf__max_depth': [10, 20, 30, 40, 50]
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)
print(gs_clf.best_estimator_)
print(gs_clf.score(X_test, y_test))

0.7453618888407656
{'clf__max_depth': 50}
Pipeline(steps=[('vect', CountVectorizer()),
                ('clf', DecisionTreeClassifier(max_depth=50))])
0.7365308804204993


On peut voir :

- LinearSVC il y a une accuracy de 0.796
- DecisionTreeClassifier il y a une accuracy de 0.745
- MultinomialNB il y a une accuracy de 0.797
- RandomForestClassifier il y a une accuracy de 0.769

On peut voir qu'avec bag of words le plus performant sans cross-validation et hyperparameters tunning est le MultinomialNB

### TF-IDF

Maintenant on va essayer de faire sans cross-validation et hyperparameters tunning avec TF-IDF

In [39]:
# create a pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', tree.DecisionTreeClassifier())])
text_clf.fit(X_train, y_train)

# make predictions
predictions = text_clf.predict(X_test)

In [40]:
# evaluate the model
print(confusion_matrix(y_test, predictions))

# Print a classification report
print(classification_report(y_test, predictions))

# Print the overall accuracy
print(accuracy_score(y_test,predictions))

[[683 181]
 [234 424]]
              precision    recall  f1-score   support

           0       0.74      0.79      0.77       864
           1       0.70      0.64      0.67       658

    accuracy                           0.73      1522
   macro avg       0.72      0.72      0.72      1522
weighted avg       0.73      0.73      0.73      1522

0.7273324572930355


In [41]:
# create a pipeline for the other models
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', svm.LinearSVC())])
text_clf.fit(X_train, y_train)

# make predictions
predictions = text_clf.predict(X_test)

In [42]:
# evaluate the model
print(confusion_matrix(y_test, predictions))

# Print a classification report
print(classification_report(y_test, predictions))

# Print the overall accuracy
print(accuracy_score(y_test,predictions))

[[726 138]
 [202 456]]
              precision    recall  f1-score   support

           0       0.78      0.84      0.81       864
           1       0.77      0.69      0.73       658

    accuracy                           0.78      1522
   macro avg       0.78      0.77      0.77      1522
weighted avg       0.78      0.78      0.77      1522

0.7766097240473062


In [43]:
# create a pipeline for the other models
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', naive_bayes.MultinomialNB())])
text_clf.fit(X_train, y_train)

# make predictions
predictions = text_clf.predict(X_test)

In [44]:
# evaluate the model
print(confusion_matrix(y_test, predictions))

# Print a classification report
print(classification_report(y_test, predictions))

# Print the overall accuracy
print(accuracy_score(y_test,predictions))

[[769  95]
 [213 445]]
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       864
           1       0.82      0.68      0.74       658

    accuracy                           0.80      1522
   macro avg       0.80      0.78      0.79      1522
weighted avg       0.80      0.80      0.79      1522

0.797634691195795


In [45]:
# create a pipeline for the other models
text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', ensemble.RandomForestClassifier())])
text_clf.fit(X_train, y_train)

# make predictions
predictions = text_clf.predict(X_test)

In [46]:
# evaluate the model
print(confusion_matrix(y_test, predictions))

# Print a classification report
print(classification_report(y_test, predictions))

# Print the overall accuracy
print(accuracy_score(y_test,predictions))

[[767  97]
 [233 425]]
              precision    recall  f1-score   support

           0       0.77      0.89      0.82       864
           1       0.81      0.65      0.72       658

    accuracy                           0.78      1522
   macro avg       0.79      0.77      0.77      1522
weighted avg       0.79      0.78      0.78      1522

0.783180026281209


On peut voir :

- LinearSVC il y a une accuracy de 0.776
- DecisionTreeClassifier il y a une accuracy de 0.727
- MultinomialNB il y a une accuracy de 0.797
- RandomForestClassifier il y a une accuracy de 0.783

On peut voir qu'avec TF-IDF le plus performant sans cross-validation et hyperparameters tunning est le MultinomialNB

Maintenant on va essayer avec le tuning des hyperparameters

In [55]:
text_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', naive_bayes.MultinomialNB()),
])

parameters = {
    'clf__alpha': np.linspace(0.5, 1.5, 6),
    'clf__fit_prior': [True, False]
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)
print(gs_clf.best_estimator_)
print(gs_clf.score(X_test, y_test))

0.7997373012050144
{'clf__alpha': 0.7, 'clf__fit_prior': True}
Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', MultinomialNB(alpha=0.7))])
0.7996057818659659


In [56]:
text_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', svm.LinearSVC()),
])

parameters = {
    'clf__C': np.linspace(0.1, 1, 10),
    'clf__loss': ['hinge', 'squared_hinge']
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)
print(gs_clf.best_estimator_)
print(gs_clf.score(X_test, y_test))

0.8002291024930075
{'clf__C': 0.2, 'clf__loss': 'squared_hinge'}
Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', LinearSVC(C=0.2))])
0.7956636005256241


In [57]:
text_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', ensemble.RandomForestClassifier()),
])

parameters = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [10, 20, 30, 40, 50]
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)
print(gs_clf.best_estimator_)
print(gs_clf.score(X_test, y_test))

0.7708222188940745
{'clf__max_depth': 50, 'clf__n_estimators': 200}
Pipeline(steps=[('vect', TfidfVectorizer()),
                ('clf',
                 RandomForestClassifier(max_depth=50, n_estimators=200))])
0.7588699080157687


In [58]:
text_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', tree.DecisionTreeClassifier()),
])

parameters = {
    'clf__max_depth': [10, 20, 30, 40, 50]
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)
print(gs_clf.best_estimator_)
print(gs_clf.score(X_test, y_test))

0.7341897017215069
{'clf__max_depth': 50}
Pipeline(steps=[('vect', TfidfVectorizer()),
                ('clf', DecisionTreeClassifier(max_depth=50))])
0.7352168199737188


On peut voir :

- LinearSVC il y a une accuracy de 0.800
- DecisionTreeClassifier il y a une accuracy de 0.734
- MultinomialNB il y a une accuracy de 0.799
- RandomForestClassifier il y a une accuracy de 0.770

On peut voir qu'avec TF-IDF le plus performant sans cross-validation et hyperparameters tunning est le LinearSVC