In [2]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [3]:
import time
import pickle

import pandas as pd
import numpy as np

from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer

import mlflow

In [4]:
run_name = str(int(time.time()))
print('Run name: ', run_name)

Run name:  1621023102


## Reading the data

In [5]:
# train_data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
# test_data = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [6]:
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

In [7]:
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [8]:
print('The length of the training data is %d' % len(train_data))
print('The length of the test data is %d' % len(test_data))

The length of the training data is 7613
The length of the test data is 3263


## Feature Engineering

### Replacing NaN with empty string.

In [9]:
train_data = train_data.replace(np.nan, '', regex=True)
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


### Replacing %20 with space

In [10]:
train_data = train_data.replace('%20', ' ', regex=True)
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [11]:
train_data['keyword'].value_counts()

                       61
fatalities             45
armageddon             42
deluge                 42
damage                 41
                       ..
forest fire            19
epicentre              12
threat                 11
inundation             10
radiation emergency     9
Name: keyword, Length: 222, dtype: int64

### Creating unique word using the values in the keyword column.

In [12]:
train_data['keyword_unique'] = train_data['keyword'].apply(lambda word : 'x' + word + 'x' if len(word) else '')

In [13]:
train_data['keyword'].iloc[40]

'ablaze'

In [14]:
train_data['keyword_unique'].iloc[40]

'xablazex'

## Text preprocessing

In [15]:
def clean_text(text):
    # split into words
    tokens = word_tokenize(text)
    # remove all tokens that are not alphanumeric. Can also use .isalpha() here if do not want to keep numbers.
    words = [word for word in tokens if word.isalnum()]
    # remove stopwords
    stop_words = stopwords.words('english')
    words = [word for word in words if word not in stop_words]
    # performing lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    # Converting list of words to string
    words = ' '.join(words)
    return words

In [16]:
train_data['cleaned_text'] = train_data['text'].apply(clean_text)

In [17]:
train_data['text'].iloc[100]

'.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad'

In [18]:
train_data['cleaned_text'].iloc[100]

'NorwayMFA Bahrain police previously died road accident killed explosion http'

In [19]:
train_data['keyword_unique'].iloc[100]

'xaccidentx'

In [20]:
train_data['target'].iloc[100]

1

## Merging the keyword_unique and text column

In [21]:
train_data.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'keyword_unique',
       'cleaned_text'],
      dtype='object')

In [22]:
train_data['keyword_unique_cleaned_text'] = train_data['keyword_unique'] + ' ' + train_data['cleaned_text']

In [23]:
train_data['keyword_unique'].iloc[190]

'xambulancex'

In [24]:
train_data['cleaned_text'].iloc[190]

'http Twelve feared killed Pakistani air ambulance helicopter crash http'

In [25]:
train_data['keyword_unique_cleaned_text'].iloc[190]

'xambulancex http Twelve feared killed Pakistani air ambulance helicopter crash http'

In [26]:
train_data['keyword_unique_cleaned_text']

0         Our Deeds Reason earthquake May ALLAH Forgive u
1                   Forest fire near La Ronge Sask Canada
2        All resident asked place notified officer No ...
3        people receive wildfire evacuation order Cali...
4        Just got sent photo Ruby Alaska smoke wildfir...
                              ...                        
7608     Two giant crane holding bridge collapse nearb...
7609     TheTawniest The control wild fire California ...
7610                        UTC 5km S Volcano Hawaii http
7611     Police investigating collided car Little Port...
7612     The Latest More Homes Razed Northern Californ...
Name: keyword_unique_cleaned_text, Length: 7613, dtype: object

In [27]:
train_data['keyword_unique_cleaned_text'].values

array([' Our Deeds Reason earthquake May ALLAH Forgive u',
       ' Forest fire near La Ronge Sask Canada',
       ' All resident asked place notified officer No evacuation shelter place order expected',
       ..., ' UTC 5km S Volcano Hawaii http',
       ' Police investigating collided car Little Portugal rider suffered serious threatening injury',
       ' The Latest More Homes Razed Northern California Wildfire ABC News http'],
      dtype=object)

## Tf-idf features

In [28]:
tfidf_vectorizer = TfidfVectorizer(decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=100, vocabulary=None, binary=False, norm='l2', smooth_idf=True)

In [29]:
train_data_tfidf = tfidf_vectorizer.fit_transform(train_data['keyword_unique_cleaned_text'])
train_data_tfidf

<7613x100 sparse matrix of type '<class 'numpy.float64'>'
	with 15936 stored elements in Compressed Sparse Row format>

In [30]:
tfidf_vectorizer.get_feature_names()[:10]

['accident',
 'amp',
 'and',
 'as',
 'attack',
 'back',
 'body',
 'bomb',
 'building',
 'burning']

## Baseline model

## SVC cross validation.

In [31]:
svc = SVC(C=1.0, kernel='poly', degree=5, gamma='scale', coef0=0.0, shrinking=True, probability=True, tol=1e-3, cache_size=500, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=42)


In [32]:
cv_results = cross_validate(svc, train_data_tfidf, train_data['target'], return_train_score=True, n_jobs=-1)
cv_results

{'fit_time': array([5.27161098, 5.13530016, 5.16436005, 5.28551006, 5.34749508]),
 'score_time': array([0.14501405, 0.13326597, 0.13284707, 0.14554501, 0.14316297]),
 'test_score': array([0.65462902, 0.60013132, 0.6126067 , 0.61038108, 0.64914586]),
 'train_score': array([0.80082102, 0.8180624 , 0.82216749, 0.81267444, 0.80857002])}

## Pipeline

In [33]:
text_clf = Pipeline(
    [
        ('tfidf_vect', TfidfVectorizer(decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, vocabulary=None, dtype=np.float64, norm='l2', smooth_idf=True)),
        ('clf', SVC(gamma='scale', coef0=0.0, shrinking=True, probability=True, tol=1e-3, cache_size=500, verbose=False, max_iter=-1, random_state=42)),
    ]
)

In [34]:
parameters = {
    'tfidf_vect__ngram_range': [(1, 1), (1, 2), (1, 2, 3)],
    'tfidf_vect__max_df': [0.6, 0.8],
    'tfidf_vect__min_df': [0.0, 0.1],
    'tfidf_vect__max_features': [10, 100, 500],
    'tfidf_vect__binary': [True, False],
    'tfidf_vect__use_idf': [True, False],
    'tfidf_vect__sublinear_tf': [True, False],
    'clf__C': [1, 10],
    'clf__kernel': ['poly', 'rbf'],
    'clf__degree': [3, 5],
    'clf__class_weight': ['balanced', None]
}

In [35]:
strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [36]:
gs_clf = GridSearchCV(text_clf, parameters, cv = strat_k_fold, n_jobs=-1, verbose = 10)

In [67]:
gs_clf = gs_clf.fit(train_data['keyword_unique_cleaned_text'], train_data['target'])

Fitting 5 folds for each of 4608 candidates, totalling 23040 fits


In [37]:
gs_clf = pickle.load(open('/Volumes/Area51/Kaggle/nlp_with_disaster_tweets/gs_clf.pkl', 'rb'))

In [38]:
gs_clf.best_estimator_

Pipeline(steps=[('tfidf_vect',
                 TfidfVectorizer(max_df=0.6, max_features=500, min_df=0.0)),
                ('clf',
                 SVC(C=1, cache_size=500, probability=True, random_state=42))])

In [39]:
gs_clf.best_score_

0.781954145071238

In [40]:
gs_clf.classes_

array([0, 1])

In [41]:
gs_clf.best_index_

1634

In [42]:
gs_clf.best_params_

{'clf__C': 1,
 'clf__class_weight': None,
 'clf__degree': 3,
 'clf__kernel': 'rbf',
 'tfidf_vect__binary': False,
 'tfidf_vect__max_df': 0.6,
 'tfidf_vect__max_features': 500,
 'tfidf_vect__min_df': 0.0,
 'tfidf_vect__ngram_range': (1, 1),
 'tfidf_vect__sublinear_tf': False,
 'tfidf_vect__use_idf': True}

In [43]:
tfidf_vectorizer = TfidfVectorizer(decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, ngram_range=(1, 1), max_df=0.6, min_df=0.0, max_features=500, vocabulary=None, binary=False, dtype=np.float64, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

In [44]:
train_data_tfidf = tfidf_vectorizer.fit_transform(train_data['keyword_unique_cleaned_text'])
train_data_tfidf

<7613x500 sparse matrix of type '<class 'numpy.float64'>'
	with 33700 stored elements in Compressed Sparse Row format>

In [45]:
tfidf_vectorizer.get_feature_names()[:10]

['2015',
 '40',
 '70',
 'accident',
 'accidentx',
 'after',
 'air',
 'airplane',
 'all',
 'also']

In [46]:
svc = SVC(C=1, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=True, tol=1e-3, cache_size=500, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=42)


In [47]:
cv_results = cross_validate(svc, train_data_tfidf, train_data['target'], return_train_score=True, n_jobs=-1)
cv_results

{'fit_time': array([8.03279209, 7.67498517, 7.76982212, 7.90120101, 8.16133094]),
 'score_time': array([0.27571321, 0.28474784, 0.28194284, 0.28488088, 0.29657435]),
 'test_score': array([0.69796454, 0.57124097, 0.62967827, 0.61695138, 0.71616294]),
 'train_score': array([0.87947455, 0.8863711 , 0.88489327, 0.88228534, 0.87604663])}

In [48]:
cv_results = cross_validate(gs_clf.best_estimator_, train_data['keyword_unique_cleaned_text'], train_data['target'], cv=strat_k_fold, return_train_score=True, n_jobs=-1)
cv_results

{'fit_time': array([8.12541723, 8.04615188, 8.00885677, 8.06134772, 8.21598196]),
 'score_time': array([0.27530074, 0.2766211 , 0.2776053 , 0.2753191 , 0.28297496]),
 'test_score': array([0.79251477, 0.7800394 , 0.75640184, 0.78120894, 0.79960578]),
 'train_score': array([0.87553366, 0.87914614, 0.87865353, 0.87883763, 0.87653916])}

In [49]:
gs_clf.best_score_

0.781954145071238

In [50]:
scoring_function_f1 = make_scorer(f1_score, pos_label=1, average='binary')

In [51]:
cv_results = cross_validate(estimator=gs_clf.best_estimator_, X=train_data['keyword_unique_cleaned_text'], y=train_data['target'], scoring=scoring_function_f1, cv=strat_k_fold, return_train_score=True, n_jobs=-1)
cv_results

{'fit_time': array([7.82014894, 7.67784786, 7.71276212, 7.83777881, 7.94395709]),
 'score_time': array([0.29004002, 0.29981303, 0.30108094, 0.28211832, 0.28850913]),
 'test_score': array([0.72852234, 0.71489362, 0.68953975, 0.71465296, 0.74261603]),
 'train_score': array([0.84168755, 0.84653878, 0.8467123 , 0.84764657, 0.8439834 ])}

In [52]:
def mean_sd_cv_results(cv_results, metric='F1'):
    print(f"{metric} Train CV results: {cv_results['train_score'].mean().round(3)} +- {cv_results['train_score'].std().round(3)}")
    print(f"{metric} Val CV results: {cv_results['test_score'].mean().round(3)} +- {cv_results['test_score'].std().round(3)}")

In [53]:
mean_sd_cv_results(cv_results)

F1 Train CV results: 0.845 +- 0.002
F1 Val CV results: 0.718 +- 0.018


In [54]:
with open('gs_clf.pkl', 'wb') as f:
    pickle.dump(gs_clf, f, pickle.HIGHEST_PROTOCOL)

In [59]:
! jupyter nbconvert --to html real-or-not-disaster-tweets-modeling-SVC.ipynb

[NbConvertApp] Converting notebook real-or-not-disaster-tweets-modeling-SVC.ipynb to html
[NbConvertApp] Writing 661692 bytes to real-or-not-disaster-tweets-modeling-SVC.html


In [61]:
# MLflow logging.
with mlflow.start_run(run_name=run_name) as run:
    # Modeling exp.
    mlflow.set_tag('Modeling technique', 'SVC')
    # Hyperparameter search parameters.
    mlflow.log_dict(parameters, 'hyperparameters_search_space_gs_cv.yaml')
    # GridSearch classifier object store in the pickle file.
    mlflow.sklearn.log_model(sk_model=gs_clf, artifact_path='gs_cv_clf_sklearn')
    # GS CV Best parameters.
    mlflow.log_dict(gs_clf.best_params_, 'best_params_gs_cv.yaml')
    # mean F1-score - train.
    mlflow.log_metric('mean F1-score - train', cv_results['train_score'].mean().round(3))
    # mean F1-score - val.
    mlflow.log_metric('mean F1-score - val', cv_results['test_score'].mean().round(3))
    # std F1-score - train.
    mlflow.log_metric('std F1-score - train', cv_results['train_score'].std().round(3))
    # std F1-score - val.
    mlflow.log_metric('std F1-score - val', cv_results['test_score'].std().round(3))
    # Nb.
    mlflow.log_artifact('real-or-not-disaster-tweets-modeling-SVC.ipynb', artifact_path='Notebook')
    # Nb in HTML.
    mlflow.log_artifact('real-or-not-disaster-tweets-modeling-SVC.html', artifact_path='Notebook')