In [2]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Imports

In [12]:
import pandas as pd
import numpy as np

from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

## Reading the data

In [7]:
# train_data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
# test_data = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [8]:
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

In [9]:
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [10]:
print('The length of the training data is %d' % len(train_data))
print('The length of the test data is %d' % len(test_data))

The length of the training data is 7613
The length of the test data is 3263


## Feature Engineering

### Replacing NaN with empty string.

In [13]:
train_data = train_data.replace(np.nan, '', regex=True)
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


### Replacing %20 with space

In [14]:
train_data = train_data.replace('%20', ' ', regex=True)
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [15]:
train_data['keyword'].value_counts()

                       61
fatalities             45
armageddon             42
deluge                 42
body bags              41
                       ..
forest fire            19
epicentre              12
threat                 11
inundation             10
radiation emergency     9
Name: keyword, Length: 222, dtype: int64

### Creating unique word using the values in the keyword column.

In [16]:
train_data['keyword_unique'] = train_data['keyword'].apply(lambda word : 'x' + word + 'x' if len(word) else '')

In [17]:
train_data['keyword'].iloc[40]

'ablaze'

In [18]:
train_data['keyword_unique'].iloc[40]

'xablazex'

## Text preprocessing

In [19]:
def clean_text(text):
    # split into words
    tokens = word_tokenize(text)
    # remove all tokens that are not alphanumeric. Can also use .isalpha() here if do not want to keep numbers.
    words = [word for word in tokens if word.isalnum()]
    # remove stopwords
    stop_words = stopwords.words('english')
    words = [word for word in words if word not in stop_words]
    # performing lemmatization
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(word) for word in words]
    # Converting list of words to string
    words = ' '.join(words)
    return words

## Kaggle session comments:

1. Consider creating a class for pre_processing 
* It initialized the WordNet lemmatizer - 
* Then in the apply function call the class .preprocess function
* It will be much faster.
* Class Pre_processor - different functions for different processing steps - scikit-learn pipelines it's useful to split those functions in multiple classes. In scikit-learn multiple of those classes can be a component and we can see output of those components - when using scikit-learn - separate classes for functional components - for e.g. removing stopwords.
* One class of everything can be done.

2. Lemmatization is word normalization - root form - is slower than stemming.

In [24]:
train_data['cleaned_text'] = train_data['text'].apply(clean_text)

In [25]:
train_data['text'].iloc[100]

'.@NorwayMFA #Bahrain police had previously died in a road accident they were not killed by explosion https://t.co/gFJfgTodad'

In [26]:
train_data['cleaned_text'].iloc[100]

'NorwayMFA Bahrain police previously died road accident killed explosion http'

In [27]:
train_data['keyword_unique'].iloc[100]

'xaccidentx'

In [28]:
train_data['target'].iloc[100]

1

## Merging the keyword_unique and text column

In [29]:
train_data.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'keyword_unique',
       'cleaned_text'],
      dtype='object')

In [30]:
train_data['keyword_unique_cleaned_text'] = train_data['keyword_unique'] + ' ' + train_data['cleaned_text']

In [31]:
train_data['keyword_unique'].iloc[190]

'xambulancex'

In [32]:
train_data['cleaned_text'].iloc[190]

'http Twelve feared killed Pakistani air ambulance helicopter crash http'

In [33]:
train_data['keyword_unique_cleaned_text'].iloc[190]

'xambulancex http Twelve feared killed Pakistani air ambulance helicopter crash http'

In [34]:
train_data['keyword_unique_cleaned_text']

0         Our Deeds Reason earthquake May ALLAH Forgive u
1                   Forest fire near La Ronge Sask Canada
2        All resident asked place notified officer No ...
3        people receive wildfire evacuation order Cali...
4        Just got sent photo Ruby Alaska smoke wildfir...
                              ...                        
7608     Two giant crane holding bridge collapse nearb...
7609     TheTawniest The control wild fire California ...
7610                        UTC 5km S Volcano Hawaii http
7611     Police investigating collided car Little Port...
7612     The Latest More Homes Razed Northern Californ...
Name: keyword_unique_cleaned_text, Length: 7613, dtype: object

## Tf-idf features

In [37]:
tfidf_vectorizer = TfidfVectorizer(max_features=100)
train_data_tfidf = tfidf_vectorizer.fit_transform(train_data['keyword_unique_cleaned_text'])
train_data_tfidf.shape

(7613, 100)

In [40]:
tfidf_vectorizer.get_feature_names()[:10]

['accident',
 'amp',
 'and',
 'as',
 'attack',
 'back',
 'body',
 'bomb',
 'building',
 'burning']

## Baseline model

## Linear SVC cross validation.

In [41]:
svc = LinearSVC()

In [42]:
cv_results = cross_validate(svc, train_data_tfidf, train_data['target'], return_train_score=True, n_jobs=-1)
cv_results

{'fit_time': array([0.0183208 , 0.0177958 , 0.01758409, 0.01740599, 0.01769972]),
 'score_time': array([0.00064015, 0.00066519, 0.00075293, 0.00069404, 0.00072336]),
 'test_score': array([0.6651346 , 0.62902167, 0.63558766, 0.63994744, 0.72470434]),
 'train_score': array([0.71215107, 0.73004926, 0.73136289, 0.72204892, 0.70891479])}

Kaggle session comments:
1. Variation in the test score - cross validation
* shuffling can play a role
* larger the dataset - lesser the variation


## Pipeline

In [43]:
text_clf = Pipeline(
    [
        ('tfidf_vect', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

In [45]:
parameters = {
    'tfidf_vect__strip_accents': ('unicode', None),
    'tfidf_vect__ngram_range': [(1, 1), (1, 2), (1, 2, 3), (2, 3), (2, 2)],
    'tfidf_vect__max_df': [0.6, 0.8, 0.95],
    'tfidf_vect__min_df': [0.0, 0.1, 0.25],
    'tfidf_vect__max_features': [100, 500, 1000, 10000],
    'tfidf_vect__binary': [True, False],
    'tfidf_vect__norm': ['l1', 'l2'],
    'tfidf_vect__use_idf': [True, False],
    'tfidf_vect__sublinear_tf': [True, False],
    'clf__dual': [True, False],
    'clf__C': [1, 10, 100],
    'clf__class_weight': ['balanced', None],
    'clf__random_state': [42]
}

In [48]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, verbose = 2)

In [49]:
gs_clf = gs_clf.fit(train_data['keyword_unique_cleaned_text'], train_data['target'])

Fitting 5 folds for each of 69120 candidates, totalling 345600 fits
Exception in thread Thread-5:
Traceback (most recent call last):
  File "/Users/mihir/opt/anaconda3/envs/nlp_disaster_tweets/lib/python3.9/threading.py", line 954, in _bootstrap_inner
    self.run()
  File "/Users/mihir/opt/anaconda3/envs/nlp_disaster_tweets/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 567, in run
    self.flag_executor_shutting_down()
  File "/Users/mihir/opt/anaconda3/envs/nlp_disaster_tweets/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 756, in flag_executor_shutting_down
    self.kill_workers()
  File "/Users/mihir/opt/anaconda3/envs/nlp_disaster_tweets/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 766, in kill_workers
    recursive_terminate(p)
  File "/Users/mihir/opt/anaconda3/envs/nlp_disaster_tweets/lib/python3.9/site-packages/joblib/externals/loky/backend/utils.py", line 28, in recursive_terminate
  

KeyboardInterrupt: 

Soufiane:
1. Seaborn plot