In [1]:
import json, csv
import numpy as np
import pandas as pd

# Load the training CSV, then preview every second row from positions 20 to 29.
df = pd.read_csv('../datasets/entities_nlp_train_hongyu.csv')
df.iloc[20:30:2]

Unnamed: 0,id,keyword,location,text,entities,labels,ent_dep,ent_head,ent_pos,ent_children,target
20,31,missing,unknown,this is ridiculous....,,,,,,,0
22,33,missing,unknown,Love skiing,,,,,,,0
24,36,missing,unknown,LOOOOOOL,,,,,,,0
26,38,missing,unknown,Was in NYC last week!,"NYC,last week","ORG,DATE",pobj,in,ADP,,0
28,40,missing,unknown,Cooool :),,,,,,,0


## Redo this using Alex's code

In [2]:
# How often each keyword occurs; the placeholder value 'missing' is counted like any other.
df.keyword.value_counts()

missing                  61
fatalities               45
deluge                   42
armageddon               42
sinking                  41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 222, dtype: int64

In [12]:
from collections import Counter
# Pair every keyword with its row's target, then show the eleven most frequent pairs.
kv_df = [(kw, tgt) for kw, tgt in zip(df['keyword'], df['target'])]
Counter(kv_df).most_common(11)

[(('missing', 1), 42),
 (('body%20bags', 0), 40),
 (('derailment', 1), 39),
 (('outbreak', 1), 39),
 (('wreckage', 1), 39),
 (('armageddon', 0), 37),
 (('debris', 1), 37),
 (('harm', 0), 37),
 (('oil%20spill', 1), 37),
 (('typhoon', 1), 37),
 (('deluge', 0), 36)]

## Replace missing keywords with the mode value

In [13]:
# Impute the placeholder keyword 'missing' with a most-frequent real keyword for
# the row's class, as read off the (keyword, target) counts shown above.
# NOTE(review): the replacement is selected by `target`, so the label leaks into
# the `keyword` feature and the rule cannot be applied to unlabeled rows --
# confirm this is acceptable for the experiment.
MODE_KEYWORD_BY_TARGET = {0: 'body%20bags', 1: 'derailment'}

df_mode = df.copy()
keyword_is_missing = df['keyword'] == 'missing'
for target_value, mode_keyword in MODE_KEYWORD_BY_TARGET.items():
    # Boolean-mask assignment updates all matching rows at once instead of
    # visiting every row with iterrows(); rows with any other target are untouched.
    rows_to_fill = keyword_is_missing & (df['target'] == target_value)
    df_mode.loc[rows_to_fill, 'keyword'] = mode_keyword

df_mode.head(3)

Unnamed: 0,id,keyword,location,text,entities,labels,ent_dep,ent_head,ent_pos,ent_children,target
0,1,derailment,unknown,Our Deeds are the Reason of this #earthquake M...,Deeds,PERSON,nsubj,are,AUX,Our,1
1,4,derailment,unknown,Forest fire near La Ronge Sask. Canada,"La Ronge Sask,Canada","FAC,GPE",ROOT,Canada,PROPN,,1
2,5,derailment,unknown,All residents asked to 'shelter in place' are ...,,,,,,,1


In [22]:
# Alternative imputation: every 'missing' keyword becomes 'fatalities', whatever the target.
# This rebinds `df_mode`, so the per-target replacement built earlier is discarded.
df_mode = df.assign(keyword=df['keyword'].replace('missing', 'fatalities'))
df_mode['keyword'].value_counts()

fatalities               106
deluge                    42
armageddon                42
body%20bags               41
damage                    41
                        ... 
forest%20fire             19
epicentre                 12
threat                    11
inundation                10
radiation%20emergency      9
Name: keyword, Length: 221, dtype: int64

In [14]:
# Write the imputed frame to disk, leaving out the index column.
df_mode.to_csv(path_or_buf='../datasets/replace-mode.csv', index=False)

## evaluation

In [15]:
# Labels: the `target` column of the imputed frame.
y = df_mode.loc[:, 'target']
y.shape

(7613,)

In [16]:
# Features: every column except the label.
X = df_mode.drop(columns=['target'])
X.columns

Index(['id', 'keyword', 'location', 'text', 'entities', 'labels', 'ent_dep',
       'ent_head', 'ent_pos', 'ent_children'],
      dtype='object')

In [18]:
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer

class LabeledNormalizer(Normalizer):
    """``Normalizer`` that remembers the names of the columns it was fitted on.

    The names are captured in ``fit`` and returned by ``get_feature_names``.
    """

    def fit(self, X, *args, **kwargs):
        """Record feature names for ``X``, then delegate to ``Normalizer.fit``.

        If ``X`` exposes a ``columns`` attribute, that value is stored as
        ``self.names``. Otherwise the names are the stringified column
        positions ``'0'``, ``'1'``, ... derived from the second dimension of ``X``.

        Extra positional and keyword arguments are forwarded unchanged.
        Returns whatever the parent ``fit`` returns.
        """
        # Only a missing `columns` attribute triggers the positional fallback;
        # the previous bare `except:` hid every other error as well.
        columns = getattr(X, 'columns', None)
        if columns is not None:
            self.names = columns
        else:
            # np.shape also copes with array-likes that lack `.shape`
            # (for example a list of lists).
            self.names = [str(i) for i in range(np.shape(X)[1])]
        return super().fit(X, *args, **kwargs)

    def get_feature_names(self):
        """Return the names recorded by the most recent ``fit`` call."""
        return self.names
    
# Token pattern shared by all vectorisers: runs of one or more word characters.
TOKEN_PATTERN = r"(?u)\b\w+\b"


def _ngram_tfidf():
    """Return a new TF-IDF vectoriser over 1- to 3-grams with ``min_df=2``."""
    return TfidfVectorizer(ngram_range=(1, 3), min_df=2, token_pattern=TOKEN_PATTERN)


def _unigram_tfidf():
    """Return a new word-level TF-IDF vectoriser restricted to unigrams."""
    return TfidfVectorizer(ngram_range=(1, 1), analyzer='word', token_pattern=TOKEN_PATTERN)


# (step name, source column) pairs, in the order the transformers are applied.
NGRAM_STEPS = [('kw', 'keyword'), ('loc', 'location'), ('text', 'text')]
UNIGRAM_STEPS = [
    ('ent', 'entities'),
    ('label', 'labels'),
    ('dep', 'ent_dep'),
    ('head', 'ent_head'),
    ('pos', 'ent_pos'),
    ('child', 'ent_children'),
]

# Each column gets its own vectoriser instance.
# The ('norm', LabeledNormalizer(), ['id']) step is currently disabled.
vec = ColumnTransformer(
    [(step, _ngram_tfidf(), column) for step, column in NGRAM_STEPS]
    + [(step, _unigram_tfidf(), column) for step, column in UNIGRAM_STEPS]
)

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
# Classifier under test: an RBF-kernel SVC with probability estimates enabled.
# Alternatives kept for reference:
#   SVC(kernel='linear', probability=True)
#   SVC(kernel='poly', probability=True)
#   LinearSVC(verbose=True)
svc_options = {'kernel': 'rbf', 'probability': True}
clf = SVC(**svc_options)

# Full model: per-column vectorisation followed by the classifier.
pipeline = make_pipeline(vec, clf)

## evaluation

In [19]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

def evaluate(_clf, X, y):
    """Print cross-validated and training-set metrics for ``_clf``.

    Parameters
    ----------
    _clf : estimator
        Passed to ``cross_val_predict`` and ``cross_val_score``; must also
        provide ``fit`` and ``score``. It is fitted on all of ``X``/``y`` as a
        side effect of this call.
    X, y
        Features and labels handed unchanged to the scikit-learn helpers.

    Returns
    -------
    None
        All results are printed: a classification report built from 5-fold
        out-of-fold predictions, the mean 5-fold accuracy with a band of two
        standard deviations, and the accuracy on the training data.
    """
    # Evaluate the estimator that was passed in, not the notebook-level
    # `pipeline`, so the report matches the scores printed below.
    out_of_fold_pred = cross_val_predict(_clf, X, y, cv=5)
    print(classification_report(y_true=y, y_pred=out_of_fold_pred))

    # This is a second, independent 5-fold pass over the data.
    scores = cross_val_score(_clf, X, y, scoring='accuracy', cv=5)
    # scoring='accuracy', so the figure is an accuracy rather than an MSE.
    print('Cross-validation accuracy: {:.3f} ± {:.3f}'.format(np.mean(scores), 2 * np.std(scores)))

    _clf.fit(X, y)
    print('Training Set Accuracy: {:.3f}'.format(_clf.score(X, y)))


import time
from contextlib import contextmanager


@contextmanager
def report_elapsed(label):
    """Time the wrapped block and print ``label`` followed by the duration as HH:MM:SS.

    Nothing is printed if the wrapped block raises.
    """
    # perf_counter is monotonic, so the measurement is unaffected by
    # system clock adjustments.
    started = time.perf_counter()
    yield
    elapsed = time.perf_counter() - started
    # gmtime/strftime formatting wraps around after 24 hours.
    print(label, time.strftime("%H:%M:%S", time.gmtime(elapsed)))


with report_elapsed('\nEvaluation Time Taken: '):
    evaluate(pipeline, X, y)

# evaluate() has already fitted `pipeline`; it is fitted again here only to
# time a single training run.
with report_elapsed('Training Time Taken:  '):
    pipeline.fit(X, y)

              precision    recall  f1-score   support

           0       0.66      0.79      0.72      4342
           1       0.62      0.46      0.53      3271

    accuracy                           0.65      7613
   macro avg       0.64      0.62      0.62      7613
weighted avg       0.64      0.65      0.64      7613

Cross-validation MSE: 0.647 ± 0.086
Training Set Accuracy: 0.909

Evaluation Time Taken:  00:10:46
Training Time Taken:   00:01:19


## rbf kernel


### replace keyword by mode
```
              precision    recall  f1-score   support

           0       0.66      0.79      0.72      4342
           1       0.62      0.46      0.53      3271

    accuracy                           0.65      7613
   macro avg       0.64      0.62      0.62      7613
weighted avg       0.64      0.65      0.64      7613

Cross-validation MSE: 0.647 ± 0.086
Training Set Accuracy: 0.909

Evaluation Time Taken:  00:10:46
Training Time Taken:   00:01:19
```

### using tf-idf
```
              precision    recall  f1-score   support

           0       0.66      0.78      0.71      4342
           1       0.61      0.46      0.52      3271

    accuracy                           0.64      7613
   macro avg       0.63      0.62      0.62      7613
weighted avg       0.64      0.64      0.63      7613

Cross-validation MSE: 0.642 ± 0.089
Training Set Accuracy: 0.908

Evaluation Time Taken:  00:09:35
Training Time Taken:   00:01:13
```

### before
```
              precision    recall  f1-score   support

           0       0.65      0.85      0.74      4342
           1       0.67      0.40      0.50      3271

    accuracy                           0.66      7613
   macro avg       0.66      0.63      0.62      7613
weighted avg       0.66      0.66      0.64      7613

Cross-validation MSE: 0.657 ± 0.070
Training Set Accuracy: 0.902

Evaluation Time Taken: 00:09:49
Training Time Taken:   00:01:21
```


## polynomial kernel
### before
```
              precision    recall  f1-score   support

           0       0.64      0.92      0.75      4342
           1       0.74      0.30      0.43      3271

    accuracy                           0.65      7613
   macro avg       0.69      0.61      0.59      7613
weighted avg       0.68      0.65      0.61      7613

Cross-validation MSE: 0.655 ± 0.035
Training Set Accuracy: 0.873

Evaluation Time Taken: 00:11:37
Training Time Taken:   00:01:15
```

# Conclusion:
1. Replacing the missing keyword with its mode doesn't help.
2. For categorical features like (multi-)labels, it is indeed better to use direct counting rather than tf-idf.