In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../raw')
# sys.path.clear()

In [16]:
import json, csv
import numpy as np
import pandas as pd

train_df = pd.read_csv('../raw/test.csv')
train_df[20:40:2]
train_df.shape

(3263, 4)

## fill missing values

In [3]:
y = train_df['target']
y.shape

(7613,)

In [7]:
fill_v = {'keyword': 'missing', 'location': 'unknown'}
filled_df = train_df.fillna(value=fill_v)
# filled_df = filled_df.drop(['target'], axis=1)

filled_df.insert(4, 'entities','',True)
filled_df.insert(5, 'labels','',True)
filled_df.head(3)

Unnamed: 0,id,keyword,location,text,entities,labels
0,0,missing,unknown,Just happened a terrible car crash,,
1,2,missing,unknown,"Heard about #earthquake is different cities, s...",,
2,3,missing,unknown,"there is a forest fire at spot pond, geese are...",,


## using `spacy`

In [8]:
import spacy
nlp = spacy.load("en_core_web_sm")

## add entities and labels

In [9]:
from tqdm import tqdm

ent_df = filled_df.copy()
for ri, row in tqdm(ent_df.iterrows()):
    doc = nlp(row['text'])
    ents = []
    labels = []
    for ent in doc.ents:
#         print(ent.text)
        if ent.text not in ents:
            ents.append(ent.text)
        if ent.label_ not in labels:
            labels.append(ent.label_)
            
    ent_df.at[ri, 'entities'] = ','.join(ents)
    ent_df.at[ri, 'labels'] = ','.join(labels)
        
#     ent_df.at[ri, 'entities'] += row['entities'] + ent.text+ ","
#     ent_df.at[ri, 'labels'] += row['labels'] + ent.label_+ ","
#     print()
#     if ri > 5:
#         break

3263it [00:52, 62.08it/s]


In [10]:
ent_df.head(5)

Unnamed: 0,id,keyword,location,text,entities,labels
0,0,missing,unknown,Just happened a terrible car crash,,
1,2,missing,unknown,"Heard about #earthquake is different cities, s...",about #earthquake,MONEY
2,3,missing,unknown,"there is a forest fire at spot pond, geese are...",geese,NORP
3,9,missing,unknown,Apocalypse lighting. #Spokane #wildfires,,
4,11,missing,unknown,Typhoon Soudelor kills 28 in China and Taiwan,"Typhoon Soudelor,28,China,Taiwan","ORG,CARDINAL,GPE"


In [11]:
nlp_df = ent_df.copy()
nlp_df.insert(6, 'ent_dep', '', True)
nlp_df.insert(7, 'ent_head', '', True)
nlp_df.insert(8, 'ent_pos', '', True)
nlp_df.insert(9, 'ent_children', '', True)
nlp_df.head(5)

Unnamed: 0,id,keyword,location,text,entities,labels,ent_dep,ent_head,ent_pos,ent_children
0,0,missing,unknown,Just happened a terrible car crash,,,,,,
1,2,missing,unknown,"Heard about #earthquake is different cities, s...",about #earthquake,MONEY,,,,
2,3,missing,unknown,"there is a forest fire at spot pond, geese are...",geese,NORP,,,,
3,9,missing,unknown,Apocalypse lighting. #Spokane #wildfires,,,,,,
4,11,missing,unknown,Typhoon Soudelor kills 28 in China and Taiwan,"Typhoon Soudelor,28,China,Taiwan","ORG,CARDINAL,GPE",,,,


In [12]:
for ri, row in nlp_df.iterrows():
    doc = nlp(row['text'])
    entities = [ent_tok.lower() for ent_tok in row['entities'].split(',') if ent_tok]
#     print(entities)
    deps = []
    heads =[]
    positions = [] 
    children = set()
    
    for token in doc:
        if token.text.lower() in entities:
#             print(token.text.lower())
            if token.dep_ not in deps:
                deps.append(token.dep_)
            if token.head.text not in heads:
                heads.append(token.head.text)
            if token.head.pos_ not in positions:
                positions.append(token.head.pos_)
                
            children = children.union([child.text for child in token.children])
            
#             nlp_df.at[ri, 'ent_dep'] = row['ent_dep'] + token.dep_ + ","
#             nlp_df.at[ri, 'ent_head'] += row['ent_head'] + token.head.text + ","
#             nlp_df.at[ri, 'ent_pos'] += row['ent_pos'] + token.head.pos_ + ","
#             nlp_df.at[ri, 'ent_children'] += row['ent_children'] + ','.join([child.text for child in token.children])+','
    nlp_df.at[ri, 'ent_dep'] = ','.join(deps)
    nlp_df.at[ri, 'ent_head'] = ','.join(heads)
    nlp_df.at[ri, 'ent_pos'] = ','.join(positions)
    nlp_df.at[ri, 'ent_children'] = ','.join(children)
            
#     if ri > 5:
#         break


In [13]:
nlp_df.head(15)

Unnamed: 0,id,keyword,location,text,entities,labels,ent_dep,ent_head,ent_pos,ent_children
0,0,missing,unknown,Just happened a terrible car crash,,,,,,
1,2,missing,unknown,"Heard about #earthquake is different cities, s...",about #earthquake,MONEY,,,,
2,3,missing,unknown,"there is a forest fire at spot pond, geese are...",geese,NORP,nsubj,fleeing,VERB,
3,9,missing,unknown,Apocalypse lighting. #Spokane #wildfires,,,,,,
4,11,missing,unknown,Typhoon Soudelor kills 28 in China and Taiwan,"Typhoon Soudelor,28,China,Taiwan","ORG,CARDINAL,GPE","dobj,pobj,conj","kills,in,China","VERB,ADP,PROPN","and,Taiwan"
5,12,missing,unknown,We're shaking...It's an earthquake,,,,,,
6,21,missing,unknown,They'd probably still show more life than Arse...,"Arsenal,yesterday","ORG,DATE","pobj,npadvmod","than,did","SCONJ,AUX",
7,22,missing,unknown,Hey! How are you?,,,,,,
8,27,missing,unknown,What a nice hat?,,,,,,
9,29,missing,unknown,Fuck off!,,,,,,


In [15]:
X = nlp_df.copy()

X.to_csv('../datasets/test-entity.csv')
X.shape

(3263, 10)

## training

In [17]:
df = pd.read_csv('../datasets/test-entity.csv', index_col=0)
df.head(5)

Unnamed: 0,id,keyword,location,text,entities,labels,ent_dep,ent_head,ent_pos,ent_children
0,0,missing,unknown,Just happened a terrible car crash,,,,,,
1,2,missing,unknown,"Heard about #earthquake is different cities, s...",about #earthquake,MONEY,,,,
2,3,missing,unknown,"there is a forest fire at spot pond, geese are...",geese,NORP,nsubj,fleeing,VERB,
3,9,missing,unknown,Apocalypse lighting. #Spokane #wildfires,,,,,,
4,11,missing,unknown,Typhoon Soudelor kills 28 in China and Taiwan,"Typhoon Soudelor,28,China,Taiwan","ORG,CARDINAL,GPE","dobj,pobj,conj","kills,in,China","VERB,ADP,PROPN","and,Taiwan"


In [18]:
fillers = {'entities':'None', 'labels':'None', 'ent_dep':'None', 'ent_head':'None', 'ent_pos':'None', 'ent_children':'None'}
df = df.fillna(value=fillers)
df.to_csv('../datasets/test-entity.csv', index=False)
df.head(5)

Unnamed: 0,id,keyword,location,text,entities,labels,ent_dep,ent_head,ent_pos,ent_children
0,0,missing,unknown,Just happened a terrible car crash,,,,,,
1,2,missing,unknown,"Heard about #earthquake is different cities, s...",about #earthquake,MONEY,,,,
2,3,missing,unknown,"there is a forest fire at spot pond, geese are...",geese,NORP,nsubj,fleeing,VERB,
3,9,missing,unknown,Apocalypse lighting. #Spokane #wildfires,,,,,,
4,11,missing,unknown,Typhoon Soudelor kills 28 in China and Taiwan,"Typhoon Soudelor,28,China,Taiwan","ORG,CARDINAL,GPE","dobj,pobj,conj","kills,in,China","VERB,ADP,PROPN","and,Taiwan"


## evaluation

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('../datasets/entities_nlp_train_hongyu.csv', index_col=0)

In [3]:
y = df['target']
y.shape

(7613,)

In [4]:
X = df.drop(['target'], axis=1)
X.columns

Index(['keyword', 'location', 'text', 'entities', 'labels', 'ent_dep',
       'ent_head', 'ent_pos', 'ent_children'],
      dtype='object')

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer()
vectorizer = CountVectorizer(ngram_range=(1, 1), analyzer='word')
x = vectorizer.fit_transform(X['labels'][:10])
# x.toarray()
vectorizer.get_feature_names()

['cardinal', 'fac', 'gpe', 'none', 'org', 'person']

In [5]:
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer

class LabeledNormalizer(Normalizer):
    def fit(self, X, *args, **kwargs):
        try:
            self.names = X.columns
        except:
            self.names = [str(i) for i in range(X.shape[1])]
        return super().fit(X, *args, **kwargs)
         
    def get_feature_names(self):
        return self.names
    
vec = ColumnTransformer([
#     ('norm', LabeledNormalizer(), ['id']),
    ('kw', TfidfVectorizer(ngram_range=(1, 1), min_df=2, token_pattern=r"(?u)\b\w+\b",), 'keyword'),
    ('loc', TfidfVectorizer(ngram_range=(1, 1), min_df=2, token_pattern=r"(?u)\b\w+\b"), 'location'),
    ('text', TfidfVectorizer(ngram_range=(1, 3), min_df=2, token_pattern=r"(?u)\b\w+\b"), 'text'),
    ('ent', CountVectorizer(ngram_range=(1, 1), analyzer='word', token_pattern=r"(?u)\b\w+\b"), 'entities'),
    ('label', CountVectorizer(ngram_range=(1, 1), analyzer='word', token_pattern=r"(?u)\b\w+\b"), 'labels'),
    ('dep', CountVectorizer(ngram_range=(1, 1), analyzer='word', token_pattern=r"(?u)\b\w+\b"), 'ent_dep'),
    ('head', CountVectorizer(ngram_range=(1, 1), analyzer='word', token_pattern=r"(?u)\b\w+\b"), 'ent_head'),
    ('pos', CountVectorizer(ngram_range=(1, 1), analyzer='word', token_pattern=r"(?u)\b\w+\b"), 'ent_pos'),
    ('child', CountVectorizer(ngram_range=(1, 1), analyzer='word', token_pattern=r"(?u)\b\w+\b"), 'ent_children'),
    
])

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
# clf = SVC(kernel='linear', probability=True)
clf = SVC(kernel='rbf', probability=True)
# clf = SVC(kernel='poly', probability=True)
# clf = SVC(kernel='sigmoid', probability=True)
# clf = LinearSVC(verbose=True)

from sklearn import preprocessing
scaler = preprocessing.StandardScaler(with_mean=False)

pipeline = make_pipeline(vec, scaler, clf)

# pipeline = make_pipeline(vec, clf)

## evaluation

In [6]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

def evaluate(_clf, X, y):
    report = classification_report(
        y_true=y, y_pred=cross_val_predict(pipeline, X, y, cv=5)
    )
    print(report)
    scores = cross_val_score(_clf, X, y, scoring='accuracy', cv=5)
    print('Cross-validation MSE: {:.3f} ± {:.3f}'.format(np.mean(scores), 2 * np.std(scores)))
    
    _clf.fit(X,y)
    print('Training Set Accuracy: {:.3f}'.format(_clf.score(X,y)))


import time
start_time = time.time()

evaluate(pipeline, X, y)

seconds = time.time() - start_time
print('\nEvaluation Time Taken: ', time.strftime("%H:%M:%S",time.gmtime(seconds)))

import time
start_time = time.time()

pipeline.fit(X,y)

seconds = time.time() - start_time
print('Training Time Taken:  ', time.strftime("%H:%M:%S",time.gmtime(seconds)))

              precision    recall  f1-score   support

           0       0.64      0.96      0.77      4342
           1       0.85      0.30      0.44      3271

    accuracy                           0.67      7613
   macro avg       0.75      0.63      0.61      7613
weighted avg       0.73      0.67      0.63      7613

Cross-validation MSE: 0.675 ± 0.035
Training Set Accuracy: 0.954

Evaluation Time Taken:  00:17:35
Training Time Taken:   00:02:12


## linear kernel
```
              precision    recall  f1-score   support

           0       0.65      0.68      0.67      4342
           1       0.55      0.52      0.54      3271

    accuracy                           0.61      7613
   macro avg       0.60      0.60      0.60      7613
weighted avg       0.61      0.61      0.61      7613

Cross-validation MSE: 0.612 ± 0.087
Training Set Accuracy: 0.975

Evaluation Time Taken: 00:10:52
Training Time Taken:   00:01:41
```

## rbf kernel
```
              precision    recall  f1-score   support

           0       0.65      0.85      0.74      4342
           1       0.67      0.40      0.50      3271

    accuracy                           0.66      7613
   macro avg       0.66      0.63      0.62      7613
weighted avg       0.66      0.66      0.64      7613

Cross-validation MSE: 0.657 ± 0.070
Training Set Accuracy: 0.902

Evaluation Time Taken: 00:09:49
Training Time Taken:   00:01:21
```

### after scaling
```
              precision    recall  f1-score   support

           0       0.64      0.96      0.77      4342
           1       0.85      0.30      0.44      3271

    accuracy                           0.67      7613
   macro avg       0.75      0.63      0.61      7613
weighted avg       0.73      0.67      0.63      7613

Cross-validation MSE: 0.675 ± 0.035
Training Set Accuracy: 0.954

Evaluation Time Taken:  00:17:35
Training Time Taken:   00:02:12
```

## polynomial kernel
```
              precision    recall  f1-score   support

           0       0.64      0.92      0.75      4342
           1       0.74      0.30      0.43      3271

    accuracy                           0.65      7613
   macro avg       0.69      0.61      0.59      7613
weighted avg       0.68      0.65      0.61      7613

Cross-validation MSE: 0.655 ± 0.035
Training Set Accuracy: 0.873

Evaluation Time Taken: 00:11:37
Training Time Taken:   00:01:15
```

## sigmoid kernel
```
              precision    recall  f1-score   support

           0       0.63      0.69      0.66      4342
           1       0.53      0.45      0.49      3271

    accuracy                           0.59      7613
   macro avg       0.58      0.57      0.57      7613
weighted avg       0.58      0.59      0.58      7613

Cross-validation MSE: 0.590 ± 0.039
Training Set Accuracy: 0.768

Evaluation Time Taken:  00:09:07
Training Time Taken:   00:01:09
```

In [24]:
from sklearn.externals import joblib
pipeline.fit(X,y)

entity_mdl = '../models/nlp-entity-linear-svc.pkl'
joblib.dump(pipeline, entity_mdl)
!ls -lSh $entity_mdl

-rw-r--r-- 1 hongyu hongyu 8.2M Mar 21 12:50 ../models/nlp-entity-linear-svc.pkl


In [22]:
from sklearn.externals import joblib

entity_rbf_mdl = '../models/nlp-entity-rbf-svc.pkl'
joblib.dump(pipeline, entity_rbf_mdl)
!ls -lSh $entity_rbf_mdl

-rw-r--r-- 1 hongyu hongyu 8.9M Mar 21 12:48 ../models/nlp-entity-rbf-svc.pkl


In [27]:
from sklearn.externals import joblib

entity_poly_mdl = '../models/nlp-entity-poly-svc.pkl'
joblib.dump(pipeline, entity_poly_mdl)
!ls -lSh $entity_poly_mdl

-rw-r--r-- 1 hongyu hongyu 9.1M Mar 21 13:19 ../models/nlp-entity-poly-svc.pkl


# Conclusion:
1. Entity-features extracted by the NLP library do make a positive effect on prediction;
2. However, based on the performance of the linear kernel, it seems that the feature space is no longer linearly separatable;
3. As a corollary, non-linear kernels performed better than the linear one;
4. However, the performance of sigmoid kernel is exceptionaly bad;

# Future works:
1. We should consider continuing adding new features;
2. Improve performance by tuning hyper-parameters is worth to try. 