In [1]:
import json, csv
import numpy as np
import pandas as pd

# Read the entity-annotated training set, then preview every second row
# from position 20 up to (but excluding) position 30.
df = pd.read_csv(filepath_or_buffer='../datasets/entities_nlp_train_hongyu.csv')
df.iloc[20:30:2]

Unnamed: 0,id,keyword,location,text,entities,labels,ent_dep,ent_head,ent_pos,ent_children,target
20,31,missing,unknown,this is ridiculous....,,,,,,,0
22,33,missing,unknown,Love skiing,,,,,,,0
24,36,missing,unknown,LOOOOOOL,,,,,,,0
26,38,missing,unknown,Was in NYC last week!,"NYC,last week","ORG,DATE",pobj,in,ADP,,0
28,40,missing,unknown,Cooool :),,,,,,,0


## evaluation

In [2]:
# Labels to predict: the `target` column of the training frame.
y = df.loc[:, 'target']
y.shape

(7613,)

In [3]:
# Feature frame: every column of `df` except the label.
X = df.drop(columns=['target'])
X.columns

Index(['id', 'keyword', 'location', 'text', 'entities', 'labels', 'ent_dep',
       'ent_head', 'ent_pos', 'ent_children'],
      dtype='object')

In [4]:
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer

class LabeledNormalizer(Normalizer):
    """Normalizer that remembers the names of the columns it was fitted on.

    The names are captured in ``fit`` and exposed through
    ``get_feature_names`` so the step can report feature names like the
    text vectorizers used alongside it.
    """

    def fit(self, X, *args, **kwargs):
        """Record the column names of ``X`` and delegate to ``Normalizer.fit``.

        If ``X`` exposes a ``columns`` attribute (e.g. a DataFrame) it is
        stored as-is; otherwise positional names ``"0"``, ``"1"``, ... are
        generated from ``X.shape[1]``.

        Returns whatever ``Normalizer.fit`` returns.
        """
        try:
            self.names = X.columns
        except AttributeError:
            # Only a missing `.columns` attribute should trigger the fallback;
            # a bare `except:` would also hide unrelated errors and interrupts.
            self.names = [str(i) for i in range(X.shape[1])]
        return super().fit(X, *args, **kwargs)

    def get_feature_names(self):
        """Return the names captured by the most recent ``fit`` call."""
        return self.names
    
# Token pattern shared by every vectorizer below: any run of word
# characters, so single-character tokens are kept.
token_re = r"(?u)\b\w+\b"

vec = ColumnTransformer(
    [
        # ('norm', LabeledNormalizer(), ['id']),
        # TF-IDF over the free-text columns; terms must occur in at least 2 documents.
        ('kw', TfidfVectorizer(token_pattern=token_re, min_df=2, ngram_range=(1, 1)), 'keyword'),
        ('loc', TfidfVectorizer(token_pattern=token_re, min_df=2, ngram_range=(1, 1)), 'location'),
        ('text', TfidfVectorizer(token_pattern=token_re, min_df=2, ngram_range=(1, 3)), 'text'),
    ]
    + [
        # Plain unigram counts for each entity-derived column.
        (step, CountVectorizer(token_pattern=token_re, analyzer='word', ngram_range=(1, 1)), column)
        for step, column in [
            ('ent', 'entities'),
            ('label', 'labels'),
            ('dep', 'ent_dep'),
            ('head', 'ent_head'),
            ('pos', 'ent_pos'),
            ('child', 'ent_children'),
        ]
    ]
)

# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(2)

# from sklearn.svm import LinearSVC
# from sklearn.svm import SVC
# clf = SVC(kernel='linear', probability=True)
# clf = SVC(kernel='rbf', probability=True)

from sklearn.ensemble import RandomForestClassifier
# Alternative tried: a small 20-tree forest with the default max_features.
# clf = RandomForestClassifier(n_estimators=20, random_state=0)
# Active model: 200 trees; random_state is fixed so repeated runs give the same forest.
clf = RandomForestClassifier(n_estimators=200, max_features=250, random_state=0) # cap max_features to limit overfitting
# Alternative tried: same forest with a looser cap of 1000 features.
# clf = RandomForestClassifier(n_estimators=200, max_features=1000, random_state=0)


# End-to-end model: column-wise vectorization followed by the classifier.
pipeline = make_pipeline(vec, clf)
# Variant with polynomial feature expansion between the two steps (disabled).
# pipeline = make_pipeline(vec, poly, clf)


In [5]:
import time

# Fit the whole pipeline on the full training set and report the wall-clock time.
start_time = time.time()
pipeline.fit(X, y)
seconds = time.time() - start_time

elapsed = time.gmtime(seconds)
print('Training Time Taken:  ', time.strftime("%H:%M:%S", elapsed))

Training Time Taken:   00:00:42


## extend to 767869266 features (Training Time Taken:   00:15:48)

In [8]:
# pipeline[1].get_feature_names()
# pipeline[1].n_output_features_
# Number of input features seen by the forest (the pipeline's second step).
# `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of `n_features_in_`; prefer the new attribute and fall back on old releases.
forest = pipeline[1]
forest.n_features_in_ if hasattr(forest, 'n_features_in_') else forest.n_features_

38278

In [13]:
from sklearn.externals import joblib

feature_mdl = '../models/extend-770M-features.pkl'
joblib.dump(pipeline, feature_mdl)
!ls -lSh $feature_mdl



-rw-rw-r-- 1 hhe450 hhe450 79M Mar 26 19:07 ../models/extend-770M-features.pkl


## evaluation

In [19]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

def evaluate(_clf, X, y, cv=5):
    """Print cross-validated and training-set metrics for a classifier.

    Parameters
    ----------
    _clf : estimator
        Scikit-learn style estimator or pipeline. It is refit on all of
        ``X``/``y`` before the training-set score is computed, so it is
        left fitted on the full data when the function returns.
    X, y : features and labels passed straight to the scikit-learn helpers.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_predict`` and ``cross_val_score``.

    Returns
    -------
    None. All results are printed.

    Notes
    -----
    Cross-validation is run twice: once to collect out-of-fold predictions
    for the classification report, once to collect per-fold accuracy.
    """
    predictions = cross_val_predict(_clf, X, y, cv=cv)
    print(classification_report(y_true=y, y_pred=predictions))

    scores = cross_val_score(_clf, X, y, scoring='accuracy', cv=cv)
    # The scorer is accuracy (previously mislabelled as MSE); the spread shown
    # is two standard deviations across folds.
    print('Cross-validation accuracy: {:.3f} ± {:.3f}'.format(np.mean(scores), 2 * np.std(scores)))

    _clf.fit(X, y)
    print('Training Set Accuracy: {:.3f}'.format(_clf.score(X, y)))


import time

# Time the full evaluation: cross-validation plus the refit done inside evaluate().
start_time = time.time()
evaluate(pipeline, X, y)
seconds = time.time() - start_time
elapsed = time.gmtime(seconds)
print('\nEvaluation Time Taken: ', time.strftime("%H:%M:%S", elapsed))

# Time a plain fit on the full training set for comparison.
start_time = time.time()
pipeline.fit(X, y)
seconds = time.time() - start_time
elapsed = time.gmtime(seconds)
print('Training Time Taken:  ', time.strftime("%H:%M:%S", elapsed))

              precision    recall  f1-score   support

           0       0.67      0.90      0.77      4342
           1       0.76      0.41      0.54      3271

    accuracy                           0.69      7613
   macro avg       0.72      0.66      0.65      7613
weighted avg       0.71      0.69      0.67      7613

Cross-validation MSE: 0.693 ± 0.058
Training Set Accuracy: 0.995

Evaluation Time Taken:  00:05:21
Training Time Taken:   00:00:39


## random forest

### after limiting the features to 100:
```
              precision    recall  f1-score   support

           0       0.66      0.92      0.77      4342
           1       0.77      0.38      0.51      3271

    accuracy                           0.69      7613
   macro avg       0.72      0.65      0.64      7613
weighted avg       0.71      0.69      0.66      7613

Cross-validation MSE: 0.687 ± 0.055
Training Set Accuracy: 0.995

Evaluation Time Taken:  00:05:00
```

### after limiting the features to 250:
```
              precision    recall  f1-score   support

           0       0.67      0.90      0.77      4342
           1       0.76      0.41      0.54      3271

    accuracy                           0.69      7613
   macro avg       0.72      0.66      0.65      7613
weighted avg       0.71      0.69      0.67      7613

Cross-validation MSE: 0.693 ± 0.058
Training Set Accuracy: 0.995

Evaluation Time Taken:  00:05:21
Training Time Taken:   00:00:39
```

### after limiting the features to 500:
```
              precision    recall  f1-score   support

           0       0.68      0.89      0.77      4342
           1       0.76      0.43      0.55      3271

    accuracy                           0.70      7613
   macro avg       0.72      0.66      0.66      7613
weighted avg       0.71      0.70      0.68      7613

Cross-validation MSE: 0.697 ± 0.054
Training Set Accuracy: 0.995

Evaluation Time Taken:  00:05:27
Training Time Taken:   00:00:41
```

### after limiting the features to 1k:
```
              precision    recall  f1-score   support

           0       0.68      0.88      0.77      4342
           1       0.74      0.45      0.56      3271

    accuracy                           0.70      7613
   macro avg       0.71      0.67      0.66      7613
weighted avg       0.71      0.70      0.68      7613

Cross-validation MSE: 0.696 ± 0.054
Training Set Accuracy: 0.995

Evaluation Time Taken:  00:06:12
Training Time Taken:   00:00:45
```

### 200 trees (original features):
```
              precision    recall  f1-score   support

           0       0.67      0.91      0.77      4342
           1       0.77      0.41      0.53      3271

    accuracy                           0.69      7613
   macro avg       0.72      0.66      0.65      7613
weighted avg       0.71      0.69      0.67      7613

Cross-validation MSE: 0.692 ± 0.066
Training Set Accuracy: 0.995

Evaluation Time Taken:  00:05:43
Training Time Taken:   00:00:45
```

### 20 trees (original features):
```
              precision    recall  f1-score   support

           0       0.66      0.90      0.76      4342
           1       0.75      0.39      0.51      3271

    accuracy                           0.68      7613
   macro avg       0.71      0.65      0.64      7613
weighted avg       0.70      0.68      0.66      7613

Cross-validation MSE: 0.682 ± 0.060
Training Set Accuracy: 0.987

Evaluation Time Taken:  00:00:49
Training Time Taken:   00:00:05
```

## rbf kernel

```
              precision    recall  f1-score   support

           0       0.65      0.85      0.74      4342
           1       0.67      0.40      0.50      3271

    accuracy                           0.66      7613
   macro avg       0.66      0.63      0.62      7613
weighted avg       0.66      0.66      0.64      7613

Cross-validation MSE: 0.657 ± 0.070
Training Set Accuracy: 0.902

Evaluation Time Taken: 00:09:49
Training Time Taken:   00:01:21
```
### SVC with rbf kernel (770M features)
```
              precision    recall  f1-score   support

           0       0.65      0.87      0.74      4342
           1       0.69      0.38      0.49      3271

    accuracy                           0.66      7613
   macro avg       0.67      0.62      0.62      7613
weighted avg       0.67      0.66      0.63      7613
```

# Conclusion:
1. Blindly extending the feature space doesn't help, even with a non-linear kernel;
2. After extending the features, training the random forest takes forever;
3. Limiting the number of features considered gives a noticeable performance gain;