In [1]:
import pandas as pd
import torch
from textblob import TextBlob
from scipy import spatial
import pickle
import spacy
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
nlp = spacy.load('en_core_web_sm')

### Load dict embeddings from the files to a dictionary

In [181]:
with open('data/dict_emb1.pickle','rb') as f:
    d1 = pickle.load(f)

In [182]:
with open('data/dict_emb2.pickle','rb') as f:
    d2 = pickle.load(f)

In [183]:
dict_emb = dict(d1)
dict_emb.update(d2)

In [184]:
del d1,d2

### Load infersent model and dataframe
The dataframe was created in dataset_parser.py from the **SQuAD 2.0 dataset.**

In [186]:
infersent = torch.load('models/infersent_trained.pt')

In [187]:
df = pd.read_csv('data/train.csv')

### Add relevant columns to the dataframe

In [188]:
def get_target(x):
    idx = -1
    for i in range(len(x["sentences"])):
        if x["text"] in x["sentences"][i]: idx = i
    return idx

In [198]:
def get_cosine_sim(x):
    question = x['question']
    question_emb = dict_emb[question]
    li = [spatial.distance.cosine(question_emb,dict_emb[sentence]) for sentence in x['sentences']]
    return li + [1] * (10-len(li))

In [190]:
df['sentences'] = df['context'].apply(lambda x: [item.raw for item in TextBlob(x).sentences])

In [191]:
df = df[df['sentences'].map(len) <= 10]

In [192]:
df = df.dropna().reset_index(drop=True)

In [193]:
df['distances'] = df.apply(get_cosine_sim, axis=1)

### Getting all the unique sentences and adding them to a dictionary
The dictionary's values are the roots of each sentence

In [195]:
sentences = df["sentences"].reset_index(drop= True).tolist()
s = set()
for i in sentences:
    for j in i:
        s.add(j)

In [196]:
sent_roots = {}
for sent in s:
    sent_nlp = nlp(sent)
    sent_roots[sent] = [chunk.root.head.lemma_ for chunk in sent_nlp.noun_chunks]

In [197]:
def match_roots(x):
    question = x["question"]
    sentences = x['sentences']
    question_root = [sent.root.lemma_ for sent in nlp(question).sents][0]
    li = [int(question_root in sent_roots[sent]) for sent in sentences]
    return li + [0]*(10 - len(li))

### Create all the features
Afterwards, save them to features.csv

In [199]:
def create_features(data):
    columns = [f'column_root_{i}' for i in range(10)]
    root_df = pd.DataFrame(data.apply(match_roots, axis= 1).tolist(), columns = columns)
    
    print('Finished creating root columns!')
    
    columns = [f'column_cos_{i}' for i in range(10)]
    cos_df = pd.DataFrame(data.apply(get_cosine_sim, axis = 1).tolist(), columns = columns)
    print('Finisehd creating distances columns!')
    
    train = pd.concat([root_df, cos_df], axis=1, sort=False)
    train['target'] = data.apply(get_target,axis = 1)
    
    return train

In [205]:
train = create_features(df)

Finished creating root columns!
Finisehd creating distances columns!


In [207]:
train.shape

(84346, 21)

In [208]:
train.to_csv("data/features.csv", index = None)

### Create train/test data and trying different models

In [11]:
X = train.drop(['target'],axis = 1)
y = train['target']

In [12]:
train_x, test_x, train_y, test_y = train_test_split(X, y, train_size=0.8, random_state = 5)

In [21]:
rf = RandomForestClassifier(min_samples_leaf=8, n_estimators=200)
rf.fit(train_x, train_y)

print("Random Forest Train Accuracy : ", metrics.accuracy_score(train_y, rf.predict(train_x)))
print("Random Forest Test Accuracy : ", metrics.accuracy_score(test_y, rf.predict(test_x)))

Random Forest Train Accuracy :  0.7377734305530855
Random Forest Test Accuracy :  0.6816834617664493


In [24]:
model = xgb.XGBClassifier()
param_dist = {"max_depth": [3,5,10],
              "min_child_weight" : [1,5,10],
              "learning_rate": [0.07, 0.1,0.2],
               }
# run randomized search
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, verbose=5)
grid_search.fit(train_x, train_y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] learning_rate=0.07, max_depth=3, min_child_weight=1 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  learning_rate=0.07, max_depth=3, min_child_weight=1, score=0.680, total=   9.3s
[CV] learning_rate=0.07, max_depth=3, min_child_weight=1 .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.2s remaining:    0.0s


[CV]  learning_rate=0.07, max_depth=3, min_child_weight=1, score=0.680, total=   9.9s
[CV] learning_rate=0.07, max_depth=3, min_child_weight=1 .............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   19.1s remaining:    0.0s


[CV]  learning_rate=0.07, max_depth=3, min_child_weight=1, score=0.680, total=   9.7s
[CV] learning_rate=0.07, max_depth=3, min_child_weight=5 .............


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   28.9s remaining:    0.0s


[CV]  learning_rate=0.07, max_depth=3, min_child_weight=5, score=0.680, total=   9.4s
[CV] learning_rate=0.07, max_depth=3, min_child_weight=5 .............


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   38.2s remaining:    0.0s


[CV]  learning_rate=0.07, max_depth=3, min_child_weight=5, score=0.680, total=   9.9s
[CV] learning_rate=0.07, max_depth=3, min_child_weight=5 .............
[CV]  learning_rate=0.07, max_depth=3, min_child_weight=5, score=0.679, total=   9.5s
[CV] learning_rate=0.07, max_depth=3, min_child_weight=10 ............
[CV]  learning_rate=0.07, max_depth=3, min_child_weight=10, score=0.680, total=   9.6s
[CV] learning_rate=0.07, max_depth=3, min_child_weight=10 ............
[CV]  learning_rate=0.07, max_depth=3, min_child_weight=10, score=0.680, total=   9.9s
[CV] learning_rate=0.07, max_depth=3, min_child_weight=10 ............
[CV]  learning_rate=0.07, max_depth=3, min_child_weight=10, score=0.679, total=  10.4s
[CV] learning_rate=0.07, max_depth=5, min_child_weight=1 .............
[CV]  learning_rate=0.07, max_depth=5, min_child_weight=1, score=0.687, total=  14.5s
[CV] learning_rate=0.07, max_depth=5, min_child_weight=1 .............
[CV]  learning_rate=0.07, max_depth=5, min_child_weight

[CV]  learning_rate=0.2, max_depth=3, min_child_weight=5, score=0.684, total=  10.9s
[CV] learning_rate=0.2, max_depth=3, min_child_weight=5 ..............
[CV]  learning_rate=0.2, max_depth=3, min_child_weight=5, score=0.686, total=  10.3s
[CV] learning_rate=0.2, max_depth=3, min_child_weight=5 ..............
[CV]  learning_rate=0.2, max_depth=3, min_child_weight=5, score=0.685, total=  10.9s
[CV] learning_rate=0.2, max_depth=3, min_child_weight=10 .............
[CV]  learning_rate=0.2, max_depth=3, min_child_weight=10, score=0.684, total=  11.1s
[CV] learning_rate=0.2, max_depth=3, min_child_weight=10 .............
[CV]  learning_rate=0.2, max_depth=3, min_child_weight=10, score=0.686, total=  11.0s
[CV] learning_rate=0.2, max_depth=3, min_child_weight=10 .............
[CV]  learning_rate=0.2, max_depth=3, min_child_weight=10, score=0.685, total=  12.0s
[CV] learning_rate=0.2, max_depth=5, min_child_weight=1 ..............
[CV]  learning_rate=0.2, max_depth=5, min_child_weight=1, sco

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed: 22.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=False,

### Taking the best parameters for XGBoost

In [25]:
grid_search.best_estimator_

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=10, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [32]:
xg = xgb.XGBClassifier(learning_rate=0.1,max_depth=5)
xg.fit(train_x, train_y)

print("XGB Train Accuracy : ", metrics.accuracy_score(train_y, xg.predict(train_x)))
print("XGB Test Accuracy : ", metrics.accuracy_score(test_y, xg.predict(test_x)))


XGB Train Accuracy :  0.7139871954472702
XGB Test Accuracy :  0.6953764078245406
