In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Keyphrase-pair scores; the cells below use its `sample_id` and `score` columns.
KEYPHRASE_SCORES_CSV = "/kaggle/input/472-mlproject-application-v2-reshape-fix/keyphrase_scores.csv"
df = pd.read_csv(KEYPHRASE_SCORES_CSV)
df.head()

Unnamed: 0,anchor,target,sample_id,text,score
0,invest share,invest share,0,invest share[SEP]invest share,0.998756
1,invest share,share market,0,invest share[SEP]share market,0.000645
2,invest share,guide invest,0,invest share[SEP]guide invest,0.083592
3,invest share,invest,0,invest share[SEP]invest,0.443991
4,invest share,step guide,0,invest share[SEP]step guide,0.000148


In [3]:
# Per-question-pair results; the cells below use its `id`, `label` and `top_5_mean` columns.
QUORA_RESULTS_CSV = "/kaggle/input/472-mlproject-application-v2-reshape-fix/quora_results.csv"
gt = pd.read_csv(QUORA_RESULTS_CSV)
gt.head()

Unnamed: 0,id,question1,question2,label,top_5_mean
0,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0.912277,0.89179
1,1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0.655141,0.69446
2,2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0.515561,0.538707
3,3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0.104022,0.185442
4,4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0.325348,0.286883


In [4]:
from tqdm import tqdm

# Fixed width of every feature vector: samples with fewer scores are zero-padded
# on the right, samples with more are truncated, so the rows can always be stacked.
N_SCORES = 25

all_ids = df.sample_id.unique()

# Build the id -> label lookup once instead of scanning `gt` for every sample.
# keep='first' matches the previous `.values[0]` choice should an id be duplicated.
# A sample id missing from `gt` raises KeyError here (previously IndexError).
label_by_id = gt.drop_duplicates(subset='id', keep='first').set_index('id')['label']

inputs = []
labels = []
# A single groupby pass replaces one full-frame boolean mask per sample id.
# sort=False yields the groups in first-appearance order, i.e. the order of `all_ids`.
scores_by_sample = df.groupby('sample_id', sort=False)['score']
for sid, group in tqdm(scores_by_sample, total=len(all_ids)):
    scores = group.values[:N_SCORES]
    if len(scores) < N_SCORES:
        scores = np.pad(scores, (0, N_SCORES - len(scores)), 'constant')

    inputs.append(scores)
    labels.append(label_by_id.loc[sid])


100%|██████████| 19980/19980 [00:29<00:00, 687.03it/s]


In [5]:
# Convert the per-sample lists into arrays: stacked feature rows and a target vector.
inputs, labels = np.stack(inputs, axis=0), np.array(labels)

(inputs.shape, labels.shape)

((19980, 25), (19980,))

In [6]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples (scikit-learn's default split size, spelled out)
# using a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    labels,
    test_size=0.25,
    random_state=42,
)

(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((14985, 25), (4995, 25), (14985,), (4995,))

# MLP

In [7]:
%%time
# Seeded multilayer-perceptron regressor; `fit` returns the estimator itself,
# so construction and training are chained.
regr = MLPRegressor(max_iter=1000, random_state=42).fit(X_train, y_train)
# Displayed value is the estimator's `score` on the held-out split.
regr.score(X_test, y_test)

CPU times: user 8.52 s, sys: 6.28 s, total: 14.8 s
Wall time: 3.76 s


0.705151640112658

In [8]:
# Predict on the held-out split; the shape check confirms one prediction per test row.
preds = regr.predict(X_test)
preds.shape

(4995,)

In [9]:
# Mean absolute error between the held-out labels and the current `preds`.
np.abs(y_test - preds).mean()

0.11265334292113632

In [10]:
from joblib import dump, load
# Round-trip the fitted MLP through joblib: write it out, then load it back as `regr2`.
JOBLIB_MODEL_PATH = 'regressor_mlp.joblib'
dump(regr, JOBLIB_MODEL_PATH)
regr2 = load(JOBLIB_MODEL_PATH)

In [11]:
# Error of the reloaded model on the held-out split, for comparison with the in-memory model.
preds = regr2.predict(X_test)
np.abs(y_test - preds).mean()

0.11265334292113632

In [12]:
import pickle
# Save the model to disk. The context manager closes (and thereby flushes) the
# file deterministically instead of leaving the open handle to the garbage collector.
filename = 'regressor_mlp.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regr, model_file)

In [13]:
# Reload the pickled model, closing the file handle as soon as it has been read.
# Only unpickle files you wrote yourself: pickle.load can execute arbitrary code.
with open(filename, 'rb') as model_file:
    regr2 = pickle.load(model_file)
preds = regr2.predict(X_test)
np.mean(abs(y_test - preds))

0.11265334292113632

# Random Forest

In [14]:
%%time
from sklearn.ensemble import RandomForestRegressor

# Seeded 300-tree random forest. `fit` stays the cell's last expression so the
# fitted estimator is still displayed.
forest_regressor = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
)
forest_regressor.fit(X_train, y_train)

CPU times: user 1min 43s, sys: 428 ms, total: 1min 44s
Wall time: 1min 44s


RandomForestRegressor(n_estimators=300, random_state=42)

In [15]:
# Mean absolute error of the random forest on the held-out split.
preds = forest_regressor.predict(X_test)
np.abs(y_test - preds).mean()

0.11405343417483878

# SVR

In [16]:
%%time
from sklearn.svm import SVR

# RBF-kernel support vector regressor; `fit` returns the estimator, so it is chained.
svr_regressor = SVR(gamma='auto', kernel='rbf').fit(X_train, y_train)
preds = svr_regressor.predict(X_test)
# Mean absolute error on the held-out split.
np.abs(y_test - preds).mean()

CPU times: user 9.96 s, sys: 223 ms, total: 10.2 s
Wall time: 10.2 s


0.1185915736567972

In [17]:
%%time
from xgboost import XGBRegressor
# Gradient-boosted trees with row and column subsampling. `fit` stays the cell's
# last expression so the fitted estimator is still displayed.
model = XGBRegressor(
    n_estimators=1000,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)
model.fit(X_train, y_train)

CPU times: user 2min 24s, sys: 618 ms, total: 2min 25s
Wall time: 36.9 s


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
             early_stopping_rounds=None, enable_categorical=False, eta=0.1,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.100000001, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=1000,
             n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
             reg_alpha=0, ...)

In [18]:
# Mean absolute error of the XGBoost model on the held-out split.
preds = model.predict(X_test)
np.abs(y_test - preds).mean()

0.11339118352483933

# Top 5 mean

In [19]:
# Baseline: mean absolute gap between the precomputed top-N mean column and the
# label, taken over every row of `gt`.
# NOTE(review): the model errors in the earlier cells are measured on X_test only,
# so this figure covers a different set of rows — confirm the comparison is intended.
TOP_N_METRIC = 5
abs_diff = gt[f'top_{TOP_N_METRIC}_mean'].sub(gt['label']).abs()
mean_abs_diff = abs_diff.mean()
print(f'Mean difference: {mean_abs_diff:.3f}')

Mean difference: 0.142
