In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import f1_score
from scipy import stats

In [None]:
# Plotly imports used for the interactive evaluation curves.
from plotly.offline import init_notebook_mode
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
# Initialise plotly's notebook mode before any figure is rendered.
init_notebook_mode(connected=True)

In [None]:
# Relative paths of the CSV files holding the train and dev splits.
train = '../data/temp/train.csv'
dev = '../data/temp/dev.csv'

##### Load the train and dev datasets

In [None]:
# Read both CSV splits (paths held in `train` and `dev`) into DataFrames.
train_df, dev_df = (pd.read_csv(csv_path) for csv_path in (train, dev))

In [None]:
# Display the loaded training frame as the cell output.
train_df

In [None]:
# Pull the two text columns and the label column out of the training frame.
texts1 = train_df['text1']
texts2 = train_df['text2']
Y_train = train_df['same'].values
# Both text columns laid end to end in one flat array.
texts = np.hstack((texts1.values, texts2.values))
# Release the reference to the frame; everything needed has been extracted.
train_df = None

In [None]:
# Display the combined text array (text1 values followed by text2 values).
texts

In [None]:
# Display the training labels taken from the 'same' column.
Y_train

In [None]:
# TF-IDF over character n-grams of length 1 to 6 ('char_wb' analyzer),
# L1-normalised rows, capped at 5000 features; min_df=0.1 is a float, so it is
# a document-frequency proportion rather than an absolute count.
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 6),
    min_df=0.1,
    max_features=5000,
    norm='l1',
    vocabulary=None,
)

In [None]:
# Learn the n-gram vocabulary and IDF weights from every training text.
vectorizer.fit(texts)
# StandardScaler operates on numeric features, so it has to be fitted on the
# TF-IDF matrix (the same representation later passed to scaler.transform),
# not on the raw strings. with_mean=False skips centering so the sparse matrix
# can be scaled without being densified.
scaler = StandardScaler(with_mean=False)
scaler.fit(vectorizer.transform(texts))

In [None]:
# Vectorise and scale each side of every training pair.
x1 = scaler.transform(vectorizer.transform(texts1))
x2 = scaler.transform(vectorizer.transform(texts2))
# Pair feature = element-wise absolute difference of the two text vectors.
# .toarray() yields a plain ndarray; .todense() would return np.matrix, which
# current scikit-learn estimators refuse as input.
X_train = np.abs(x1 - x2).toarray()
# Drop the references to the raw text columns; they are no longer needed.
texts1 = None
texts2 = None

## Logistic Regression

In [None]:
# Randomised search over the regularisation strength C, scored by ROC AUC.
clf = LogisticRegression(solver='lbfgs', max_iter=500)
# The lbfgs solver only supports the L2 penalty. Sampling 'l1' as well makes
# those candidates fail to fit and wastes part of the search budget, so the
# penalty is pinned to 'l2' (kept as a key so best_params_ still reports it).
distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2'])  # C ~ U[0, 4]
param_clf = RandomizedSearchCV(clf, distributions, random_state=0, verbose=2, scoring='roc_auc')
search = param_clf.fit(X_train, Y_train)
search.best_params_

In [None]:
# Fit the logistic-regression model on the full training set with a fixed C.
clf = LogisticRegression(
    solver='lbfgs',
    C=0.226,
    max_iter=5000,
    verbose=True,
)
clf.fit(X_train, Y_train)

#### Test on development data

In [None]:
# Pull the two text columns and the label column out of the dev frame.
texts1 = dev_df['text1']
texts2 = dev_df['text2']
Y_dev = dev_df['same'].values
# Both dev text columns laid end to end in one flat array.
texts = np.hstack((texts1.values, texts2.values))

In [None]:
# Apply the already-fitted vectorizer and scaler to each side of the dev pairs.
x1 = scaler.transform(vectorizer.transform(texts1))
x2 = scaler.transform(vectorizer.transform(texts2))
# Element-wise absolute difference as the pair feature. .toarray() gives a
# plain ndarray; .todense() would return np.matrix, which current scikit-learn
# estimators refuse as input.
X_dev = np.abs(x1 - x2).toarray()

In [None]:
# Score of the second class in clf.classes_ for every dev pair
# (presumably same == 1 -- confirm against the label encoding).
preds = clf.predict_proba(X_dev)[:, 1]

# --- ROC curve -------------------------------------------------------------
print('FPR-TPR Curve')
fpr, tpr, thresh = roc_curve(Y_dev, preds)
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=fpr,
    y=tpr,
    text=thresh,  # hover text: decision threshold at each point
    mode='lines'
))
fig.update_layout(
    title='ROC curve',
    xaxis_title='False positive rate',
    yaxis_title='True positive rate',
)
fig.show()
print(roc_auc)

# --- Precision-recall curve ------------------------------------------------
print('P-R Curve')
precision, recall, thresholds = precision_recall_curve(Y_dev, preds)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=recall,
    y=precision,
    text=np.array(thresholds).astype(str)  # hover text: thresholds as strings
))
fig.update_layout(
    title='Precision-recall curve',
    xaxis_title='Recall',
    yaxis_title='Precision',
)
fig.show()
print('AUC: ', auc(recall, precision))

In [None]:
# Hard class predictions on the dev set, then the macro-averaged F1 score.
preds = clf.predict(X_dev)
f1_score(y_true=Y_dev, y_pred=preds, average='macro')

In [None]:
# Display the raw precision, recall and threshold arrays as a tuple.
precision, recall, thresholds

### SVM

In [None]:
# Base SVM for the hyper-parameter search; probability=True enables
# predict_proba, and the seed is fixed.
clf = SVC(random_state=1, probability=True)

In [None]:
# Randomised search over the SVM hyper-parameters.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    "C": stats.uniform(2, 10),       # C ~ U[2, 12]
    "gamma": stats.uniform(0.1, 1),  # gamma ~ U[0.1, 1.1]
}
# random_state fixes which candidates are sampled, so a fresh
# Restart-and-Run-All evaluates the same 15 settings every time.
rand_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=15,
    n_jobs=-1,
    random_state=0,
)
rand_search.fit(X_train, Y_train)

rand_search.best_params_

In [None]:
# Fit the SVM on the full training set with fixed C and gamma.
# probability=True calibrates probabilities with an internal, randomised
# cross-validation; seeding it keeps predict_proba repeatable between runs.
clf = SVC(probability=True, C=4.33, gamma=1.07, random_state=1)
clf.fit(X_train, Y_train)

In [None]:
# Pull the two text columns and the label column out of the dev frame.
texts1 = dev_df['text1']
texts2 = dev_df['text2']
Y_dev = dev_df['same'].values
# Both dev text columns laid end to end in one flat array.
texts = np.hstack((texts1.values, texts2.values))

In [None]:
# Apply the already-fitted vectorizer and scaler to each side of the dev pairs.
x1 = scaler.transform(vectorizer.transform(texts1))
x2 = scaler.transform(vectorizer.transform(texts2))
# Element-wise absolute difference as the pair feature. .toarray() gives a
# plain ndarray; .todense() would return np.matrix, which current scikit-learn
# estimators refuse as input.
X_dev = np.abs(x1 - x2).toarray()

In [None]:
# Score of the second class in clf.classes_ for every dev pair
# (presumably same == 1 -- confirm against the label encoding).
preds = clf.predict_proba(X_dev)[:, 1]

# --- ROC curve -------------------------------------------------------------
print('FPR-TPR Curve')
fpr, tpr, thresh = roc_curve(Y_dev, preds)
roc_auc = auc(fpr, tpr)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=fpr,
    y=tpr,
    text=thresh,  # hover text: decision threshold at each point
    mode='lines'
))
fig.update_layout(
    title='ROC curve',
    xaxis_title='False positive rate',
    yaxis_title='True positive rate',
)
fig.show()
print(roc_auc)

# --- Precision-recall curve ------------------------------------------------
print('P-R Curve')
precision, recall, thresholds = precision_recall_curve(Y_dev, preds)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=recall,
    y=precision,
    text=np.array(thresholds).astype(str)  # hover text: thresholds as strings
))
fig.update_layout(
    title='Precision-recall curve',
    xaxis_title='Recall',
    yaxis_title='Precision',
)
fig.show()
print('AUC: ', auc(recall, precision))

In [None]:
# Hard class predictions on the dev set, then the macro-averaged F1 score.
preds = clf.predict(X_dev)
f1_score(y_true=Y_dev, y_pred=preds, average='macro')