# 1. Data preprocessing

In [7]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
import re
from sklearn.metrics import f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read data
df1 = pd.read_json('../Data/domain1_train_data.json', lines=True)
df2 = pd.read_json('../Data/domain2_train_data.json', lines=True)

# Define Domains
df1['domain'], df2['domain'] = 1, 2

# Split set 1
x1_tr, x_dv = train_test_split(df1, stratify=df1['label'], random_state=0, test_size=0.2)
# Split set 2
x2_1 = df2[df2['label'] == 1].sample(500, random_state=0)
x2_0 = df2[df2['label'] == 0].sample(500, random_state=0)
x2_tr = df2[[i not in list(pd.concat([x2_1, x2_0]).reset_index()['index']) for i in df2.index]].reset_index(drop=True)
x2_dev = pd.concat([x2_1,x2_0]).reset_index(drop=True)

# Train test sets
x_train = pd.concat([x1_tr, x2_tr]).sample(frac=1).reset_index(drop=True)
x_dev_ = pd.concat([x_dv, x2_dev]).sample(frac=1).reset_index(drop=True)

# Transform numbers to strings to treat them as tokens
txt1 = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in x1_tr['text']]
txt2 = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in x2_tr['text']]
aug_txt = [re.sub(',', '', ', '.join([str(x) for x in tok])) for tok in x_train['text']]

# Final train test sets
x_tr_aug, y_train = aug_txt, x_train['label']
x_dev_, y_dev = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in x_dev_['text']], np.array(x_dev_['label'].astype('float32'))

# Print classes proportion
print(round(y_train.value_counts()/len(y_train),2))

label
0    0.81
1    0.19
Name: count, dtype: float64


# 2. Encoding

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Tokens encoding to sparce matrix (Countv)
Cvec = CountVectorizer(ngram_range=(8,9))
x_train_c = Cvec.fit_transform(x_tr_aug)
x_dev_c = Cvec.transform(x_dev_)

In [4]:
x_train_c.shape

(16000, 5154787)

### Logistic model

In [8]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC

# Train models
cl_aug_c = RidgeClassifier(alpha=1.0, solver="sparse_cg").fit(x_train_c, y_train)
cl_aug_c_w = RidgeClassifier(class_weight = "balanced", alpha=1.0, solver="sparse_cg").fit(x_train_c, y_train)

In [11]:
# Data augmentation scores
sc_aug_c_tr, sc_aug_c_ts = cl_aug_c.score(x_train_c, y_train), cl_aug_c.score(x_dev_c, y_dev)
sc_aug_c_w_tr, sc_aug_c_w_ts = cl_aug_c_w.score(x_train_c, y_train), cl_aug_c_w.score(x_dev_c, y_dev)
# F1 scores
f1_aug_c_tr, f1_aug_c_ts = f1_score(y_train, cl_aug_c.predict(x_train_c)), f1_score(y_dev, cl_aug_c.predict(x_dev_c))
f1_aug_c_w_tr, f1_aug_c_w_ts = f1_score(y_train, cl_aug_c_w.predict(x_train_c)), f1_score(y_dev, cl_aug_c_w.predict(x_dev_c))

# Print scores
print(f'Model -DA using  Cvec:\t\ttr_acc: {sc_aug_c_tr:.3f}\tval_acc: {sc_aug_c_ts:.3f}\ttr_F1" {f1_aug_c_tr:.3f}\tts_F1" {f1_aug_c_ts:.3f}')
print(f'Model -DA using  Cvec-w:\ttr_acc: {sc_aug_c_w_tr:.3f}\tval_acc: {sc_aug_c_w_ts:.3f}\ttr_F1" {f1_aug_c_w_tr:.3f}\tts_F1" {f1_aug_c_w_ts:.3f}')

Model -DA using  Cvec:		tr_acc: 1.000	val_acc: 0.928	tr_F1" 1.000	ts_F1" 0.929
Model -DA using  Cvec-w:	tr_acc: 1.000	val_acc: 0.897	tr_F1" 1.000	ts_F1" 0.901


In [12]:
print("Best Model:")
print(f'Model -DA using  Cvec-w:\
      tr_acc: {sc_aug_c_tr:.3f}\
      val_acc: {sc_aug_c_ts:.3f}\
      tr_F1" {f1_aug_c_tr:.3f}\
      ts_F1" {f1_aug_c_ts:.3f}\
      ROC_AUC: {roc_auc_score(y_dev, cl_aug_c_w.predict(x_dev_c))}')

Best Model:
Model -DA using  Cvec-w:      tr_acc: 1.000      val_acc: 0.928      tr_F1" 1.000      ts_F1" 0.929      ROC_AUC: 0.897


# 4. Predict

### Use full dataset

In [13]:
df_full = pd.concat([df1, df2]).sample(frac=1).reset_index(drop=True)
txt_full = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in df_full['text']]
# Full vect
Cvec_f = CountVectorizer(ngram_range=(8,9))
x_full = Cvec_f.fit_transform(txt_full)
y_full = df_full['label']
# Full model
cl_aug_c_w = RidgeClassifier(alpha=1.0, solver="sparse_cg").fit(x_full, y_full)  

In [14]:
test = pd.read_json('../Data/test_data.json', lines=True)['text']
test = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in test]
test_c = Cvec_f.transform(test)

preds = cl_aug_c_w.predict(test_c)

test_df = pd.DataFrame(columns = ['id', 'value'])
for idx, v in enumerate(preds):
    test_df.loc[idx] = [idx, preds[idx]]
test_df.to_csv('../Data/predictions_ridge.csv', index=False)