In [2]:
import pandas as pd

In [3]:
# Read data
df1 = pd.read_json('../Data/domain1_train_data.json', lines=True)
df2 = pd.read_json('../Data/domain2_train_data.json', lines=True)

# Domain 2

In [181]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Train test split
x_train, x_test, y_train, y_test = train_test_split(df2['text'], df2['label'], test_size=0.2, stratify=df2['label'])

# List to strings
text_tr = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in x_train]
text_ts = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in x_test]

# Tokens encoding to sparce matrix (Countv)
cvec = CountVectorizer()
X_tr = cvec.fit_transform(text_tr)
X_ts = cvec.transform(text_ts)

In [185]:
# Data oversampling
from imblearn.over_sampling import RandomOverSampler

# transform the dataset
oversample = RandomOverSampler()
X, y = oversample.fit_resample(X_tr, y_train)
y.value_counts()

label
0    9200
1    9200
Name: count, dtype: int64

In [188]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
cl_d1 = LogisticRegression(max_iter=100000).fit(X, y)

print(f'Train Accuracy {cl_d1.score(X, y):.3f}\t Train F1 {f1_score(y,cl_d1.predict(X)):.3f}')
print(f'Test Accuracy {cl_d1.score(X_ts, y_test):.3f}\t Test F1 {f1_score(y_test,cl_d1.predict(X_ts)):.3f}')

Train Accuracy 1.000	 Train F1 1.000
Test Accuracy 0.900	 Test F1 0.525


In [190]:
import warnings
from sklearn.model_selection import GridSearchCV 
warnings.filterwarnings('ignore')

# defining parameter range 
param_grid = [{'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
              {'penalty':['none', 'elasticnet', 'l1', 'l2']},
              {'C':[0.001, 0.01, 0.1, 1, 10, 100]}]
  
grid = GridSearchCV(LogisticRegression(), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(X, y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END ..................solver=newton-cg;, score=0.965 total time=   2.3s
[CV 2/5] END ..................solver=newton-cg;, score=0.971 total time=   1.8s
[CV 3/5] END ..................solver=newton-cg;, score=0.968 total time=   1.6s
[CV 4/5] END ..................solver=newton-cg;, score=0.971 total time=   1.4s
[CV 5/5] END ..................solver=newton-cg;, score=0.972 total time=   1.3s
[CV 1/5] END ......................solver=lbfgs;, score=0.966 total time=   1.3s
[CV 2/5] END ......................solver=lbfgs;, score=0.971 total time=   1.5s
[CV 3/5] END ......................solver=lbfgs;, score=0.968 total time=   1.5s
[CV 4/5] END ......................solver=lbfgs;, score=0.971 total time=   1.3s
[CV 5/5] END ......................solver=lbfgs;, score=0.972 total time=   1.4s
[CV 1/5] END ..................solver=liblinear;, score=0.966 total time=   2.7s
[CV 2/5] END ..................solver=liblinear;

In [191]:
from sklearn.metrics import classification_report 
grid_predictions = grid.predict(X_ts) 
  
# print classification report 
print(classification_report(y_test, grid_predictions)) 

              precision    recall  f1-score   support

           0       0.93      0.96      0.95      2300
           1       0.61      0.46      0.52       300

    accuracy                           0.90      2600
   macro avg       0.77      0.71      0.73      2600
weighted avg       0.89      0.90      0.90      2600



In [192]:
import pickle
# save model
with open('../saved models/Domain2_pred_model.pkl','wb') as f:
    pickle.dump(cl_d1,f)

# Save countVect
with open('../saved models/objs2.pkl', 'wb') as f:
    pickle.dump([cvec], f)

# Predict

In [193]:
import pickle
# -----------------------------------------------------------------
# Domain classification model

# Getting back the objects:
with open('../saved models/objs.pkl', 'rb') as f:
    cvec = pickle.load(f)

# load model
with open('../saved models/Domain_class_model.pkl', 'rb') as f:
    cl_d = pickle.load(f)
# -----------------------------------------------------------------
# Domain 1 model
# Getting back the objects:
with open('../saved models/objs1.pkl', 'rb') as f:
    tvec1 = pickle.load(f)

# load model
with open('../saved models/Domain1_pred_model.pkl', 'rb') as f:
    cl_d1 = pickle.load(f)

# -----------------------------------------------------------------
# Domain 2 model
# Getting back the objects:
with open('../saved models/objs2.pkl', 'rb') as f:
    cvec2 = pickle.load(f)

# load model
with open('../saved models/Domain2_pred_model.pkl', 'rb') as f:
    cl_d2 = pickle.load(f)
# -----------------------------------------------------------------

In [194]:
# # Test dataset
# test_data = pd.read_json('../Data/test_data.json', lines=True)
# test_data['text'] = [[t if t != 0 else 1 for t in ls] for ls in test_data['text']]

import json
with open('../Data/test_Data_unmasked.json', 'r') as f:
    test_data = json.load(f)
test_data = pd.DataFrame(pd.Series(test_data))
test_data.columns = ['text']
test_data['class'] = 0

# Predict domains
text = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in test_data['text']]
X_ = cvec[0].transform(text)
test_data['domains'] = cl_d.predict(X_)
test_data['domains'].value_counts()

domains
1    2006
0    1994
Name: count, dtype: int64

In [195]:
# Predict domain 1 class
d1_txt = test_data[test_data['domains'] == 0][['text']]
test_data.loc[test_data[test_data['domains'] == 0].index,'class'] =  pred_d1(d1_txt)
test_data.loc[test_data[test_data['domains'] == 0].index,'class'].value_counts()

class
0    1035
1     959
Name: count, dtype: int64

In [196]:
# Predict domain 2 class
d2_txt = test_data[test_data['domains'] == 1]['text']
text = [re.sub(',', '',', '.join([str(x) for x in tok])) for tok in d2_txt]
X_ = cvec2[0].transform(text)
test_data.loc[test_data[test_data['domains'] == 1].index,'class'] = cl_d2.predict(X_)
test_data.loc[test_data[test_data['domains'] == 1].index,'class'].value_counts()

class
0    1422
1     584
Name: count, dtype: int64

In [197]:
test_data.reset_index(names='id', inplace=True)
test_data[['id', 'class']].to_csv('../Data/predictions.csv', index=False)

In [156]:
test_data['class'].value_counts()

class
0    2414
1    1586
Name: count, dtype: int64

In [198]:
pd.read_csv('../Data/predictions.csv')['class'].value_counts()

class
0    2457
1    1543
Name: count, dtype: int64

In [162]:
preds = pred_d1(test_data)

In [168]:
df_class = pd.DataFrame(preds, columns=['class'])
df_class.reset_index(names='id', inplace=True)
df_class.to_csv('../Data/predictions.csv', index=False)

In [170]:
pd.Series(preds).value_counts()

0    2381
1    1619
Name: count, dtype: int64