In [64]:
import pandas as pd
import csv
from sklearn.preprocessing import StandardScaler

In [65]:
df_train = pd.read_csv('train_with_label.txt',sep='\t',header=None,quoting=csv.QUOTE_NONE)
df_test = pd.read_csv('dev_with_label.txt', sep='\t',header=None,quoting=csv.QUOTE_NONE)
df_test_nolabel = pd.read_csv('test_without_label.txt', sep='\t',header=None,quoting=csv.QUOTE_NONE)

In [66]:
print("Training datatset shape: " + str(df_train.shape))
print("Test datatset shape: " + str(df_test.shape))
print("Test no label datatset shape: " + str(df_test_nolabel.shape))

Training datatset shape: (7801, 4)
Test datatset shape: (4000, 4)
Test no label datatset shape: (4000, 3)


In [67]:
#setting column names for test and training set
df_train.columns = ["instance_id","sent1","sent2","gold_score"]
df_test.columns = ["instance_id","sent1","sent2","gold_score"]
df_test_nolabel.columns = ["instance_id","sent1","sent2"]

In [68]:
#check for null values and eliminate them
print(df_train.isnull().sum())
print(df_test.isnull().sum())
print(df_test_nolabel.isnull().sum())

instance_id    0
sent1          0
sent2          0
gold_score     0
dtype: int64
instance_id    0
sent1          0
sent2          0
gold_score     0
dtype: int64
instance_id    0
sent1          0
sent2          0
dtype: int64


In [69]:
#possible target names 
print("Train data target names: {}".format(df_train["gold_score"].unique()))
print("Test data target names: {}".format(df_test["gold_score"].unique()))

Train data target names: [0 1]
Test data target names: [0 1]


In [70]:
print(df_train.dtypes)
print(df_test.dtypes)
print(df_test_nolabel.dtypes)

instance_id    object
sent1          object
sent2          object
gold_score      int64
dtype: object
instance_id    object
sent1          object
sent2          object
gold_score      int64
dtype: object
instance_id    object
sent1          object
sent2          object
dtype: object


In [71]:
#Training dataset
df_train['sent1']=df_train['sent1'].astype('string')
df_train['sent2']=df_train['sent2'].astype('string')
df_train['sent1'] = df_train['sent1'].str.lower()
df_train['sent2'] = df_train['sent2'].str.lower()
#Test dataset
df_test['sent1']=df_test['sent1'].astype('string')
df_test['sent2']=df_test['sent2'].astype('string')
df_test['sent1'] = df_test['sent1'].str.lower()
df_test['sent2'] = df_test['sent2'].str.lower()
#Test with no label
df_test_nolabel['sent1']=df_test_nolabel['sent1'].astype('string')
df_test_nolabel['sent2']=df_test_nolabel['sent2'].astype('string')
df_test_nolabel['sent1'] = df_test_nolabel['sent1'].str.lower()
df_test_nolabel['sent2'] = df_test_nolabel['sent2'].str.lower()

In [72]:
print(df_train.dtypes)
print(df_test.dtypes)
print(df_test_nolabel.dtypes)

instance_id    object
sent1          string
sent2          string
gold_score      int64
dtype: object
instance_id    object
sent1          string
sent2          string
gold_score      int64
dtype: object
instance_id    object
sent1          string
sent2          string
dtype: object


In [73]:
#Training
df_train['sent1'] = df_train['sent1'].str.replace(",","")
df_train['sent2'] = df_train['sent2'].str.replace(",","")
#Testing
df_test['sent1'] = df_test['sent1'].str.replace(",","")
df_test['sent2'] = df_test['sent2'].str.replace(",","")
#Testing with no label
df_test_nolabel['sent1'] = df_test_nolabel['sent1'].str.replace(",","")
df_test_nolabel['sent2'] = df_test_nolabel['sent2'].str.replace(",","")

## Feature1: Word count difference

In [74]:
#Training
df_train['sent1_tok'] = df_train['sent1'].str.split()
df_train['sent2_tok'] = df_train['sent2'].str.split()
#Testing
df_test['sent1_tok'] = df_test['sent1'].str.split()
df_test['sent2_tok'] = df_test['sent2'].str.split()
#Testing with no label
df_test_nolabel['sent1_tok'] = df_test_nolabel['sent1'].str.split()
df_test_nolabel['sent2_tok'] = df_test_nolabel['sent2'].str.split()

In [75]:
#Training
df_train['sent1l']=df_train['sent1_tok'].apply(lambda x: len(x))
df_train['sent2l']=df_train['sent2_tok'].apply(lambda x: len(x))
#Testing
df_test['sent1l']=df_test['sent1_tok'].apply(lambda x: len(x))
df_test['sent2l']=df_test['sent2_tok'].apply(lambda x: len(x))
#Testing with no label
df_test_nolabel['sent1l']=df_test_nolabel['sent1_tok'].apply(lambda x: len(x))
df_test_nolabel['sent2l']=df_test_nolabel['sent2_tok'].apply(lambda x: len(x))

In [76]:
df_train = df_train.assign(wcd=lambda x: abs((x['sent1l']-x['sent2l'])))
df_test = df_test.assign(wcd=lambda x: abs((x['sent1l']-x['sent2l'])))
df_test_nolabel = df_test_nolabel.assign(wcd=lambda x: abs((x['sent1l']-x['sent2l'])))

## Feature 2: Bleu Score (1,2 and 3)

In [77]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk import word_tokenize
import string

In [78]:
df_train['bleu_score'] = df_train.apply(lambda row: sentence_bleu(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation)),smoothing_function=SmoothingFunction().method4), axis = 1)
df_test['bleu_score'] = df_test.apply(lambda row: sentence_bleu(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation)),smoothing_function=SmoothingFunction().method4),axis = 1)
df_test_nolabel['bleu_score'] = df_test_nolabel.apply(lambda row: sentence_bleu(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation)),smoothing_function=SmoothingFunction().method4),axis = 1)

## Feature 3: Fuzzy ratio

In [79]:
from fuzzywuzzy import fuzz
import string

In [80]:
df_train['fuzz_ratio'] = df_train.apply(lambda row: fuzz.ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test['fuzz_ratio'] = df_test.apply(lambda row: fuzz.ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test_nolabel['fuzz_ratio'] = df_test_nolabel.apply(lambda row: fuzz.ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)


In [81]:
df_train['fuzz_token_sort_ratio'] = df_train.apply(lambda row: fuzz.token_sort_ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test['fuzz_token_sort_ratio'] = df_test.apply(lambda row: fuzz.token_sort_ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test_nolabel['fuzz_token_sort_ratio'] = df_test_nolabel.apply(lambda row: fuzz.token_sort_ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)

## Feature 4: Levenshtein distance

In [82]:
from Levenshtein import distance as lev

In [83]:
df_train['lev_dist'] = df_train.apply(lambda row: lev(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test['lev_dist'] = df_test.apply(lambda row: lev(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test_nolabel['lev_dist'] = df_test_nolabel.apply(lambda row: lev(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)

## Feature 5: NIST score (1,2, and 3)

In [89]:
from nltk.translate.nist_score import sentence_nist

In [None]:
df_train['nist_score'] = df_train.apply(lambda row: sentence_nist(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test['nist_score'] = df_test.apply(lambda row: sentence_nist(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))),axis = 1)
df_test_nolabel['nist_score'] = df_test_nolabel.apply(lambda row: sentence_nist(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))),axis = 1)

In [85]:
print(df_train.dtypes)
print(df_test.dtypes)
print(df_test_nolabel.dtypes)

instance_id               object
sent1                     string
sent2                     string
gold_score                 int64
sent1_tok                 object
sent2_tok                 object
sent1l                     int64
sent2l                     int64
wcd                        int64
bleu_score               float64
fuzz_ratio                 int64
fuzz_token_sort_ratio      int64
lev_dist                   int64
dtype: object
instance_id               object
sent1                     string
sent2                     string
gold_score                 int64
sent1_tok                 object
sent2_tok                 object
sent1l                     int64
sent2l                     int64
wcd                        int64
bleu_score               float64
fuzz_ratio                 int64
fuzz_token_sort_ratio      int64
lev_dist                   int64
dtype: object
instance_id               object
sent1                     string
sent2                     string
sent1_tok      

In [88]:
X_train = df_train.drop(columns=['instance_id','sent1','sent2','gold_score','sent1l','sent2l']).values
X_test = df_test.drop(columns=['instance_id','sent1','sent2','gold_score','sent1l','sent2l']).values
y_train = df_train['gold_score'].values
y_test =df_test['gold_score'].values
X_test_nolabel = df_test_nolabel.drop(columns=['instance_id','sent1','sent2','sent1l','sent2l']).values

normalizer = StandardScaler()
# X_train_val = X_train_val.reshape(-1,1)
X_test = X_test.reshape(-1,1)
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)
X_test_nolabel = normalizer.transform(X_test_nolabel)
print("train_val: {}, test: {}, test_nolabel: {}".format(X_train.shape, X_test.shape, X_test_nolabel.shape))

ValueError: setting an array element with a sequence.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [None]:
clfmlp = MLPClassifier(hidden_layer_sizes=(6,5),
                    random_state=5,
                    verbose=True,
                    learning_rate_init=0.01)

clfmlp.fit(X_train, y_train)
y_test_pred = clfmlp.predict(X_test)
f1 = f1_score(y_test, y_test_pred)
acc = accuracy_score(y_test, y_test_pred)
print("F1 score: {:.3f}, Accuracy: {:.3f}".format(f1,acc))

## Prediction for test dataset

In [None]:
y_test_pred = clfmp.predict(X_test_nolabel)

In [None]:
iterations = df_test_nolabel['instance_id'].to_numpy()
# print(dt)
print(len(iterations))

In [None]:
# print(y_test_pred)
print(len(y_test_pred))

In [None]:
file = open('AbuHasnatHasib_test_result.txt', 'w') #write to file
count = 0
for i in range(0,len(iterations)):
    file.write(str(iterations[i]) + "\t" + str(y_test_pred[i]) + "\n")
    count+=1
print("Total number of test instances for the test result is : " + str(count))

In [None]:
file.close() #close file