In [23]:
import pandas as pd
import csv
from sklearn.preprocessing import StandardScaler

In [24]:
df_train = pd.read_csv('train_with_label.txt',sep='\t',header=None,quoting=csv.QUOTE_NONE)
df_test = pd.read_csv('dev_with_label.txt', sep='\t',header=None,quoting=csv.QUOTE_NONE)
df_test_nolabel = pd.read_csv('test_without_label.txt', sep='\t',header=None,quoting=csv.QUOTE_NONE)

In [25]:
print("Training datatset shape: " + str(df_train.shape))
print("Test datatset shape: " + str(df_test.shape))
print("Test no label datatset shape: " + str(df_test_nolabel.shape))

Training datatset shape: (7801, 4)
Test datatset shape: (4000, 4)
Test no label datatset shape: (4000, 3)


In [26]:
#setting column names for test and training set
df_train.columns = ["instance_id","sent1","sent2","gold_score"]
df_test.columns = ["instance_id","sent1","sent2","gold_score"]
df_test_nolabel.columns = ["instance_id","sent1","sent2"]

In [27]:
#check for null values and eliminate them
print(df_train.isnull().sum())
print(df_test.isnull().sum())
print(df_test_nolabel.isnull().sum())

instance_id    0
sent1          0
sent2          0
gold_score     0
dtype: int64
instance_id    0
sent1          0
sent2          0
gold_score     0
dtype: int64
instance_id    0
sent1          0
sent2          0
dtype: int64


In [28]:
#possible target names 
print("Train data target names: {}".format(df_train["gold_score"].unique()))
print("Test data target names: {}".format(df_test["gold_score"].unique()))

Train data target names: [0 1]
Test data target names: [0 1]


In [29]:
print(df_train.dtypes)
print(df_test.dtypes)
print(df_test_nolabel.dtypes)

instance_id    object
sent1          object
sent2          object
gold_score      int64
dtype: object
instance_id    object
sent1          object
sent2          object
gold_score      int64
dtype: object
instance_id    object
sent1          object
sent2          object
dtype: object


In [30]:
#Training dataset
df_train['sent1']=df_train['sent1'].astype('string')
df_train['sent2']=df_train['sent2'].astype('string')
df_train['sent1'] = df_train['sent1'].str.lower()
df_train['sent2'] = df_train['sent2'].str.lower()
#Test dataset
df_test['sent1']=df_test['sent1'].astype('string')
df_test['sent2']=df_test['sent2'].astype('string')
df_test['sent1'] = df_test['sent1'].str.lower()
df_test['sent2'] = df_test['sent2'].str.lower()
#Test with no label
df_test_nolabel['sent1']=df_test_nolabel['sent1'].astype('string')
df_test_nolabel['sent2']=df_test_nolabel['sent2'].astype('string')
df_test_nolabel['sent1'] = df_test_nolabel['sent1'].str.lower()
df_test_nolabel['sent2'] = df_test_nolabel['sent2'].str.lower()

In [31]:
print(df_train.dtypes)
print(df_test.dtypes)
print(df_test_nolabel.dtypes)

instance_id    object
sent1          string
sent2          string
gold_score      int64
dtype: object
instance_id    object
sent1          string
sent2          string
gold_score      int64
dtype: object
instance_id    object
sent1          string
sent2          string
dtype: object


In [32]:
#Training
df_train['sent1'] = df_train['sent1'].str.replace(",","")
df_train['sent2'] = df_train['sent2'].str.replace(",","")
#Testing
df_test['sent1'] = df_test['sent1'].str.replace(",","")
df_test['sent2'] = df_test['sent2'].str.replace(",","")
#Testing with no label
df_test_nolabel['sent1'] = df_test_nolabel['sent1'].str.replace(",","")
df_test_nolabel['sent2'] = df_test_nolabel['sent2'].str.replace(",","")

## Feature1: Word count difference

In [33]:
#Training
df_train['sent1l'] = df_train['sent1'].str.split()
df_train['sent2l'] = df_train['sent2'].str.split()
#Testing
df_test['sent1l'] = df_test['sent1'].str.split()
df_test['sent2l'] = df_test['sent2'].str.split()
#Testing with no label
df_test_nolabel['sent1l'] = df_test_nolabel['sent1'].str.split()
df_test_nolabel['sent2l'] = df_test_nolabel['sent2'].str.split()

In [34]:
#Training
df_train['sent1l']=df_train['sent1l'].apply(lambda x: len(x))
df_train['sent2l']=df_train['sent2l'].apply(lambda x: len(x))
#Testing
df_test['sent1l']=df_test['sent1l'].apply(lambda x: len(x))
df_test['sent2l']=df_test['sent2l'].apply(lambda x: len(x))
#Testing with no label
df_test_nolabel['sent1l']=df_test_nolabel['sent1'].apply(lambda x: len(x))
df_test_nolabel['sent2l']=df_test_nolabel['sent2'].apply(lambda x: len(x))

In [35]:
df_train = df_train.assign(wcd=lambda x: abs((x['sent1l']-x['sent2l'])))
df_test = df_test.assign(wcd=lambda x: abs((x['sent1l']-x['sent2l'])))
df_test_nolabel = df_test_nolabel.assign(wcd=lambda x: abs((x['sent1l']-x['sent2l'])))

## Feature 2: Fuzzy ratio

In [36]:
from fuzzywuzzy import fuzz
import string

In [37]:
df_train['fuzz_ratio'] = df_train.apply(lambda row: fuzz.ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test['fuzz_ratio'] = df_test.apply(lambda row: fuzz.ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test_nolabel['fuzz_ratio'] = df_test_nolabel.apply(lambda row: fuzz.ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)


In [38]:
df_train['fuzz_token_sort_ratio'] = df_train.apply(lambda row: fuzz.token_sort_ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test['fuzz_token_sort_ratio'] = df_test.apply(lambda row: fuzz.token_sort_ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test_nolabel['fuzz_token_sort_ratio'] = df_test_nolabel.apply(lambda row: fuzz.token_sort_ratio(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)

## Feature 3: Levenshtein distance

In [39]:
from Levenshtein import distance as lev

In [40]:
df_train['lev_dist'] = df_train.apply(lambda row: lev(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test['lev_dist'] = df_test.apply(lambda row: lev(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test_nolabel['lev_dist'] = df_test_nolabel.apply(lambda row: lev(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)

## Feature 4: Bleu Score 

In [52]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
import string

In [53]:
df_train['bleu_score'] = df_train.apply(lambda row: sentence_bleu(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation)),smoothing_function=SmoothingFunction().method4), axis = 1)
df_test['bleu_score'] = df_test.apply(lambda row: sentence_bleu(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation)),smoothing_function=SmoothingFunction().method4),axis = 1)
df_test_nolabel['bleu_score'] = df_test_nolabel.apply(lambda row: sentence_bleu(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation)),smoothing_function=SmoothingFunction().method4),axis = 1)

## Feature 5: NIST score

In [54]:
from nltk.translate.nist_score import sentence_nist

In [55]:
df_train['nist_score'] = df_train.apply(lambda row: sentence_nist(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))), axis = 1)
df_test['nist_score'] = df_test.apply(lambda row: sentence_nist(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))),axis = 1)
df_test_nolabel['nist_score'] = df_test_nolabel.apply(lambda row: sentence_nist(row['sent1'].translate(str.maketrans('', '', string.punctuation)), row['sent2'].translate(str.maketrans('', '', string.punctuation))),axis = 1)

In [57]:
print(df_train.dtypes)
print(df_test.dtypes)
print(df_test_nolabel.dtypes)

instance_id               object
sent1                     string
sent2                     string
gold_score                 int64
sent1l                     int64
sent2l                     int64
wcd                        int64
fuzz_ratio                 int64
fuzz_token_sort_ratio      int64
lev_dist                   int64
bleu_score               float64
nist_score               float64
dtype: object
instance_id               object
sent1                     string
sent2                     string
gold_score                 int64
sent1l                     int64
sent2l                     int64
wcd                        int64
fuzz_ratio                 int64
fuzz_token_sort_ratio      int64
lev_dist                   int64
bleu_score               float64
nist_score               float64
dtype: object
instance_id               object
sent1                     string
sent2                     string
sent1l                     int64
sent2l                     int64
wcd            

In [58]:
X_train = df_train.drop(columns=['instance_id','sent1','sent2','gold_score','sent1l','sent2l']).values
X_test = df_test.drop(columns=['instance_id','sent1','sent2','gold_score','sent1l','sent2l']).values
y_train = df_train['gold_score'].values
y_test =df_test['gold_score'].values
X_test_nolabel = df_test_nolabel.drop(columns=['instance_id','sent1','sent2','sent1l','sent2l']).values

normalizer = StandardScaler()
# X_train_val = X_train_val.reshape(-1,1)
# X_test = X_test.reshape(-1,1)
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)
X_test_nolabel = normalizer.transform(X_test_nolabel)
print("train_val: {}, test: {}, test_nolabel: {}".format(X_train.shape, X_test.shape, X_test_nolabel.shape))

train_val: (7801, 6), test: (4000, 6), test_nolabel: (4000, 6)


## MLP

In [59]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [65]:
clfmlp = MLPClassifier(hidden_layer_sizes=(8,7),
                    random_state=5,
                    verbose=False,
                    learning_rate_init=0.01)

clfmlp.fit(X_train, y_train)
y_test_pred = clfmlp.predict(X_test)
f1 = f1_score(y_test, y_test_pred)
acc = accuracy_score(y_test, y_test_pred)
print("F1 score: {:.3f}, Accuracy: {:.3f}".format(f1,acc))

F1 score: 0.798, Accuracy: 0.893


## Prediction for test dataset

In [66]:
y_test_pred = clfmlp.predict(X_test_nolabel)

In [67]:
iterations = df_test_nolabel['instance_id'].to_numpy()
# print(dt)
print(len(iterations))

4000


In [68]:
# print(y_test_pred)
print(len(y_test_pred))

4000


In [69]:
file = open('AbuHasnatHasib_test_result.txt', 'w') #write to file
count = 0
for i in range(0,len(iterations)):
    file.write(str(iterations[i]) + "\t" + str(y_test_pred[i]) + "\n")
    count+=1
print("Total number of test instances for the test result is : " + str(count))

Total number of test instances for the test result is : 4000


In [70]:
file.close() #close file