# COMP34812 Natural Language Understanding Coursework 

## Natural Language Inference (NLI)

### Method A: Unsupervised or traditional machine learning-based approaches

Load libs

In [1]:
# Import libs used in the project
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

from scipy.sparse import csr_matrix, hstack

import re
import pickle

Download pakages

In [2]:
# Download all pakages needed
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zhuge\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zhuge\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zhuge\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load train data

train_data = pd.read_csv("./training_data/NLI/train.csv")
train_data = train_data.fillna(value='')

# There is some words are not english in training data need to be removed
def remove_non_english(text):
    cleaned_text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return cleaned_text

for t in train_data:
    t = remove_non_english(t)

    # Generate TfidfVectorizer feature matrix
tfidf_vectorizer_train = TfidfVectorizer()
X_train_premise = tfidf_vectorizer_train.fit_transform(train_data['premise'])
tfidf_vectorizer_train_h = TfidfVectorizer(vocabulary=tfidf_vectorizer_train.vocabulary_)
X_train_hypothesis = tfidf_vectorizer_train_h.fit_transform(train_data['hypothesis'])

# Concatenate TF-IDF matrices for training data
X_train = hstack([X_train_premise, X_train_hypothesis])

# Labels
y_train = train_data['label']

Model training and Save model as file:

In [4]:
# Model training
model = LogisticRegression(C = 0.26)
model.fit(X_train, y_train)

# Save model to a file
with open('NLI_A_model.pkl', 'wb') as file:
    pickle.dump(model, file)
    print("model saved as 'NLI_A_model.pkl'")
    
with open('tfidf_vectorizer_train.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer_train, file)
    print("model saved as 'tfidf_vectorizer_train.pkl'")

model saved as 'NLI_A_model.pkl'
model saved as 'tfidf_vectorizer_train.pkl'


### Load model and predict

In [5]:
# Load model from file
with open('NLI_A_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

with open('tfidf_vectorizer_train.pkl', 'rb') as file:
    loaded_vectorizer = pickle.load(file)
    
val_data = pd.read_csv("./training_data/NLI/dev.csv")
val_data = val_data.fillna(value='')
    
for t in val_data:
    t = remove_non_english(t)
    
# Create TfidfVectorizer instance for validation data
tfidf_vectorizer_val = TfidfVectorizer(vocabulary=loaded_vectorizer.vocabulary_)
X_val_premise = tfidf_vectorizer_val.fit_transform(val_data['premise'])
X_val_hypothesis = tfidf_vectorizer_val.fit_transform(val_data['hypothesis'])
X_val = hstack([X_val_premise, X_val_hypothesis])

y_val = val_data['label']
    
# Predictions
y_pred_train = loaded_model.predict(X_train)
y_pred_val = loaded_model.predict(X_val)

# Evaluation
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)

print("Train Accuracy:", train_accuracy)
print("Train classification report:")
print(classification_report(y_train, y_pred_train))
print("Validation Accuracy:", val_accuracy)
print("Validation classification report:")
print(classification_report(y_val, y_pred_val))

Train Accuracy: 0.7807675178147269
Train classification report:
              precision    recall  f1-score   support

           0       0.82      0.71      0.76     13024
           1       0.76      0.85      0.80     13920

    accuracy                           0.78     26944
   macro avg       0.79      0.78      0.78     26944
weighted avg       0.78      0.78      0.78     26944

Validation Accuracy: 0.6639453762802434
Validation classification report:
              precision    recall  f1-score   support

           0       0.68      0.58      0.62      3259
           1       0.65      0.75      0.70      3478

    accuracy                           0.66      6737
   macro avg       0.67      0.66      0.66      6737
weighted avg       0.67      0.66      0.66      6737



Save model as csv file

In [6]:
# Create a DataFrame with your original data
df_train = pd.DataFrame()

# Add the predictions as a new column
df_train['predictions'] = y_pred_train

# Save the DataFrame to a CSV file
df_train.to_csv('A_train.csv', index=False)



# Create a DataFrame with your original data
df_val = pd.DataFrame()

# Add the predictions as a new column
df_val['predictions'] = y_pred_val

# Save the DataFrame to a CSV file
df_val.to_csv('A_dev.csv', index=False)