In [55]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [56]:
# Preprocess the data
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [57]:
file_path1="C:\\Users\\ganya\\OneDrive\\Documents\\IIT GANYA\\SPRING 2023\\OSNA\\Project -2\\prepro_train.csv"
train_df = pd.read_csv(file_path1)
file_path2="C:\\Users\\ganya\\OneDrive\\Documents\\IIT GANYA\\SPRING 2023\\OSNA\\Project -2\\prepro_test.csv"
test_df = pd.read_csv(file_path2)

In [58]:
train_df['title1_en']=train_df['title1_en'].apply(str)
train_df['title2_en']=train_df['title2_en'].apply(str)
test_df['title1_en']=test_df['title1_en'].apply(str)
test_df['title2_en']=test_df['title2_en'].apply(str)

In [59]:
# initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# function to perform lemmatization on a sentence
def lemmatize_sentence(sentence):
    # tokenize the sentence into words
    token_words = nltk.word_tokenize(sentence)
    # lemmatize each word in the sentence
    lemmatized_sentence = ' '.join([lemmatizer.lemmatize(w) for w in token_words])
    return lemmatized_sentence

In [60]:
import nltk

#nltk.download('punkt')

#nltk.download('omw-1.4')

train_df['title1_en'] = train_df['title1_en'].apply(lemmatize_sentence)
train_df['title2_en'] = train_df['title2_en'].apply(lemmatize_sentence)
test_df['title1_en'] = test_df['title1_en'].apply(lemmatize_sentence)
test_df['title2_en'] = test_df['title2_en'].apply(lemmatize_sentence)

In [61]:
# Split the data into training and validation sets
train_texts, valid_texts, train_labels, valid_labels = train_test_split(train_df[['title1_en', 'title2_en']], train_df['label'], test_size=0.2)

In [62]:
# Convert the text data to TF-IDF feature vectors
tfidf = TfidfVectorizer(stop_words='english')
train_features = tfidf.fit_transform(train_texts['title1_en'] + ' ' + train_texts['title2_en'])
valid_features = tfidf.transform(valid_texts['title1_en'] + ' ' + valid_texts['title2_en'])

In [63]:
# Train a logistic regression model on the training data
clf = LogisticRegression(max_iter=10000)
clf.fit(train_features, train_labels)

In [64]:
# Evaluate the model on the validation data and print the classification report
valid_preds = clf.predict(valid_features)
print(classification_report(valid_labels, valid_preds))

              precision    recall  f1-score   support

      agreed       0.74      0.61      0.67     14975
   disagreed       0.73      0.18      0.29      1349
   unrelated       0.82      0.91      0.86     34965

    accuracy                           0.80     51289
   macro avg       0.76      0.57      0.61     51289
weighted avg       0.80      0.80      0.79     51289



In [65]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Calculate the confusion matrix on the validation data
cm = confusion_matrix(valid_labels, valid_preds)

# Calculate the accuracy score on the validation data
acc = accuracy_score(valid_labels, valid_preds)

print('Confusion Matrix:\n', cm)
print('Accuracy Score:', acc)

Confusion Matrix:
 [[ 9127    11  5837]
 [   49   244  1056]
 [ 3139    79 31747]]
Accuracy Score: 0.8016923706837723


In [69]:
# Make predictions on validation data
valid_preds = clf.predict(valid_features)

In [71]:
# Make predictions on test data
test_features = tfidf.transform(test_df['title1_en'] + ' ' + test_df['title2_en'])
test_preds = clf.predict(test_features)

In [72]:
# Save predictions to a CSV file
submission_df = pd.DataFrame({'id': test_df['id'], 'label': test_preds})
submission_df.to_csv('submission.csv', index=False)