Muneel Haider
i21-0640

NLP - Assignment 2

In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter

In [None]:
# Load the annotated XSum summaries and discard every row that has a missing value.
dataset = pd.read_csv(r'factuality_annotations_xsum_summaries.csv').dropna()

# Encode the label as an integer: 'yes' becomes 1, any other value becomes 0.
dataset['is_factual'] = dataset['is_factual'].map(lambda label: int(label == 'yes'))

In [None]:
# Pre-processing
def clean_text(text):
    """Normalise *text* for tokenisation.

    Every non-word character (anything other than letters, digits and
    underscore) becomes a space, each run of whitespace collapses to a single
    space, and the result is lower-cased. Leading/trailing spaces are not
    stripped.
    """
    without_symbols = re.sub(r'\W', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.lower()

# Replace each summary with its cleaned, lower-cased form.
dataset['summary'] = dataset['summary'].map(clean_text)

def term_frequency(text):
    """Return the relative frequency of each whitespace-separated token in *text*.

    The result is a ``Counter`` mapping token -> (occurrences / total tokens).
    An empty or whitespace-only string yields an empty ``Counter``.
    """
    tokens = text.split()
    n_tokens = len(tokens)
    return Counter({
        token: occurrences / n_tokens
        for token, occurrences in Counter(tokens).items()
    })

def inverse_document_frequency(corpus):
    """Return ``{word: log(N / df)}`` for every word in *corpus*.

    *corpus* is an iterable of already-cleaned strings, ``N`` is the number of
    documents and ``df`` is the number of documents containing the word (a
    word repeated inside one document is counted once). No smoothing is
    applied, so a word present in every document gets an IDF of 0.
    """
    n_documents = len(corpus)

    # Document frequency: each document contributes its distinct words once.
    document_frequency = Counter(
        word for document in corpus for word in set(document.split())
    )

    return {
        word: np.log(n_documents / seen_in)
        for word, seen_in in document_frequency.items()
    }

def compute_tf_idf(tf, idf):
    """Combine one document's term frequencies with corpus-wide IDF values.

    Returns ``{word: tf[word] * idf[word]}`` for every word in *tf*; a word
    missing from *idf* is treated as having an IDF of 0.
    """
    return {word: frequency * idf.get(word, 0) for word, frequency in tf.items()}

# Corpus-wide IDF table, computed once over all cleaned summaries.
corpus = list(dataset['summary'])
idf_values = inverse_document_frequency(corpus)

def text_to_vector(text, vocab_dict, idf_values):
    """Return the dense TF-IDF vector of *text*.

    Args:
        text: Cleaned document string.
        vocab_dict: Mapping word -> column index; its size is the vector length.
        idf_values: Mapping word -> IDF weight for the corpus.

    Returns:
        A 1-D float array of length ``len(vocab_dict)``. Positions of words
        that do not occur in *text* are 0; words of *text* that are not in
        *vocab_dict* are ignored.
    """
    tf = term_frequency(text)
    tf_idf = compute_tf_idf(tf, idf_values)
    vector = np.zeros(len(vocab_dict))

    # Visit only the words present in this document instead of scanning the
    # whole vocabulary: same result, but the Python-level loop is proportional
    # to the document length rather than the vocabulary size.
    for word, weight in tf_idf.items():
        idx = vocab_dict.get(word)
        if idx is not None:
            vector[idx] = weight

    return vector

# Collect every distinct token that appears in the cleaned summaries.
vocabulary = set()
for summary in dataset['summary']:
    vocabulary.update(summary.split())

# Number the words in sorted order. Iteration order of a set of strings
# changes between interpreter runs (string hashing is randomised), so
# enumerating the raw set would assign different feature columns on every
# run and make the feature matrix non-reproducible.
vocab_dict = {word: i for i, word in enumerate(sorted(vocabulary))}

# Dense (documents x vocabulary) TF-IDF matrix and the matching 0/1 labels.
X = np.array([text_to_vector(summary, vocab_dict, idf_values) for summary in dataset['summary']])
y = dataset['is_factual'].values

def split_data(X, y, test_size=0.2, random_seed=42):
    """Shuffle the rows and split them into train and test parts.

    Seeds NumPy's global random generator with *random_seed*, shuffles the
    row positions, and assigns the first ``int(n * (1 - test_size))`` of them
    to the training set and the rest to the test set.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    n_rows = X.shape[0]

    np.random.seed(random_seed)
    order = np.arange(n_rows)
    np.random.shuffle(order)

    cut = int(n_rows * (1 - test_size))
    train_rows, test_rows = order[:cut], order[cut:]

    return X[train_rows], X[test_rows], y[train_rows], y[test_rows]

# Hold out part of the data for testing, using split_data's defaults
# (test_size=0.2, random_seed=42).
X_train, X_test, y_train, y_test = split_data(X, y)

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Args:
        learning_rate: Step size applied to each gradient update.
        epochs: Number of full passes over the training data.
        class_weights: Optional mapping label -> weight. Each sample's
            contribution to the gradient is multiplied by the weight of its
            label; labels missing from the mapping (or a missing/empty
            mapping) get weight 1.
    """

    def __init__(self, learning_rate=0.01, epochs=1000, class_weights=None):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None  # set by fit(): one coefficient per feature
        self.bias = 0
        self.class_weights = class_weights

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Fit the weights and bias to ``X`` (samples x features) and labels ``y``.

        Training always restarts from zero weights and zero bias, so calling
        ``fit`` again on the same data reproduces the same model.
        """
        num_samples, num_features = X.shape
        y = np.asarray(y)

        self.weights = np.zeros(num_features)
        # Reset the bias together with the weights; otherwise a refit would
        # start from the previous run's bias.
        self.bias = 0.0

        # Per-sample weights depend only on the labels, so build them once
        # instead of looking them up for every sample in every epoch.
        if self.class_weights:
            sample_weights = np.array(
                [self.class_weights.get(label, 1) for label in y], dtype=float
            )
        else:
            sample_weights = np.ones(num_samples)

        for _ in range(self.epochs):
            linear_model = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(linear_model)

            # Weighted residuals; the gradient of the weighted log-loss is
            # their feature-wise (dw) and plain (db) mean.
            weighted_error = sample_weights * (y_pred - y)
            dw = np.dot(X.T, weighted_error) / num_samples
            db = weighted_error.sum() / num_samples

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return an integer array of 0/1 labels (1 where probability > 0.5)."""
        linear_model = np.dot(X, self.weights) + self.bias
        y_prob = self.sigmoid(linear_model)
        return (y_prob > 0.5).astype(int)

# Count training examples per label; index 0 is the non-factual class and
# index 1 the factual class.
class_counts = np.bincount(y_train)
# Scale class 1 by the ratio of class-0 to class-1 training examples, so both
# classes carry the same total weight in the gradient.
class_weights = {0: 1.0, 1: class_counts[0] / class_counts[1]}

# Train the class-weighted logistic regression on the training split.
model = LogisticRegression(learning_rate=0.01, epochs=1000, class_weights=class_weights)
model.fit(X_train, y_train)

# Hard 0/1 predictions for the held-out test split.
y_pred = model.predict(X_test)

def evaluate_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Args:
        y_true: Ground-truth labels (array-like of 0/1).
        y_pred: Predicted labels (array-like of 0/1), same length as *y_true*.

    Returns:
        ``(accuracy, precision, recall, f1_score)``. A metric whose
        denominator is zero is reported as 0; empty input yields all zeros.
    """
    # Coerce to arrays: with plain lists ``y_true == 1`` is a single False
    # rather than an element-wise mask, which would silently zero every count.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Nothing to score; avoid the 0/0 accuracy.
    if y_true.size == 0:
        return 0, 0, 0, 0

    true_pos = np.sum((y_true == 1) & (y_pred == 1))
    true_neg = np.sum((y_true == 0) & (y_pred == 0))
    false_pos = np.sum((y_true == 0) & (y_pred == 1))
    false_neg = np.sum((y_true == 1) & (y_pred == 0))

    accuracy = (true_pos + true_neg) / len(y_true)

    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) != 0 else 0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) != 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    return accuracy, precision, recall, f1_score

# Score the test-split predictions and report each metric to three decimals.
accuracy, precision, recall, f1 = evaluate_metrics(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

def confusion_matrix(y_true, y_pred):
    """Print the 2x2 confusion matrix for binary 0/1 labels.

    Rows are the actual class and columns the predicted class, so the layout
    is ``[[TN, FP], [FN, TP]]``. Nothing is returned.
    """
    actual_neg, actual_pos = (y_true == 0), (y_true == 1)
    said_neg, said_pos = (y_pred == 0), (y_pred == 1)

    grid = np.array([
        [np.sum(actual_neg & said_neg), np.sum(actual_neg & said_pos)],
        [np.sum(actual_pos & said_neg), np.sum(actual_pos & said_pos)],
    ])

    print("\nConfusion Matrix:")
    print(f"          Predicted: 0    Predicted: 1")
    print(f"Actual: 0    {grid[0][0]}            {grid[0][1]}")
    print(f"Actual: 1    {grid[1][0]}             {grid[1][1]}")

# Print the confusion matrix for the test-split predictions.
confusion_matrix(y_test, y_pred)

def k_fold_cv(X, y, k=5):
    """Estimate accuracy with k-fold cross-validation.

    The rows are split, in their existing order (no shuffling), into *k*
    contiguous folds. Each fold is used once as the validation set while a
    fresh ``LogisticRegression`` is trained on the remaining rows.

    Note: the model is built with the module-level ``class_weights``.

    Args:
        X: Feature matrix (samples x features).
        y: Label array aligned with *X*.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.

    Returns:
        ``(mean_accuracy, std_accuracy)`` over the *k* folds.

    Raises:
        ValueError: If *k* is smaller than 2 or larger than the sample count
            (either would leave a fold or the training set empty).
    """
    num_samples = len(X)
    if k < 2 or k > num_samples:
        raise ValueError(
            f"k must be between 2 and the number of samples ({num_samples}), got {k}"
        )

    fold_size = num_samples // k
    accuracy_list = []

    for i in range(k):
        start = i * fold_size
        # The last fold absorbs the len(X) % k leftover rows so that every
        # sample is used for validation exactly once.
        end = num_samples if i == k - 1 else start + fold_size

        X_val, y_val = X[start:end], y[start:end]
        X_train_fold = np.concatenate([X[:start], X[end:]], axis=0)
        y_train_fold = np.concatenate([y[:start], y[end:]], axis=0)

        model_cv = LogisticRegression(learning_rate=0.01, epochs=1000, class_weights=class_weights)
        model_cv.fit(X_train_fold, y_train_fold)
        y_pred_fold = model_cv.predict(X_val)

        fold_accuracy, _, _, _ = evaluate_metrics(y_val, y_pred_fold)
        accuracy_list.append(fold_accuracy)

    return np.mean(accuracy_list), np.std(accuracy_list)

# Cross-validate on the full feature matrix with k_fold_cv's default k=5.
average_accuracy, accuracy_std = k_fold_cv(X, y)

print(f"Average Accuracy using K-Fold cross-validation: {average_accuracy:.3f}")
print(f"Standard Deviation of Accuracy: {accuracy_std:.3f}")

# Positions of the wrong predictions *within the test split*.
misclassified_idx = np.where(y_pred != y_test)[0]

# The test split was drawn from a shuffled permutation of the rows, so a
# position inside it is not a row position in `dataset`. Replay the same
# seeded split (split_data's defaults, as used to build X_test/y_test) on the
# row positions themselves to recover which dataset rows form the test set.
_, test_row_positions, _, _ = split_data(np.arange(len(dataset)), y)
misclassified_examples = dataset.iloc[test_row_positions[misclassified_idx]]

true_labels = y_test[misclassified_idx]
pred_labels = y_pred[misclassified_idx]

print(f"\nMisclassified examples: {len(misclassified_examples)}")

# Show at most the first five misclassified summaries with their labels.
print("\nMisclassified examples:")
for i in range(min(5, len(misclassified_examples))):
    print(f"Summary: {misclassified_examples.iloc[i]['summary']}")
    print(f"Actual Label: {true_labels[i]}, Predicted Label: {pred_labels[i]}\n")