In [1]:
import os
import math
import numpy as np
import pandas as pd
from collections import defaultdict,Counter

In [None]:
import nltk
nltk.download('all')

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Task-1: Read and perform text preprocessing

In [4]:
folder_path = '/content/drive/MyDrive/CSE 419/Lab Assign-4/1000_documents'

In [5]:
from nltk.tokenize import wordpunct_tokenize
import re

In [6]:
stop_words = set(stopwords.words('english'))
documents = []
labels = []

In [7]:
for filename in os.listdir(folder_path):
  with open(os.path.join(folder_path,filename),'r') as file:
    text =file.read().lower()
    text = re.sub(r'\W+',' ',text) #remove punctuation
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    documents.append(words)
    label = filename.split('_')[0]
    labels.append(label)

In [8]:
print(len(documents))
print(len(labels))

1000
1000


In [9]:
print(documents[0])
print(labels[0])

['saab', 'build', 'cadillacs', 'sweden', 'general', 'motors', 'world', 'largest', 'car', 'maker', 'confirmed', 'build', 'new', 'medium', 'sized', 'cadillac', 'bls', 'loss', 'making', 'saab', 'factory', 'sweden', 'car', 'unveiled', 'geneva', 'motor', 'show', 'intended', 'compete', 'medium', 'sized', 'luxury', 'car', 'market', 'sold', 'us', 'said', 'gm', 'europe', 'president', 'carl', 'peter', 'forster', 'part', 'efforts', 'make', 'us', 'marque', 'appeal', 'european', 'drivers', 'car', 'first', 'cadillac', 'diesel', 'engine', 'gm', 'announcement', 'go', 'way', 'allay', 'fears', 'saab', 'factory', 'closure', 'factory', 'trollhaettan', 'centre', 'rumours', 'gm', 'planned', 'severe', 'cutbacks', 'troubled', 'european', 'operations', 'group', 'new', 'commitment', 'swedish', 'factory', 'may', 'welcomed', 'group', 'opel', 'workers', 'ruesselsheim', 'germany', 'may', 'face', 'larger', 'proportion', 'gm', 'cuts', 'neither', 'announcement', 'seen', 'unalloyed', 'good', 'news', 'sweden', 'since', 

### Task-2: Split Data, Calculate TF, IDF, and TF-IDF

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
train_docs,test_docs,train_labels,test_labels = train_test_split(documents,labels,stratify = labels,test_size = 0.3,random_state = 42)

In [12]:
def calculate_tf(docs):
  tf_list = []
  for doc in docs:
    term_counts = Counter(doc)
    doc_length = len(doc)
    tf = {term:count/doc_length for term,count in term_counts.items()}
    tf_list.append(tf)
  return tf_list

In [13]:
def calculate_idf(docs):
  N = len(docs)
  all_terms = set([term for doc in docs for term in doc])
  idf = {}
  for term in all_terms:
    doc_count = sum(1 for doc in docs if term in doc)
    idf[term] = math.log(N/(1+doc_count))
  return idf

In [14]:
# Calculate TF-IDF for each document
def calculate_tf_idf(tf_list, idf):
    tf_idf_list = []
    for tf in tf_list:
        tf_idf = {term: tf_val * idf.get(term, 0) for term, tf_val in tf.items()}  # Use idf.get(term, 0) to avoid KeyError
        tf_idf_list.append(tf_idf)
    return tf_idf_list


In [15]:
tf_train = calculate_tf(train_docs)

In [16]:
idf =calculate_idf(train_docs)

In [17]:
tf_idf_train = calculate_tf_idf(tf_train,idf)

In [18]:
tf_idf_train[0]

{'jones': 0.12321738440167893,
 'medals': 0.29949348015813965,
 'must': 0.022036691743195794,
 'go': 0.019357084311250192,
 'guilty': 0.12079184780551115,
 'world': 0.01311926168683611,
 'anti': 0.026657144743588473,
 'doping': 0.0763331638950227,
 'agency': 0.03251389722089092,
 'wada': 0.055075434811871674,
 'chief': 0.022295706193835568,
 'dick': 0.04516806861983404,
 'pound': 0.143650426726845,
 'says': 0.04235824320787671,
 'marion': 0.055075434811871674,
 'stripped': 0.14422184441864697,
 'found': 0.04034754431197705,
 'taking': 0.02626097592385836,
 'banned': 0.08795668197388254,
 'substances': 0.04026394926850372,
 'victor': 0.04397834098694127,
 'conte': 0.11015086962374335,
 'balco': 0.10433911058431343,
 'laboratories': 0.09614789627909799,
 'claims': 0.028911120009388575,
 'american': 0.023852783767848277,
 'sprinter': 0.038818486201426215,
 'regularly': 0.04026394926850372,
 'used': 0.017814026184458168,
 'drugs': 0.09256281132051287,
 'enhance': 0.048073948139548996,
 'pe

In [19]:
tf_test = calculate_tf(test_docs)

In [20]:
tf_idf_test = calculate_tf_idf(tf_test, idf)

###  Task 3.1: Calculate Centroids for each class


In [30]:
def calculate_centroids(tf_idf_train, train_labels, beta=16, gamma=4):
    unique_labels = set(train_labels)
    centroids = {}

    all_terms = set(term for doc in tf_idf_train for term in doc)
    all_terms_list = list(all_terms)

    # Standardize TF-IDF vectors for training documents
    standardized_tf_idf_train = []
    for doc in tf_idf_train:
        vector = {term: doc.get(term, 0) for term in all_terms_list}
        standardized_tf_idf_train.append(vector)

    for label in unique_labels:
        class_docs = [standardized_tf_idf_train[i] for i in range(len(train_labels)) if train_labels[i] == label]
        other_docs = [standardized_tf_idf_train[i] for i in range(len(train_labels)) if train_labels[i] != label]

        # Calculate centroid for the class
        np_class = len(class_docs)
        Nt = len(tf_idf_train)

        class_matrix = np.array([list(doc.values()) for doc in class_docs])
        other_matrix = np.array([list(doc.values()) for doc in other_docs])

        pos_sum = np.sum(class_matrix, axis=0)
        neg_sum = np.sum(other_matrix, axis=0)

        centroid = (beta / np_class) * pos_sum - (gamma / (Nt - np_class)) * neg_sum
        centroids[label] = centroid

    return centroids, all_terms_list



In [31]:
centroids, all_terms = calculate_centroids(tf_idf_train, train_labels)

### Task 3.2: Assign class using cosine similarity


In [32]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2) if norm_vec1 * norm_vec2 != 0 else 0

In [33]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [34]:
# Predict classes for test set
predicted_labels = []
for doc in tf_idf_test:
    # Standardize each test document using the stored vocabulary terms
    standardized_doc = np.array([doc.get(term, 0) for term in all_terms])
    similarities = {label: cosine_similarity(standardized_doc, centroid) for label, centroid in centroids.items()}
    predicted_labels.append(max(similarities, key=similarities.get))

In [35]:
accuracy = accuracy_score(test_labels, predicted_labels)

In [36]:
print("Accuracy:", accuracy)

Accuracy: 0.9566666666666667


### Task-3.3: Calculate the Accuracy, Precision, Recall, F-score.

In [37]:
precision, recall, fscore, _ = precision_recall_fscore_support(test_labels, predicted_labels, average=None, labels=list(centroids.keys()))


In [39]:
for idx, label in enumerate(centroids.keys()):
    print(f"\nClass '{label}' Metrics:")
    print(f"  Precision: {precision[idx]:.2f}")
    print(f"  Recall:    {recall[idx]:.2f}")
    print(f"  F-score:   {fscore[idx]:.2f}")



Class 'historical' Metrics:
  Precision: 1.00
  Recall:    1.00
  F-score:   1.00

Class 'entertainment' Metrics:
  Precision: 1.00
  Recall:    1.00
  F-score:   1.00

Class 'space' Metrics:
  Precision: 0.96
  Recall:    0.90
  F-score:   0.93

Class 'business' Metrics:
  Precision: 0.97
  Recall:    0.93
  F-score:   0.95

Class 'graphics' Metrics:
  Precision: 0.96
  Recall:    0.90
  F-score:   0.93

Class 'technologie' Metrics:
  Precision: 0.97
  Recall:    0.97
  F-score:   0.97

Class 'medical' Metrics:
  Precision: 0.88
  Recall:    0.97
  F-score:   0.92

Class 'food' Metrics:
  Precision: 0.96
  Recall:    0.90
  F-score:   0.93

Class 'politics' Metrics:
  Precision: 0.88
  Recall:    1.00
  F-score:   0.94

Class 'sport' Metrics:
  Precision: 1.00
  Recall:    1.00
  F-score:   1.00
