In [7]:
%pip install numpy nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: C:\Users\yuan2\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [199]:
import os
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from nltk.stem import PorterStemmer
# Ensure that NLTK's resources are downloaded
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load the stopwords
stop_words = set(stopwords.words('english'))

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def process_document(text):
    # Normalize to lower case
    text = text.lower()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove digits and punctuation using a regular expression
    text = re.sub(r'[\d'+string.punctuation+']', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords and single-character words (mostly punctuation)
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]
    # Stemming using Porter’s algorithm
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    # Lemmatize tokens (using the pos tag as 'v' for verbs and 'n' for nouns)
    tokens = [lemmatizer.lemmatize(lemmatizer.lemmatize(word, 'v'), 'n') for word in tokens]

    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuan2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yuan2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuan2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Initialize a dictionary for document frequency
document_frequency = {}

# Iterate through all the files in the dataset directory
for filename in os.listdir('./data'):
    if filename.endswith('.txt'):
        filepath = os.path.join('./data', filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = process_document(text)

        # Update document frequency - only count each term once per document
        unique_tokens = set(tokens)
        for token in unique_tokens:
            if token in document_frequency:
                document_frequency[token] += 1
            else:
                document_frequency[token] = 1

# Sort the terms in ascending order
sorted_terms = sorted(document_frequency.items(), key=lambda x: x[0])

# Save the dictionary and document frequency to a file
with open('dictionary.txt', 'w') as f:
    for index, (term, df) in enumerate(sorted_terms, start=1):
        f.write(f"{index}\t{term}\t{df}\n")

In [177]:
import math
# Build a term index dictionary for easy lookup
term_index = {term: index for index, (term, df) in enumerate(sorted_terms, start=1)}

# Now compute the tf-idf vectors for each document
for filename in os.listdir('./data'):
    if filename.endswith('.txt'):
        filepath = os.path.join('./data', filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = process_document(text)
        
        # Compute term frequency
        tf = {}
        for token in tokens:
            if token in tf:
                tf[token] += 1
            else:
                tf[token] = 1
        
        # Create a zero vector of length equal to the number of terms
        tfidf_vector = np.zeros(len(term_index))
        
        for term, freq in tf.items():
            if term in term_index:
                tf_t = freq
                df_t = document_frequency[term]
                N = len(os.listdir('./data'))  # Assuming all files in the dataset directory are text documents
                idf_t = math.log10(N / df_t)
                tfidf_t = tf_t * idf_t
                tfidf_vector[term_index[term] - 1] = tfidf_t  # -1 because indices start from 1
        
        # Normalize the tf-idf vector to unit length
        norm = np.linalg.norm(tfidf_vector)
        if norm > 0:
            tfidf_vector_unit = tfidf_vector / norm
        else:
            tfidf_vector_unit = tfidf_vector  # avoid division by zero

        # Get non-zero entries for the sparse representation
        non_zero_entries = [(index + 1, tfidf) for index, tfidf in enumerate(tfidf_vector_unit) if tfidf > 0]

        # Save the tf-idf unit vector to a file
        doc_id = os.path.splitext(filename)[0]  # Assuming filename is 'DocID.txt'
        with open(f'./output/{doc_id}.txt', 'w') as f:
            f.write(f"{len(non_zero_entries)}\n")  # Write the number of non-zero entries
            for index, tfidf in non_zero_entries:
                f.write(f"{index}\t{tfidf:.3f}\n")  # Write the term index and tf-idf value, formatted to 3 decimal places


KeyboardInterrupt: 

In [6]:
def cosine(docx, docy):
    # Inline function to load vector
    def load(doc_id):
        with open(f'./output/{doc_id}.txt', 'r') as f:
            lines = f.readlines()
        vector = np.zeros(len(term_index))  # Ensure all vectors are of same length as term_index
        for line in lines[1:]:
            index, tfidf = line.strip().split()
            index = int(index) - 1  # Indices start from 1 in the file
            tfidf = float(tfidf)
            vector[index] = tfidf
        return vector
    
    vector_x = load(docx)
    vector_y = load(docy)
    
    # The vectors are already normalized (unit vectors), so just compute the dot product.
    cosine_similarity = np.dot(vector_x, vector_y)
    
    return cosine_similarity

# Example usage:
docx = '1'
docy = '2'
similarity = cosine(docx, docy)
print(f'Cosine similarity between {docx} and {docy}: {similarity:.3f}')

Cosine similarity between 1 and 2: 0.190


In [194]:
import os
import numpy as np
from scipy.stats import chi2_contingency
from collections import defaultdict

# Load the class labels for the training documents
train_data = {}
with open('training.txt', 'r') as f:
    for line in f:
        class_id, *doc_ids = line.strip().split()
        train_data[int(class_id)] = [int(doc_id) for doc_id in doc_ids]

# Calculate term frequencies within each class and overall
term_class_freq = defaultdict(lambda: defaultdict(int))
overall_term_freq = defaultdict(int)
class_sizes = defaultdict(int)

# Accumulate the class sizes
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        class_sizes[class_id] += 1

# Calculate term frequencies for each class and overall frequencies
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        with open(f'./data/{doc_id}.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = process_document(text)
        unique_tokens = set(tokens)
        for token in unique_tokens:
            term_class_freq[token][class_id] += 1
            overall_term_freq[token] += 1

# Calculate Likelihood Ratios for each term using chi-squared
likelihood_ratios = {}
total_docs = sum(class_sizes.values())

for term in term_class_freq:
    # Create a contingency table for each term across all classes
    table = []
    for class_id in train_data:
        # Frequency of term in class
        f_term_in_class = term_class_freq[term][class_id]
        # Frequency of term not in class
        f_term_not_in_class = overall_term_freq[term] - f_term_in_class
        # Frequency of not term in class
        f_not_term_in_class = class_sizes[class_id] - f_term_in_class
        # Frequency of not term not in class
        f_not_term_not_in_class = total_docs - class_sizes[class_id] - f_term_not_in_class

        table.append([f_term_in_class + 1, f_term_not_in_class + 1])
        table.append([f_not_term_in_class + 1, f_not_term_not_in_class + 1])

    # Calculate the chi-squared test statistic and p-value
    chi2_stat, p, dof, ex = chi2_contingency(table, correction=False)
    likelihood_ratios[term] = chi2_stat

# Sort terms by their likelihood ratios
sorted_terms_by_lr = sorted(likelihood_ratios.items(), key=lambda item: item[1], reverse=True)
print(sorted_terms_by_lr)
# Select the top 500 terms
top_terms = [term for term, lr in sorted_terms_by_lr[:500]]

[('earthquak', 176.41563840176266), ('kursk', 176.24523965520783), ('australian', 176.24523965520783), ('ivori', 176.24523965520783), ('carnahan', 176.2452396552078), ('peru', 176.2452396552078), ('alberto', 176.2452396552078), ('fujimori', 176.2452396552078), ('missouri', 166.17046012341754), ('vietnam', 166.17046012341754), ('submarin', 166.17046012341746), ('salvador', 163.08657102500516), ('convict', 161.29746729625626), ('milosev', 158.52953150220955), ('slobodan', 158.52953150220955), ('escap', 153.194936484862), ('mel', 153.194936484862), ('opposit', 153.03481551501548), ('torpedo', 152.9792423071506), ('cole', 150.15914474485447), ('guei', 150.15914474485447), ('governor', 150.15914474485447), ('hanoi', 150.15914474485447), ('sink', 150.15914474485444), ('quak', 148.01060365509755), ('pardon', 145.8440264191808), ('pope', 143.79105410530676), ('resign', 140.46924827623357), ('scandal', 140.46924827623357), ('barent', 137.4780116204486), ('match', 137.4780116204486), ('tournamen

In [200]:
import os
from collections import defaultdict
from math import log

# Assume process_document() is defined elsewhere

# Helper function to safely calculate the log of a probability
def safe_log(x):
    return log(x) if x > 0 else 0  

# Function to calculate the log likelihood ratio for given frequency counts
def log_likelihood_ratio(n11, n01, n10, n00):
    N = n11 + n01 + n10 + n00
    p = (n11 + n01) / N
    p1 = n11 / (n11 + n10) 
    p2 = n01 / (n01 + n00)

    # Compute log likelihoods using the provided formulas
    log_L1 = n11 * safe_log(p) + n10 * safe_log(1 - p) + n01 * safe_log(p) + n00 * safe_log(1 - p)
    log_L2 = n11 * safe_log(p1) + n10 * safe_log(1 - p1) + n01 * safe_log(p2) + n00 * safe_log(1 - p2)
    # Compute the log likelihood ratio
    return -2 * (log_L1 - log_L2)

# Load the class labels for the training documents
train_data = {}
with open('training.txt', 'r') as f:
    for line in f:
        class_id, *doc_ids = line.strip().split()
        train_data[int(class_id)] = [int(doc_id) for doc_id in doc_ids]

# Calculate term frequencies within each class and overall
term_class_freq = defaultdict(lambda: defaultdict(int))
overall_term_freq = defaultdict(int)
class_sizes = defaultdict(int)

# Accumulate the class sizes
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        class_sizes[class_id] += 1

# Calculate term frequencies for each class and overall frequencies
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        with open(f'./data/{doc_id}.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = process_document(text)
        unique_tokens = set(tokens)
        for token in unique_tokens:
            term_class_freq[token][class_id] += 1
            overall_term_freq[token] += 1

likelihood_ratios = defaultdict(lambda: defaultdict(int))
total_docs = sum(class_sizes.values())

for term in term_class_freq:
    # Calculate the frequencies for term absence

    for class_id, count in class_sizes.items():
        n11 = term_class_freq[term][class_id]
        n10 = class_sizes[class_id] - n11
        n01 = overall_term_freq[term] - n11
        n00 = total_docs - class_sizes[class_id] - n10

        llr_stat = log_likelihood_ratio(n11, n01, n10, n00)
        likelihood_ratios[term][class_id] = llr_stat
             
for term in likelihood_ratios:
    likelihood_ratios[term] = max(likelihood_ratios[term].values())

sorted_terms_by_lr = sorted(likelihood_ratios.items(), key=lambda item: item[1], reverse=True)
print(sorted_terms_by_lr)
top_terms = [term for term, lr in sorted_terms_by_lr[:500]]

[('kursk', 105.7638554863192), ('australian', 105.7638554863192), ('ivori', 105.7638554863192), ('carnahan', 105.7638554863192), ('peru', 105.7638554863192), ('alberto', 105.7638554863192), ('fujimori', 105.7638554863192), ('submarin', 98.44218198846085), ('missouri', 98.44218198846085), ('vietnam', 98.44218198846085), ('milosev', 93.76717005114243), ('slobodan', 93.76717005114243), ('salvador', 93.22299336302926), ('torpedo', 90.02010266282262), ('pope', 86.84063512087565), ('escap', 86.03560685299469), ('mel', 86.03560685299469), ('sink', 83.46502928598356), ('cole', 83.46502928598356), ('guei', 83.46502928598356), ('governor', 83.46502928598356), ('hanoi', 83.46502928598356), ('pardon', 81.48652207737611), ('resign', 76.42148556303377), ('scandal', 76.42148556303377), ('barent', 74.76392423713474), ('match', 74.76392423713474), ('tournament', 74.76392423713474), ('sampra', 74.76392423713474), ('peruvian', 74.76392423713474), ('vietnames', 74.76392423713474), ('seven', 72.11641072293

In [201]:
import os
import numpy as np
from collections import defaultdict


# Now, calculate the probability P(X=t_k | c) for each term in each class
# Initialize count dictionaries
term_class_counts = defaultdict(lambda: defaultdict(int))
class_total_counts = defaultdict(int)

# Calculate counts for each term in each class
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        with open(f'./data/{doc_id}.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = process_document(text)
        # Count terms only if they are in the top terms
        for token in tokens:
            if token in top_terms:
                term_class_counts[token][class_id] += 1
                class_total_counts[class_id] += 1

# Calculate probabilities with add-one smoothing
class_probabilities = defaultdict(dict)
V = len(top_terms)  # Vocabulary size after feature selection
for term in top_terms:
    for class_id in train_data:
        # Apply add-one smoothing
        term_count = term_class_counts[term][class_id]
        total_count = class_total_counts[class_id]
        class_probabilities[term][class_id] = (term_count + 1) / (total_count + V)

# Now class_probabilities contains P(X=t_k | c) for each term t_k and class c

# Use the class_probabilities to classify a new document
def classify(text, class_probabilities, class_total_counts):
    tokens = process_document(text)
    # Filter tokens based on the selected vocabulary
    tokens = [token for token in tokens if token in top_terms]
    # Initialize log probabilities to zero
    log_probs = {class_id: 0 for class_id in class_total_counts}
    for class_id in class_total_counts:
        # Start with the log probability of the class
        log_prob = np.log(class_total_counts[class_id] / sum(class_total_counts.values()))
        for token in tokens:
            # Add the log probability of the term given the class
            if token in class_probabilities:
                log_prob += np.log(class_probabilities[token][class_id])
        log_probs[class_id] = log_prob
    # Return the class with the highest log probability
    return max(log_probs, key=log_probs.get)

# Example usage:
new_doc = 'The text of a new document to classify goes here.'
predicted_class = classify(new_doc, class_probabilities, class_total_counts)
print(f'The predicted class for the new document is: {predicted_class}')

The predicted class for the new document is: 13


In [202]:
import csv

# Extract the training document IDs to exclude them from prediction
training_doc_ids = set()
for doc_ids in train_data.values():
    training_doc_ids.update(doc_ids)

# Open a CSV file to write the predictions
with open('hw3_output.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Id', 'Value'])  # Write the header

    # Classify each document that is not part of the training data
    for filename in os.listdir('./data'):
        if filename.endswith('.txt'):
            doc_id = int(os.path.splitext(filename)[0])
            # Skip documents that are in the training set
            if doc_id in training_doc_ids:
                continue
            with open(f'./data/{filename}', 'r', encoding='utf-8') as f:
                text = f.read()
            # Get the predicted class
            predicted_class = classify(text, class_probabilities, class_total_counts)
            # Write the result to the CSV file
            csvwriter.writerow([doc_id, predicted_class])

In [207]:
import csv

# Function to load a CSV file into a dictionary
def load_csv_to_dict(filename):
    results = {}
    with open(filename, mode='r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)  # Skip the header row
        for row in csvreader:
            doc_id, value = row
            results[int(doc_id)] = int(value)
    return results

# Load the two CSV files you want to compare
csv_results_1 = load_csv_to_dict('hw3_output_9811.csv')
csv_results_2 = load_csv_to_dict('hw3_output_9844.csv')

# Compare the two dictionaries and print differences
differences = {}
for doc_id in csv_results_1:
    if csv_results_1[doc_id] != csv_results_2.get(doc_id):
        differences[doc_id] = (csv_results_1[doc_id], csv_results_2.get(doc_id))

# Print out the differences
for doc_id, values in differences.items():
    print(f'Document ID {doc_id} has different classifications: {values[0]} vs {values[1]}')

# Print summary
if differences:
    print(f'There are {len(differences)} differences between the CSV files.')
else:
    print('The two CSV files are identical.')

Document ID 852 has different classifications: 13 vs 4
Document ID 878 has different classifications: 7 vs 3
Document ID 889 has different classifications: 6 vs 3
There are 3 differences between the CSV files.
