In [182]:
import os
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import PorterStemmer
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load the stopwords
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
digit = set(string.digits)

with open('stopwords.txt', 'r') as f:
    stop_words = set(f.read().splitlines())
def process_document(text):

    # Remove digits and punctuation
    translator = str.maketrans('', '', string.punctuation + string.digits)
    # Tokenize text
    tokens = text.translate(translator).split()
    tokens = [word.lower() for word in tokens]

    # Remove stopwords and single-character words
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]
    # # Stemming using Porterâ€™s algorithm
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]

    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yuan2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yuan2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuan2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Chi-squared

In [172]:
import os
import numpy as np
from scipy.stats import chi2_contingency
from collections import defaultdict

# Load the class labels for the training documents
train_data = {}
with open('training.txt', 'r') as f:
    for line in f:
        class_id, *doc_ids = line.strip().split()
        train_data[int(class_id)] = [int(doc_id) for doc_id in doc_ids]

# Calculate term frequencies within each class and overall
term_class_freq = defaultdict(lambda: defaultdict(int))
overall_term_freq = defaultdict(int)
class_sizes = defaultdict(int)

# Accumulate the class sizes
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        class_sizes[class_id] += 1

# Calculate term frequencies for each class and overall frequencies
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        with open(f'./data/{doc_id}.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = process_document(text)
        unique_tokens = set(tokens)
        for token in unique_tokens:
            term_class_freq[token][class_id] += 1
            overall_term_freq[token] += 1

# Calculate Likelihood Ratios for each term using chi-squared
likelihood_ratios = {}
total_docs = sum(class_sizes.values())

for term in term_class_freq:
    # Create a contingency table for each term across all classes
    table = []
    for class_id in train_data:
        # Frequency of term in class
        f_term_in_class = term_class_freq[term][class_id]
        # Frequency of term not in class
        f_term_not_in_class = overall_term_freq[term] - f_term_in_class
        # Frequency of not term in class
        f_not_term_in_class = class_sizes[class_id] - f_term_in_class
        # Frequency of not term not in class
        f_not_term_not_in_class = total_docs - class_sizes[class_id] - f_term_not_in_class

        table.append([f_term_in_class + 1, f_term_not_in_class + 1])
        table.append([f_not_term_in_class + 1, f_not_term_not_in_class + 1])

    # Calculate the chi-squared test statistic and p-value
    chi2_stat, p, dof, ex = chi2_contingency(table, correction=False)
    likelihood_ratios[term] = chi2_stat

# Sort terms by their likelihood ratios
sorted_terms_by_lr = sorted(likelihood_ratios.items(), key=lambda item: item[1], reverse=True)
print(sorted_terms_by_lr)
# Select the top 500 terms
top_terms = [term for term, lr in sorted_terms_by_lr[:500]]

[('earthquak', 176.41563840176266), ('kursk', 176.24523965520783), ('118', 176.24523965520783), ('australian', 176.24523965520783), ('ivori', 176.24523965520783), ('carnahan', 176.2452396552078), ('alberto', 176.2452396552078), ('peru', 176.2452396552078), ('fujimori', 176.2452396552078), ('missouri', 166.17046012341754), ('vietnam', 166.17046012341754), ('submarin', 166.17046012341746), ('salvador', 163.08657102500516), ('convict', 161.29746729625626), ('milosev', 158.52953150220955), ('slobodan', 158.52953150220955), ('escap', 153.194936484862), ('mel', 153.194936484862), ('opposit', 153.03481551501548), ('torpedo', 152.9792423071506), ('cole', 150.15914474485447), ('guei', 150.15914474485447), ('governor', 150.15914474485447), ('hanoi', 150.15914474485447), ('diver', 150.15914474485444), ('quak', 148.01060365509755), ('pardon', 145.8440264191808), ('pope', 143.79105410530676), ('resign', 140.46924827623357), ('scandal', 140.46924827623357), ('barent', 137.4780116204486), ('tournamen

In [79]:
import os
from collections import defaultdict
from math import log
from scipy.stats import chi2_contingency
# Assume process_document() is defined elsewhere

# Helper function to safely calculate the log of a probability
def safe_log(x):
    return log(x) if x > 0 else 1e-10

# Function to calculate the log likelihood ratio for given frequency counts
def log_likelihood_ratio(n11, n01, n10, n00):
    N = n11 + n01 + n10 + n00
    p = (n11 + n01) / N
    p1 = n11 / (n11 + n10) if n11+n10 > 0 else 0
    p2 = n01 / (n01 + n00) if n01+n00 > 0 else 0

    # Compute log likelihoods using the provided formulas
    log_L1 = n11 * safe_log(p) + n10 * safe_log(1 - p) + n01 * safe_log(p) + n00 * safe_log(1 - p)
    log_L2 = n11 * safe_log(p1) + n10 * safe_log(1 - p1) + n01 * safe_log(p2) + n00 * safe_log(1 - p2)
    # Compute the log likelihood ratio
    return -2 * (log_L1 - log_L2)

# Load the class labels for the training documents
train_data = {}
with open('training.txt', 'r') as f:
    for line in f:
        class_id, *doc_ids = line.strip().split()
        train_data[int(class_id)] = [int(doc_id) for doc_id in doc_ids]

# Calculate term frequencies within each class and overall
term_class_freq = defaultdict(lambda: defaultdict(int))
overall_term_freq = defaultdict(int)
class_sizes = defaultdict(int)
class_term_num = defaultdict(int)
total_docs_num = 0
# Accumulate the class sizes
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        class_sizes[class_id] += 1

# Calculate term frequencies for each class and overall frequencies
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        with open(f'./data/{doc_id}.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = process_document(text)
        # unique_tokens = set(tokens)
        for token in tokens:
            term_class_freq[token][class_id] += 1
            overall_term_freq[token] += 1
            class_term_num[class_id] += 1
            total_docs_num += 1

likelihood_ratios = defaultdict(lambda: defaultdict(int))
total_docs = sum(class_sizes.values())

for term in term_class_freq:
    # Calculate the frequencies for term absence

    # for class_id, count in class_sizes.items():
    #     n11 = term_class_freq[term][class_id]
    #     n10 = class_term_num[class_id] - n11
    #     n01 = overall_term_freq[term] - n11
    #     n00 = total_docs - class_sizes[class_id] - n10
    for class_id, count in class_sizes.items():
        # Frequency of term in class
        n11 = term_class_freq[term][class_id]
        # Frequency of term not in class
        n10 = overall_term_freq[term] - n11
        # Frequency of not term in class
        n01 = class_term_num[class_id] - n11
        # Frequency of not term not in class
        n00 = total_docs_num - class_term_num[class_id] - n10
        print(term,": ",n11, n10, n01, n00)
        llr_stat = log_likelihood_ratio(n11, n01, n10, n00)
        likelihood_ratios[term][class_id] = llr_stat

             
for term in likelihood_ratios:
    likelihood_ratios[term] = max(likelihood_ratios[term].values())

sorted_terms_by_lr = sorted(likelihood_ratios.items(), key=lambda item: item[1], reverse=True)
print(sorted_terms_by_lr)
top_terms = [term for term, lr in sorted_terms_by_lr[:500]]

navi :  17 52 2174 44893
navi :  0 69 4467 42600
navi :  0 69 3871 43196
navi :  3 66 3292 43775
navi :  0 69 3215 43852
navi :  0 69 3291 43776
navi :  0 69 3619 43448
navi :  7 62 2154 44913
navi :  42 27 3264 43803
navi :  0 69 3417 43650
navi :  0 69 4392 42675
navi :  0 69 4138 42929
navi :  0 69 5773 41294
unit :  3 88 2188 44857
unit :  10 81 4457 42588
unit :  6 85 3865 43180
unit :  5 86 3290 43755
unit :  0 91 3215 43830
unit :  7 84 3284 43761
unit :  4 87 3615 43430
unit :  0 91 2161 44884
unit :  1 90 3305 43740
unit :  8 83 3409 43636
unit :  1 90 4391 42654
unit :  5 86 4133 42912
unit :  41 50 5732 41313
state :  3 241 2188 44704
state :  29 215 4438 42454
state :  7 237 3864 43028
state :  3 241 3292 43600
state :  0 244 3215 43677
state :  30 214 3261 43631
state :  21 223 3598 43294
state :  11 233 2150 44742
state :  16 228 3290 43602
state :  19 225 3398 43494
state :  46 198 4346 42546
state :  11 233 4127 42765
state :  48 196 5725 41167
japan :  7 43 2184 44902


In [186]:
import os
from collections import defaultdict
from math import log

# Assume process_document() is defined elsewhere

# Helper function to safely calculate the log of a probability
def safe_log(x):
    return log(x) if x > 0 else 1e-10

# Function to calculate the log likelihood ratio for given frequency counts
def log_likelihood_ratio(n11, n01, n10, n00):
    N = n11 + n01 + n10 + n00
    p = (n11 + n01) / N
    p1 = n11 / (n11 + n10) if n11+n10 > 0 else 0
    p2 = n01 / (n01 + n00) if n01+n00 > 0 else 0

    # Compute log likelihoods using the provided formulas
    log_L1 = n11 * safe_log(p) + n10 * safe_log(1 - p) + n01 * safe_log(p) + n00 * safe_log(1 - p)
    log_L2 = n11 * safe_log(p1) + n10 * safe_log(1 - p1) + n01 * safe_log(p2) + n00 * safe_log(1 - p2)
    # Compute the log likelihood ratio
    return -2 * (log_L1 - log_L2)

# Load the class labels for the training documents
train_data = {}
with open('training.txt', 'r') as f:
    for line in f:
        class_id, *doc_ids = line.strip().split()
        train_data[int(class_id)] = [int(doc_id) for doc_id in doc_ids]

# Calculate term frequencies within each class and overall
term_class_freq = defaultdict(lambda: defaultdict(int))
overall_term_freq = defaultdict(int)
class_sizes = defaultdict(int)
class_term_num = defaultdict(int)
total_docs_num = 0
# Accumulate the class sizes
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        class_sizes[class_id] += 1

# Calculate term frequencies for each class and overall frequencies
for class_id, doc_ids in train_data.items():
    for doc_id in doc_ids:
        with open(f'./data/{doc_id}.txt', 'r', encoding='utf-8') as f:
            text = f.read()
        tokens = process_document(text)
        unique_tokens = set(tokens)
        for token in unique_tokens:
            term_class_freq[token][class_id] += 1
            overall_term_freq[token] += 1
            class_term_num[class_id] += 1

likelihood_ratios = defaultdict(lambda: defaultdict(int))
total_docs = sum(class_sizes.values())
for term in term_class_freq:
    # Calculate the frequencies for term absence

    for class_id, count in class_sizes.items():
        n11 = term_class_freq[term][class_id]
        n10 = class_sizes[class_id] - n11
        n01 = overall_term_freq[term] - n11
        n00 = total_docs - class_sizes[class_id] - n01
        llr_stat = log_likelihood_ratio(n11, n01, n10, n00)
        likelihood_ratios[term][class_id] = llr_stat

             
for term in likelihood_ratios:
    likelihood_ratios[term] = max(likelihood_ratios[term].values())

sorted_terms_by_lr = sorted(likelihood_ratios.items(), key=lambda item: item[1], reverse=True)
print(sorted_terms_by_lr)
top_terms = [term for term, lr in sorted_terms_by_lr[:500]]

[('kursk', 105.7638554863192), ('australian', 105.7638554863192), ('ivori', 105.7638554863192), ('carnahan', 105.7638554863192), ('alberto', 105.7638554863192), ('peru', 105.7638554863192), ('fujimori', 105.7638554863192), ('submarin', 98.28252240771253), ('missouri', 98.28252240771253), ('vietnam', 98.28252240771253), ('milosev', 93.44869654371398), ('slobodan', 93.44869654371398), ('salvador', 93.37239648856603), ('torpedo', 89.54365196713222), ('pope', 86.2070352000199), ('escap', 86.03560685299469), ('mel', 86.03560685299469), ('diver', 83.74247643308121), ('cole', 83.74247643308121), ('guei', 83.74247643308121), ('governor', 83.74247643308121), ('hanoi', 83.74247643308121), ('pardon', 81.3380357391053), ('resign', 76.55971544609307), ('scandal', 76.55971544609307), ('barent', 75.14806359884355), ('tournament', 75.14806359884355), ('sampra', 75.14806359884355), ('match', 75.14806359884355), ('peruvian', 75.14806359884355), ('vietnames', 75.14806359884355), ('el', 74.34779324253755)

In [183]:
import os
from collections import defaultdict
from math import log

# Train the Multinomial Naive Bayes classifier
def train_multinomial_nb(C, D):
    V = extract_vocabulary(D)
    N = len(D)
    prior = {}
    condprob = defaultdict(lambda: defaultdict(float))
    T_ct = defaultdict(lambda: defaultdict(int))
    for c in C:
        D_c = [d for d in D if D[d] == c]
        N_c = len(D_c)
        prior[c] = N_c / N
        text_c = concatenate_text_of_all_docs_in_class(D_c)
        # print(text_c)
        T_ct[c] = count_tokens_of_term(T_ct[c], text_c, V)
        print(T_ct[c])
        #print T_ct[c] 's key count
        print(len(T_ct[c]))
        for t in V:
            condprob[t][c] = (T_ct[c][t] + 1) / (sum(T_ct[c].values()) + len(T_ct[c]))
            
    return V, prior, condprob

# Apply the trained classifier to a new document
def apply_multinomial_nb(C, V, prior, condprob, d):
    W = extract_tokens_from_doc(V, d)
    score = defaultdict(float)
    
    for c in C:
        score[c] = log(prior[c])
        for t in W:
            score[c] += log(condprob[t][c])
            
    return max(score, key=score.get)

# Helper functions based on the pseudocode
def extract_vocabulary(D):
    V = set()
    for d in D:
        with open(d, 'r', encoding='utf-8') as f:
            V.update(process_document(f.read()))
            # extract the vocabulary which is in top_terms
            V = V.intersection(set(top_terms))
    return V

def concatenate_text_of_all_docs_in_class(D_c):
    text_c = ""
    for d in D_c:
        with open(d, 'r', encoding='utf-8') as f:
            d_text = process_document(f.read())
            # d_text = f.read()
            # concatenate the list of terms into string
            d_text = ' '.join(d_text)
            text_c += d_text

    return text_c

def count_tokens_of_term(t_ct, text_c, V):
    text_c = text_c.split()
    text_c = [term for term in text_c if term in V]
    for term in text_c:
        t_ct[term] += 1
    return t_ct

def extract_tokens_from_doc(V, d):
    tokens = process_document(d)
    return [token for token in tokens if token in V]

# Example usage
# Assuming D is a dictionary where keys are document paths and values are class labels
D = {f'./data/{doc_id}.txt': class_id for class_id, doc_ids in train_data.items() for doc_id in doc_ids}
C = set(train_data.keys())

V, prior, condprob = train_multinomial_nb(C, D)
new_doc_path = 'data/1.txt'
with open(new_doc_path, 'r', encoding='utf-8') as f:
    new_doc = f.read()
    predicted_class = apply_multinomial_nb(C, V, prior, condprob, new_doc)
print(f'The predicted class for the new document is: {predicted_class}')

defaultdict(<class 'int'>, {'navi': 17, 'japan': 7, 'submarin': 63, 'rescu': 13, 'ship': 7, 'china': 2, 'sea': 24, 'russian': 38, 'war': 7, 'sub': 12, 'suit': 3, 'nuclear': 15, 'attack': 1, 'uss': 1, 'defeat': 1, 'militari': 11, 'arm': 1, 'august': 6, 'kursk': 42, 'sank': 9, 'explos': 14, 'presid': 2, 'run': 1, 'recov': 14, 'seamen': 6, 'retriev': 7, 'chief': 4, 'resign': 1, 'vice': 1, 'presidenti': 1, 'st': 2, 'compart': 8, 'destroy': 3, 'buri': 1, 'norwegian': 14, 'collis': 10, 'deni': 1, 'torpedo': 4, 'barent': 14, 'sailor': 9, 'moscow': 4, 'main': 2, 'bodi': 24, 'danger': 2, 'diver': 38, 'hand': 1, 'store': 1, 'boat': 2, 'secret': 1, 'sunken': 7, 'port': 3, 'mission': 3, 'crash': 1, 'depart': 1, 'rule': 1, 'rescuer': 1, 'hull': 23, 'tuesday': 4, 'night': 3, 'weather': 7, 'western': 3, 'seven': 2})
62
defaultdict(<class 'int'>, {'white': 4, 'hous': 4, 'yugoslavia': 36, 'opposit': 71, 'presid': 55, 'slobodan': 17, 'milosev': 133, 'belgrad': 36, 'serbia': 32, 'eve': 3, 'strike': 19, '

In [184]:
import csv

# Extract the training document IDs to exclude them from prediction
training_doc_ids = set()
for doc_ids in train_data.values():
    training_doc_ids.update(doc_ids)

# Open a CSV file to write the predictions
with open('hw3_output.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Id', 'Value'])  # Write the header

    # Classify each document that is not part of the training data
    for filename in os.listdir('./data'):
        if filename.endswith('.txt'):
            doc_id = int(os.path.splitext(filename)[0])
            # Skip documents that are in the training set
            if doc_id in training_doc_ids:
                continue
            with open(f'./data/{filename}', 'r', encoding='utf-8') as f:
                text = f.read()
            # Get the predicted class
            predicted_class = apply_multinomial_nb(C, V, prior, condprob, text)
            # Write the result to the CSV file
            csvwriter.writerow([doc_id, predicted_class])

In [10]:
import os
import string
import nltk
import csv
from collections import defaultdict
from math import log
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
from scipy.stats import chi2_contingency
# Download necessary NLTK packages
nltk.download('stopwords')

# Function to process documents: tokenize, remove stop words, and stem
def process_document(text):
    """
    Process the given text by tokenizing, removing stop words, and stemming.
    """
    translator = str.maketrans('', '', string.punctuation + string.digits)
    tokens = text.translate(translator).split()
    tokens = [word.lower() for word in tokens]

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]

    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens]
    return tokens

# Function to calculate the log likelihood ratio
def log_likelihood_ratio(n11, n01, n10, n00):
    """
    Calculate the log likelihood ratio for given frequency counts.
    """
    def safe_log(x):
        return log(x) if x > 0 else 1e-10

    N = n11 + n01 + n10 + n00
    p = (n11 + n01) / N
    p1 = n11 / (n11 + n10) if n11 + n10 > 0 else 0
    p2 = n01 / (n01 + n00) if n01 + n00 > 0 else 0

    log_L1 = n11 * safe_log(p) + n10 * safe_log(1 - p) + n01 * safe_log(p) + n00 * safe_log(1 - p)
    log_L2 = n11 * safe_log(p1) + n10 * safe_log(1 - p1) + n01 * safe_log(p2) + n00 * safe_log(1 - p2)

    return -2 * (log_L1 - log_L2)

# Function to calculate Chi-Square
def chi_square(term_class_freq, overall_term_freq, class_sizes, total_docs, term):
    """
    Calculate the Chi-Square statistic for a given term.
    """
    table = []
    for class_id in class_sizes:
        # Frequency of term in class
        f_term_in_class = term_class_freq[term].get(class_id, 0)
        # Frequency of term not in class
        f_term_not_in_class = overall_term_freq[term] - f_term_in_class
        # Frequency of not term in class
        f_not_term_in_class = class_sizes[class_id] - f_term_in_class
        # Frequency of not term not in class
        f_not_term_not_in_class = total_docs - class_sizes[class_id] - f_term_not_in_class

        table.append([f_term_in_class, f_term_not_in_class])
        table.append([f_not_term_in_class, f_not_term_not_in_class])

    chi2_stat, p, dof, ex = chi2_contingency(table, correction=False)
    return chi2_stat

# Modified feature extraction and selection function
def extract_features(train_data):
    term_class_freq = defaultdict(lambda: defaultdict(int))
    overall_term_freq = defaultdict(int)
    class_sizes = defaultdict(int)
    total_docs = 0

    for class_id, doc_ids in train_data.items():
        for doc_id in doc_ids:
            class_sizes[class_id] += 1
            with open(f'./data/{doc_id}.txt', 'r', encoding='utf-8') as f:
                text = f.read()
            tokens = process_document(text)
            for token in set(tokens):
                term_class_freq[token][class_id] += 1
                overall_term_freq[token] += 1

    total_docs = sum(class_sizes.values())
    llr_scores, chi_scores = defaultdict(int), defaultdict(int)
    
    for term, freq in term_class_freq.items():
        for class_id, count in class_sizes.items():
            n11 = freq[class_id]
            n10 = class_sizes[class_id] - n11
            n01 = overall_term_freq[term] - n11
            n00 = total_docs - class_sizes[class_id] - n01

            llr_scores[term] = max(llr_scores[term], log_likelihood_ratio(n11, n01, n10, n00))
            chi_scores[term] = max(chi_scores[term], chi_square(term_class_freq, overall_term_freq, class_sizes, total_docs, term))

    top_llr = set(term for term, _ in sorted(llr_scores.items(), key=lambda item: item[1], reverse=True)[:500])
    top_chi = set(term for term, _ in sorted(chi_scores.items(), key=lambda item: item[1], reverse=True)[:500])

    # Return the intersection of the top terms from each method
    return top_llr & top_chi

# Load training data
train_data = {}
with open('training.txt', 'r') as f:
    for line in f:
        class_id, *doc_ids = line.strip().split()
        train_data[int(class_id)] = [int(doc_id) for doc_id in doc_ids]
# Extract the training document IDs to exclude them from prediction
training_doc_ids = set()
for doc_ids in train_data.values():
    training_doc_ids.update(doc_ids)

# Extract features
top_terms = extract_features(train_data)

# Functions for Multinomial Naive Bayes Classifier
def train_multinomial_nb(C, D):
    """
    Train the Multinomial Naive Bayes classifier.
    """
    # Helper functions for the classifier
    def extract_vocabulary(D):
        """ 
        Extract vocabulary from documents using only the top terms.
        """
        V = set()
        for d in D:
            with open(d, 'r', encoding='utf-8') as f:
                V.update([word for word in process_document(f.read()) if word in top_terms])
        return V


    def concatenate_text_of_all_docs_in_class(D_c):
        """ Concatenate text of all documents in a class """
        text_c = ""
        for d in D_c:
            with open(d, 'r', encoding='utf-8') as f:
                d_text = ' '.join(process_document(f.read()))
                text_c += d_text
        return text_c

    def count_tokens_of_term(t_ct, text_c, V):
        """ Count tokens of term """
        text_c = text_c.split()
        for term in text_c:
            if term in V:
                t_ct[term] += 1
        return t_ct

    V = extract_vocabulary(D)
    N = len(D)
    prior = {}
    condprob = defaultdict(lambda: defaultdict(float))
    T_ct = defaultdict(lambda: defaultdict(int))

    for c in C:
        D_c = [d for d in D if D[d] == c]
        N_c = len(D_c)
        prior[c] = N_c / N
        text_c = concatenate_text_of_all_docs_in_class(D_c)
        T_ct[c] = count_tokens_of_term(T_ct[c], text_c, V)
        for t in V:
            condprob[t][c] = (T_ct[c][t] + 1) / (sum(T_ct[c].values()) + len(V))

    return V, prior, condprob

def apply_multinomial_nb(C, V, prior, condprob, d):
    """
    Apply the trained Multinomial Naive Bayes classifier to a new document.
    """
    def extract_tokens_from_doc(V, d):
        """ Extract tokens from document """
        tokens = process_document(d)
        return [token for token in tokens if token in V]

    W = extract_tokens_from_doc(V, d)
    score = defaultdict(float)
    for c in C:
        score[c] = log(prior[c])
        for t in W:
            score[c] += log(condprob[t][c])
    return max(score, key=score.get)

# Main program
D = {f'./data/{doc_id}.txt': class_id for class_id, doc_ids in train_data.items() for doc_id in doc_ids}
C = set(train_data.keys())
V, prior, condprob = train_multinomial_nb(C, D)

# Predict and write output to a CSV file
with open('hw3_output.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(['Id', 'Value'])
    for filename in os.listdir('./data'):
        if filename.endswith('.txt'):
            doc_id = int(os.path.splitext(filename)[0])
            if doc_id in training_doc_ids:
                continue
            with open(f'./data/{filename}', 'r', encoding='utf-8') as f:
                text = f.read()
            predicted_class = apply_multinomial_nb(C, V, prior, condprob, text)
            csvwriter.writerow([doc_id, predicted_class])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yuan2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
import csv

# Function to load a CSV file into a dictionary
def load_csv_to_dict(filename):
    results = {}
    with open(filename, mode='r', newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)  # Skip the header row
        for row in csvreader:
            doc_id, value = row
            results[int(doc_id)] = int(value)
    return results

# Load the two CSV files you want to compare
csv_results_1 = load_csv_to_dict('hw3_output.csv')
csv_results_2 = load_csv_to_dict('hw3_output_98444.csv')

# Compare the two dictionaries and print differences
differences = {}
for doc_id in csv_results_1:
    if csv_results_1[doc_id] != csv_results_2.get(doc_id):
        differences[doc_id] = (csv_results_1[doc_id], csv_results_2.get(doc_id))

# Print out the differences
for doc_id, values in differences.items():
    print(f'Document ID {doc_id} has different classifications: {values[0]} vs {values[1]}')

# Print summary
if differences:
    print(f'There are {len(differences)} differences between the CSV files.')
else:
    print('The two CSV files are identical.')

Document ID 129 has different classifications: 13 vs 2
Document ID 486 has different classifications: 11 vs 10
Document ID 671 has different classifications: 4 vs 8
Document ID 865 has different classifications: 13 vs 4
Document ID 917 has different classifications: 13 vs 4
There are 5 differences between the CSV files.
