In [4]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.util import ngrams

# File locations used by the rest of the notebook (relative to the working directory).
train_data_path = 'train.tsv'  # training split, read as a header-less tab-separated file
test_data_path = 'valid.tsv'  # split that is loaded later as the evaluation ("test") data
output_path = 'output.txt'  # predicted labels are written here, one per line
stop_path = 'stopwords.txt'  # stop-word list; only its first column is used

In [5]:
# Read the header-less, tab-separated training file. quoting=3 is
# csv.QUOTE_NONE, so quote characters in the text are kept as ordinary data.
train_df = pd.read_csv(
    train_data_path,
    sep='\t',
    header=None,
    quoting=3,
)

# Column 2 feeds the text pipeline; column 1 holds the label strings.
train_x, train_y = train_df[2], train_df[1]

# Stop words: first column of the stop-word file, coerced to plain strings.
stop = pd.read_csv(stop_path, header=None)[0].astype(str).tolist()

In [6]:
# One shared stemmer instance, reused for every token.
stemmer = PorterStemmer()


def preprocess_text(text):
    """Turn one raw statement into a list of unigram and bigram tokens.

    The text is lower-cased and split on whitespace. Tokens that appear in the
    module-level ``stop`` list are dropped (the check is made on the raw
    lower-cased token, before stemming); the survivors are stemmed.
    Bigrams are built from consecutive surviving stems joined with ``_``.

    Args:
        text: The raw statement string.

    Returns:
        A list containing all stemmed unigrams followed by all bigrams.
    """
    stems = []
    for token in text.lower().split():
        if token in stop:
            continue
        stems.append(stemmer.stem(token))

    # Pairs of adjacent stems, e.g. ("tax", "cut") -> "tax_cut".
    joined_pairs = ['_'.join(pair) for pair in ngrams(stems, 2)]

    return stems + joined_pairs

# Tokenise straight from the raw text column instead of from train_x itself, so
# re-running this cell does not try to preprocess already-tokenised lists.
train_x = train_df[2].apply(preprocess_text)

In [7]:
# Map each rating string to an integer class id (0-5) ...
label_to_index = dict(zip(
    ('pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true'),
    range(6),
))
# ... and build the inverse lookup from class id back to rating string.
index_to_label = dict(zip(label_to_index.values(), label_to_index.keys()))


In [8]:
def create_feature_matrix(processed_texts, labels, label_map=None):
    """Build a binary presence matrix, encoded labels and class counts.

    Args:
        processed_texts: Sized, re-iterable collection of token lists, one
            per document (it is iterated twice).
        labels: Iterable of label strings, one per document.
        label_map: Optional mapping from label string to integer class id.
            Defaults to the module-level ``label_to_index``.

    Returns:
        A tuple ``(feature_matrix, encoded_labels, class_count, word_to_index)``:
        * ``feature_matrix`` - int array of shape (n_documents, vocab_size);
          entry [i, j] is 1 if token j occurs in document i, else 0.
        * ``encoded_labels`` - int array of class ids, one per document.
        * ``class_count`` - float array with one slot per class id in
          ``label_map`` (classes absent from ``labels`` get 0).
        * ``word_to_index`` - dict mapping each token to its column index.

    Raises:
        KeyError: If a label is not present in ``label_map``.
    """
    if label_map is None:
        label_map = label_to_index

    # Sort the vocabulary: iterating a set of strings directly gives an order
    # that can differ between interpreter runs, which would shuffle columns.
    vocabulary = sorted({word for text in processed_texts for word in text})
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

    feature_matrix = np.zeros((len(processed_texts), len(vocabulary)), dtype=int)
    for i, text in enumerate(processed_texts):
        for word in text:
            # Every token is in the vocabulary by construction.
            feature_matrix[i, word_to_index[word]] = 1

    encoded_labels = np.array([label_map[label] for label in labels], dtype=int)

    # Size the counts by the label map rather than by the labels that happen
    # to be present; otherwise a missing class makes higher ids out of range.
    n_classes = max(label_map.values()) + 1 if label_map else 0
    class_count = np.bincount(encoded_labels, minlength=n_classes).astype(float)

    return feature_matrix, encoded_labels, class_count, word_to_index

# Build the training feature matrix, integer labels, per-class counts and the
# token -> column lookup from the preprocessed training texts.
X, Y, class_count, word_to_index= create_feature_matrix(train_x, train_y)

In [9]:
class BernoulliNaiveBayes:
    """Bernoulli Naive Bayes classifier over binary presence/absence features.

    Each feature contributes ``log p`` when present and ``log (1 - p)`` when
    absent, where ``p`` is the add-one smoothed per-class presence rate
    estimated in :meth:`fit`.
    """

    def __init__(self):
        # Log prior of each class; set by fit().
        self.class_priors = None
        # Smoothed P(feature present | class), shape (n_classes, n_features); set by fit().
        self.feature_probs = None
        # Per-class value 1 / (class_size + 2), used for words outside the vocabulary.
        self.unseen_word_value_per_class = None

    def fit(self, X, y):
        """Estimate class priors and smoothed feature probabilities.

        Args:
            X: Array of shape (n_samples, n_features) with 0/1 entries.
            y: Non-negative integer class ids, one per row of ``X``.
        """
        y = np.asarray(y)
        class_count = np.bincount(y)

        self.class_priors = np.log(class_count / len(y))
        self.feature_probs = np.zeros((len(class_count), X.shape[1]))

        for c in range(len(class_count)):
            X_c = X[y == c]
            # Add-one smoothing over the two outcomes (present / absent).
            self.feature_probs[c] = (X_c.sum(axis=0) + 1) / (X_c.shape[0] + 2)

        # Same smoothing formula for a feature with zero occurrences in the class.
        self.unseen_word_value_per_class = [1 / (2 + count) for count in class_count]

    def _joint_log_likelihood(self, X):
        """Return the (n_samples, n_classes) matrix of log prior + log likelihood."""
        # The clip guards against log(0); for 0/1 features the smoothing in
        # fit() already keeps every probability strictly inside (0, 1).
        eps = 1e-10
        log_feature_probs = np.log(self.feature_probs.T.clip(eps, 1 - eps))
        log_complement_probs = np.log((1 - self.feature_probs.T).clip(eps, 1 - eps))
        return X @ log_feature_probs + (1 - X) @ log_complement_probs + self.class_priors

    def predict(self, X, verbose=True):
        """Predict class ids for rows encoded with the training vocabulary.

        Args:
            X: Array of shape (n_samples, n_features) with 0/1 entries.
            verbose: When true (the default), print the score matrix before
                returning.

        Returns:
            Integer array with the highest-scoring class id for each row.
        """
        log_probs = self._joint_log_likelihood(X)
        if verbose:
            print(log_probs)
        return np.argmax(log_probs, axis=1)

    def predict_test(self, X_test, unseen_word_count, verbose=True):
        """Predict class ids, penalising rows that contain out-of-vocabulary words.

        Args:
            X_test: Array of shape (n_samples, n_features) with 0/1 entries.
            unseen_word_count: One entry per row of ``X_test``; the number of
                distinct tokens in that document missing from the vocabulary.
            verbose: When true (the default), print the score matrix before
                returning.

        Returns:
            Integer array with the highest-scoring class id for each row.
        """
        log_probs = self._joint_log_likelihood(X_test)

        # NOTE(review): the penalty is added once per document whenever the
        # count is positive, regardless of how many unseen words there are.
        # Confirm this is the intended scoring rule.
        has_unseen = np.asarray(unseen_word_count) > 0
        log_probs[has_unseen] += np.log(self.unseen_word_value_per_class)

        if verbose:
            print(log_probs)
        return np.argmax(log_probs, axis=1)

# Fit on the training matrix, then score those same rows, so the figure
# printed below is accuracy on the training data.
model = BernoulliNaiveBayes()
model.fit(X, Y)
predictions = model.predict(X)

# Share of rows whose predicted class id equals the encoded label.
accuracy = (predictions == Y).mean()
print(f'Accuracy: {accuracy * 100:.2f}%')

[[-235.20380341 -168.63961103 -186.30283374 -177.16646186 -182.69276768
  -187.57487855]
 [-298.38248998 -240.68287134 -246.94754023 -230.63294187 -243.70049808
  -250.09137345]
 [-278.26466442 -218.65515404 -227.2157545  -219.8620189  -212.80996452
  -227.75229483]
 ...
 [-313.0900261  -262.02293427 -266.92541586 -246.09543848 -265.62019584
  -267.78534876]
 [-215.14044379 -150.3372631  -166.98466602 -157.80106424 -156.92993406
  -165.07055479]
 [-310.5747344  -279.27016658 -288.31677209 -281.55392622 -284.08483628
  -285.57965497]]
Accuracy: 70.74%


In [10]:
# Translate the predicted class ids back into their label strings.
predicted_labels = [index_to_label[class_id] for class_id in predictions]
print(predicted_labels)
print(len(predicted_labels))

# Persist the predictions, one label per line.
with open(output_path, 'w') as f:
    f.writelines(f"{label}\n" for label in predicted_labels)

['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'half-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'false', 'mostly-true', 'half-true', 'half-true', 'true', 'barely-true', 'half-true', 'mostly-true', 'false', 'mostly-true', 'mostly-true', 'half-true', 'half-true', 'false', 'mostly-true', 'half-true', 'false', 'mostly-true', 'half-true', 'half-true', 'false', 'mostly-true', 'mostly-true', 'false', 'half-true', 'true', 'half-true', 'mostly-true', 'false', 'half-true', 'false', 'half-true', 'false', 'half-true', 'false', 'false', 'false', 'false', 'barely-true', 'half-true', 'false', 'true', 'half-true', 'half-true', 'half-true', 'barely-true', 'false', 'mostly-true', 'false', 'mostly-true', 'true', 'mostly-true', 'false', 'half-true', 'true', 'false', 'barely-true', 'half-true', 'true', 'half-true', 'mostly-true', 'half-true', 'mostly-true', 'true', 'false', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'false', 'false', '

In [11]:
# Compare the model's training predictions with the reference probabilities
# stored in the checker file.
file= 'checker_files/bernoulli_bigrams_probas_train.npy'
given= np.load(file)
print(np.log(given))

# Reference prediction = label whose stored probability is largest in each row.
argmax_indices = np.argmax(given, axis=1)
argmax_indices= [index_to_label[z] for z in argmax_indices]
print(argmax_indices)
print(len(argmax_indices))

# Agreement rate. The comparison has to be element-wise on arrays: `==` between
# two Python lists yields a single bool, so np.mean would only ever be 0.0 or 1.0.
print(np.mean(np.array(argmax_indices) == np.array(predicted_labels)))
# Whole-list equality: True only if every prediction matches the reference.
print(argmax_indices==predicted_labels)

[[-235.20380341 -168.63961103 -186.30283374 -177.16646186 -182.69276768
  -187.57487855]
 [-298.38248998 -240.68287134 -246.94754023 -230.63294187 -243.70049808
  -250.09137345]
 [-278.26466442 -218.65515404 -227.2157545  -219.8620189  -212.80996452
  -227.75229483]
 ...
 [-313.0900261  -262.02293427 -266.92541586 -246.09543848 -265.62019584
  -267.78534876]
 [-215.14044379 -150.3372631  -166.98466602 -157.80106424 -156.92993406
  -165.07055479]
 [-310.5747344  -279.27016658 -288.31677209 -281.55392622 -284.08483628
  -285.57965497]]
['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'half-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'false', 'mostly-true', 'half-true', 'half-true', 'true', 'barely-true', 'half-true', 'mostly-true', 'false', 'mostly-true', 'mostly-true', 'half-true', 'half-true', 'false', 'mostly-true', 'half-true', 'false', 'mostly-true', 'half-true', 'half-true', 'false', 'mostly-true', 'mostly-true', 'false', 'half-tru

In [12]:
# Load the evaluation split with the same parsing options as the training
# file: tab-separated, no header row, quoting=3 (csv.QUOTE_NONE).
test_df = pd.read_csv(
    test_data_path,
    sep='\t',
    header=None,
    quoting=3,
)

# Column 2 feeds the text pipeline; column 1 holds the label strings.
test_x, test_y = test_df[2], test_df[1]

In [13]:
def create_test_feature_matrix(processed_texts, word_to_index):
    """Encode documents against an existing vocabulary.

    Args:
        processed_texts: Sized collection of token lists, one per document.
        word_to_index: Mapping from known token to its column index.

    Returns:
        A pair ``(feature_matrix, unseen_word_count)``:
        * ``feature_matrix`` - int array of shape (n_documents, vocab_size);
          entry [i, j] is 1 if known token j occurs in document i, else 0.
        * ``unseen_word_count`` - int array giving, per document, the number
          of distinct tokens that are not in ``word_to_index``.
    """
    n_docs = len(processed_texts)
    presence = np.zeros((n_docs, len(word_to_index)), dtype=int)
    unseen_word_count = np.zeros(n_docs, dtype=int)

    for row, tokens in enumerate(processed_texts):
        distinct_tokens = set(tokens)
        known_tokens = [token for token in distinct_tokens if token in word_to_index]

        for token in known_tokens:
            presence[row, word_to_index[token]] = 1

        # Whatever is distinct but not known is, by definition, unseen.
        unseen_word_count[row] = len(distinct_tokens) - len(known_tokens)

    return presence, unseen_word_count


# Tokenise straight from the raw text column instead of from test_x itself, so
# re-running this cell does not try to preprocess already-tokenised lists.
test_x = test_df[2].apply(preprocess_text)
X_test, unseen = create_test_feature_matrix(test_x, word_to_index)

# Predict class ids, then translate them back into label strings.
test_predictions = [index_to_label[pred] for pred in model.predict_test(X_test, unseen)]

# Compare element-wise on arrays so the result does not depend on how a plain
# list and a pandas Series happen to dispatch `==`.
test_accuracy = np.mean(np.array(test_predictions) == test_y.to_numpy())
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')


[[-168.13596995 -103.6324742  -114.78884665 -100.55925587 -103.83446566
  -112.27347019]
 [-191.40361508 -129.3111251  -139.08383399 -127.89565767 -131.62572947
  -136.60984574]
 [-240.5108114  -177.73863764 -187.70690172 -177.88022263 -179.4452085
  -186.37920737]
 ...
 [-211.64762733 -147.86079468 -158.53174684 -144.78459951 -149.45867864
  -153.75705553]
 [-276.41881458 -215.16666544 -223.31943905 -212.42290712 -218.42926297
  -224.03748064]
 [-212.15265854 -140.81147183 -156.00635342 -142.27044054 -146.35707328
  -154.99581866]]
Test Accuracy: 21.73%


In [14]:
# Show the predicted label strings for the evaluation split.
print(test_predictions)

['half-true', 'half-true', 'false', 'mostly-true', 'half-true', 'half-true', 'half-true', 'false', 'false', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'half-true', 'false', 'false', 'half-true', 'mostly-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'false', 'half-true', 'false', 'mostly-true', 'half-true', 'half-true', 'false', 'half-true', 'half-true', 'false', 'false', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'false', 'half-true', 'false', 'half-true', 'false',

In [15]:
# Show the labels read from column 1 of the evaluation file, for comparison.
print(test_y)

0       barely-true
1        pants-fire
2             false
3         half-true
4         half-true
           ...     
1279      half-true
1280    mostly-true
1281           true
1282          false
1283    barely-true
Name: 1, Length: 1284, dtype: object


In [16]:
# Compare the evaluation-split predictions with the reference probabilities
# stored in the checker file.
valfile = 'checker_files/bernoulli_bigrams_probas_test.npy'
givenval = np.load(valfile)
print(np.log(givenval))

# Reference prediction = label whose stored probability is largest in each row.
argmax_indices2 = [
    index_to_label[class_id]
    for class_id in np.argmax(givenval, axis=1)
]
print(argmax_indices2)
print(len(argmax_indices2))
print(len(test_predictions))

# Whole-list equality first, then the element-wise agreement rate.
print(argmax_indices2 == test_predictions)
print(np.mean(
    np.array(argmax_indices2) == np.array(test_predictions)
))

[[-168.13596995 -103.6324742  -114.78884665 -100.55925587 -103.83446566
  -112.27347019]
 [-191.40361508 -129.3111251  -139.08383399 -127.89565767 -131.62572947
  -136.60984574]
 [-240.5108114  -177.73863764 -187.70690172 -177.88022263 -179.4452085
  -186.37920737]
 ...
 [-211.64762733 -147.86079468 -158.53174684 -144.78459951 -149.45867864
  -153.75705553]
 [-276.41881458 -215.16666544 -223.31943905 -212.42290712 -218.42926297
  -224.03748064]
 [-212.15265854 -140.81147183 -156.00635342 -142.27044054 -146.35707328
  -154.99581866]]
['half-true', 'half-true', 'false', 'mostly-true', 'half-true', 'half-true', 'half-true', 'false', 'false', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'half-true', 'false', 'false', 'half-true', 'mostly-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'half-true', 'fa

In [17]:
# Print the row index of every disagreement between the reference labels and
# our predictions. Walking the two lists in lockstep avoids hard-coding the
# number of evaluation rows.
for i, (reference_label, predicted_label) in enumerate(zip(argmax_indices2, test_predictions)):
    if reference_label != predicted_label:
        print(i)