In [118]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer

train_data_path = 'train.tsv'
test_data_path = 'valid.tsv'
output_path = 'output.txt'
stop_path = 'stopwords.txt'

In [119]:
train_df= pd.read_csv(train_data_path,header=None,sep='\t',quoting=3)
train_x= train_df[2]
train_y= train_df[1]
stop= pd.read_csv(stop_path,header=None)[0]
stop=stop.astype(str).tolist()

In [120]:
stemmer = PorterStemmer()

def preprocess_text(text):
    words = text.lower().split()
    filtered_words = [stemmer.stem(word) for word in words if word not in stop]
    return filtered_words

train_x= train_x.apply(preprocess_text)

In [121]:
label_to_index = {'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5}
index_to_label = {v: k for k, v in label_to_index.items()}


In [122]:
def create_feature_matrix(processed_texts, labels):
    vocabulary = set(word for text in processed_texts for word in text)
    vocab_size = len(vocabulary)
    
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
    
    feature_matrix = np.zeros((len(processed_texts), vocab_size), dtype=int)
    class_count = np.zeros(len(set(labels)))

    for i, text in enumerate(processed_texts):
        for word in text:
            if word in word_to_index: 
                feature_matrix[i, word_to_index[word]] = 1
    
    encoded_labels = np.array([label_to_index[label] for label in labels])
    
    for label in labels:
        class_count[label_to_index[label]] += 1

    return feature_matrix, encoded_labels, class_count, word_to_index

X, Y, class_count, word_to_index= create_feature_matrix(train_x, train_y)

In [123]:
class BernoulliNaiveBayes:
    def __init__(self):
        self.class_priors = None
        self.feature_probs = None

    def fit(self, X, y):
        self.class_priors = np.log(np.bincount(y) / len(y))
        self.feature_probs = np.zeros((len(self.class_priors), X.shape[1]))
        
        for c in range(len(self.class_priors)):
            X_c = X[y == c]
            self.feature_probs[c] = (X_c.sum(axis=0) + 1) / (X_c.shape[0] + 6) 

    def predict(self, X):
        eps = 1e-10 
        log_feature_probs = np.log(self.feature_probs.T.clip(eps, 1 - eps))
        log_complement_probs = np.log((1 - self.feature_probs.T).clip(eps, 1 - eps))
        log_probs = X @ log_feature_probs + (1 - X) @ log_complement_probs + self.class_priors
        print(log_probs)
        return np.argmax(log_probs, axis=1)

model = BernoulliNaiveBayes()
model.fit(X, Y)

predictions = model.predict(X)

accuracy = np.mean(predictions == Y)
print(f'Accuracy: {accuracy * 100:.2f}%')

[[ -78.54867317  -65.53345251  -69.9773403   -68.96822327  -71.91069839
   -71.70205113]
 [-108.7538688   -96.11380135  -96.0512531   -93.55983052  -97.49219335
   -98.18265378]
 [ -99.4930915   -87.37031253  -86.80334203  -87.94567879  -85.74417732
   -88.848292  ]
 ...
 [-112.30867859 -104.45006604 -103.94368392  -99.02617572 -105.09728541
  -104.49759669]
 [ -71.94803549  -60.22804046  -64.78641312  -64.22695987  -61.31183102
   -64.05029852]
 [-117.20238078 -117.3050118  -120.11115634 -119.73893478 -120.91819384
  -119.1054621 ]]
Accuracy: 66.37%


In [124]:
predicted_labels = [index_to_label[pred] for pred in predictions]
print(predicted_labels)
print(len(predicted_labels))
with open(output_path, 'w') as f:
    for label in predicted_labels:
        f.write(f"{label}\n")

['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'barely-true', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'barely-true', 'mostly-true', 'mostly-true', 'false', 'half-true', 'false', 'mostly-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'false', 'mostly-true', 'mostly-true', 'false', 'true', 'true', 'false', 'false', 'false', 'half-true', 'false', 'half-true', 'false', 'half-true', 'false', 'false', 'false', 'false', 'barely-true', 'mostly-true', 'false', 'half-true', 'half-true', 'mostly-true', 'half-true', 'false', 'false', 'mostly-true', 'half-true', 'mostly-true', 'true', 'mostly-true', 'false', 'mostly-true', 'true', 'false', 'barely-true', 'barely-true', 'true', 'half-true', 'mostly-true', 'half-true', 'mostly-true', 'true', 'false', 'false', 'half-true', 'half-true', 'false', 'half-true', 'false', 'false', 'fa

In [125]:
file= 'checker_files/bernoulli_probas_train.npy'
given= np.load(file)
print(np.log(given))
argmax_indices = np.argmax(given, axis=1)
argmax_indices= [index_to_label[z] for z in argmax_indices]
print(argmax_indices)
print(len(argmax_indices))
print(argmax_indices==predicted_labels)

[[ -78.54985311  -65.53395164  -69.9779418   -68.96869308  -71.91120561
   -71.70264338]
 [-108.75504874  -96.11430048  -96.05185461  -93.56030034  -97.49270058
   -98.18324602]
 [ -99.49427144  -87.37081166  -86.80394354  -87.9461486   -85.74468455
   -88.84888424]
 ...
 [-112.30985854 -104.45056516 -103.94428542  -99.02664553 -105.09779264
  -104.49818893]
 [ -71.94921543  -60.22853958  -64.78701462  -64.22742969  -61.31233824
   -64.05089076]
 [-117.20356072 -117.30551092 -120.11175784 -119.73940459 -120.91870107
  -119.10605434]]
['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'barely-true', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'barely-true', 'mostly-true', 'mostly-true', 'false', 'half-true', 'false', 'mostly-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'false', 'mostly-true', 'mostly-true', 'false', 'tr

In [126]:
test_df= pd.read_csv(test_data_path,header=None,sep='\t',quoting=3)
test_x= train_df[2]
test_y= train_df[1]

In [127]:
def create_test_feature_matrix(processed_texts, word_to_index):
    vocab_size = len(word_to_index)
    
    # Initialize feature matrix for test data
    feature_matrix = np.zeros((len(processed_texts), vocab_size), dtype=int)
    
    for i, text in enumerate(processed_texts):
        for word in text:
            if word in word_to_index:  # Only include words from the training vocabulary
                feature_matrix[i, word_to_index[word]] = 1
                
    return feature_matrix

test_x= test_x.apply(preprocess_text)
X_test = create_test_feature_matrix(test_x, word_to_index)

test_predictions = model.predict(X_test)
test_predictions = [index_to_label[pred] for pred in test_predictions]
test_accuracy = np.mean(test_predictions == test_y)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')


[[ -78.54867317  -65.53345251  -69.9773403   -68.96822327  -71.91069839
   -71.70205113]
 [-108.7538688   -96.11380135  -96.0512531   -93.55983052  -97.49219335
   -98.18265378]
 [ -99.4930915   -87.37031253  -86.80334203  -87.94567879  -85.74417732
   -88.848292  ]
 ...
 [-112.30867859 -104.45006604 -103.94368392  -99.02617572 -105.09728541
  -104.49759669]
 [ -71.94803549  -60.22804046  -64.78641312  -64.22695987  -61.31183102
   -64.05029852]
 [-117.20238078 -117.3050118  -120.11115634 -119.73893478 -120.91819384
  -119.1054621 ]]
Test Accuracy: 66.37%


In [128]:
print(test_predictions)

['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'barely-true', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'barely-true', 'mostly-true', 'mostly-true', 'false', 'half-true', 'false', 'mostly-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'false', 'mostly-true', 'mostly-true', 'false', 'true', 'true', 'false', 'false', 'false', 'half-true', 'false', 'half-true', 'false', 'half-true', 'false', 'false', 'false', 'false', 'barely-true', 'mostly-true', 'false', 'half-true', 'half-true', 'mostly-true', 'half-true', 'false', 'false', 'mostly-true', 'half-true', 'mostly-true', 'true', 'mostly-true', 'false', 'mostly-true', 'true', 'false', 'barely-true', 'barely-true', 'true', 'half-true', 'mostly-true', 'half-true', 'mostly-true', 'true', 'false', 'false', 'half-true', 'half-true', 'false', 'half-true', 'false', 'false', 'fa

In [129]:
print(test_y)

0              false
1          half-true
2        mostly-true
3              false
4          half-true
            ...     
10264    mostly-true
10265    mostly-true
10266      half-true
10267          false
10268     pants-fire
Name: 1, Length: 10269, dtype: object


In [130]:
valfile= 'checker_files/bernoulli_probas_test.npy'
givenval= np.load(valfile)
print(np.log(givenval))
argmax_indices2 = np.argmax(given, axis=1)
argmax_indices2= [index_to_label[z] for z in argmax_indices2]
print(argmax_indices2)
print(len(argmax_indices2))
print(argmax_indices2==test_predictions)

[[ -45.13544117  -32.84820932  -34.82105561  -32.04217734  -32.06072376
   -34.22427098]
 [ -81.90607163  -73.04613407  -73.95738422  -72.6307026   -73.93305691
   -72.73905061]
 [-104.09056459  -91.77636558  -94.55185138  -93.7742541   -92.81360526
   -95.11079717]
 ...
 [ -70.25559736  -57.58633127  -59.51837587  -56.70071003  -58.23138323
   -58.23069958]
 [-131.76417449 -120.64881953 -121.99675545 -119.86037504 -123.13146894
  -121.91500502]
 [ -83.14304886  -68.28296702  -72.92550916  -70.32666497  -70.86815571
   -73.41578415]]
['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'barely-true', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'barely-true', 'mostly-true', 'mostly-true', 'false', 'half-true', 'false', 'mostly-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'false', 'mostly-true', 'mostly-true', 'false', 'tr