In [109]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer

# File locations used by the cells below.
train_data_path = "train.tsv"    # tab-separated training data
test_data_path = "valid.tsv"     # tab-separated data loaded as the test split
output_path = "output.txt"       # predicted labels are written here, one per line
stop_path = "stopwords.txt"      # stop-word list

In [110]:
# Read the header-less, tab-separated training file. Column 2 is used as the
# input text and column 1 as the label by the cells that follow.
train_df = pd.read_csv(train_data_path, sep='\t', header=None)
train_x, train_y = train_df[2], train_df[1]

# Stop words: first column of the stop-word file, coerced to plain Python strings.
stop = pd.read_csv(stop_path, header=None)[0].astype(str).tolist()

In [111]:
stemmer = PorterStemmer()

# Snapshot of the stop-word list as a frozenset so each membership test is a hash
# lookup instead of a linear scan of the list. Rebuilt every time this cell runs.
stop_set = frozenset(stop)


def preprocess_text(text):
    """Lower-case, tokenize, drop stop words and stem a single piece of text.

    Args:
        text: the raw string to process.

    Returns:
        A list of stemmed tokens, in their original order, excluding any token
        that appears in the stop-word list.

    Note:
        Tokens are produced by splitting on the single space character only, so
        runs of consecutive spaces yield empty-string tokens and punctuation
        stays attached to words. This is kept as-is so the resulting features
        do not change.
    """
    words = text.lower().split(' ')
    return [stemmer.stem(word) for word in words if word not in stop_set]

# Always start from the raw text column of train_df (the same column train_x was
# loaded from) so that re-running this cell does not feed already-tokenized lists
# back into preprocess_text, which would fail on `.lower()`.
train_x = train_df[2].apply(preprocess_text)

In [112]:
# Class labels; a label's position in this tuple is its integer class index.
_LABELS_IN_INDEX_ORDER = ('pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true')

# Forward (label -> index) and inverse (index -> label) lookups.
label_to_index = {name: idx for idx, name in enumerate(_LABELS_IN_INDEX_ORDER)}
index_to_label = dict(enumerate(_LABELS_IN_INDEX_ORDER))


In [113]:
def create_feature_matrix(processed_texts, labels, word_to_index=None):
    """Build a binary bag-of-words matrix plus integer-encoded labels.

    Args:
        processed_texts: sized iterable of token lists, one list per document.
        labels: iterable of label strings; each must be a key of the
            module-level ``label_to_index`` mapping.
        word_to_index: optional existing token -> column-index mapping. When
            given it is reused unchanged and tokens missing from it are
            ignored, so other data can be encoded in the same feature space as
            the data the mapping was built from. When omitted, the vocabulary
            is built from ``processed_texts``.

    Returns:
        A 4-tuple ``(feature_matrix, encoded_labels, class_count, word_to_index)``:
          * feature_matrix: int array of shape (n_documents, vocabulary size);
            entry [i, j] is 1 if token j occurs in document i, else 0.
          * encoded_labels: int array of class indices, one per label.
          * class_count: float array with one slot per entry of
            ``label_to_index`` holding how many labels fell in that class
            (classes absent from ``labels`` get 0).
          * word_to_index: the mapping that defines the matrix columns.

    Raises:
        KeyError: if a label is not present in ``label_to_index``.
    """
    if word_to_index is None:
        # Sort the vocabulary: iterating a set of strings gives an order that can
        # change between interpreter runs, which would shuffle the columns.
        vocabulary = sorted({word for text in processed_texts for word in text})
        word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

    feature_matrix = np.zeros((len(processed_texts), len(word_to_index)), dtype=int)
    for row, text in enumerate(processed_texts):
        for word in text:
            col = word_to_index.get(word)
            # Out-of-vocabulary tokens (possible only with a supplied mapping) are skipped.
            if col is not None:
                feature_matrix[row, col] = 1

    encoded_labels = np.array([label_to_index[label] for label in labels], dtype=int)

    # Size by the number of known classes rather than the number of distinct labels
    # seen, so a class missing from `labels` cannot cause an out-of-range index.
    class_count = np.bincount(encoded_labels, minlength=len(label_to_index)).astype(float)

    return feature_matrix, encoded_labels, class_count, word_to_index

# Encode the preprocessed training texts and their labels; keep every returned piece.
X, Y, class_count, word_to_index = create_feature_matrix(
    processed_texts=train_x,
    labels=train_y,
)

In [114]:
class BernoulliNaiveBayes:
    """Bernoulli Naive Bayes classifier for binary (0/1) feature vectors.

    After ``fit``:
      * ``class_priors`` holds the log prior of each class index, shape (n_classes,).
      * ``feature_probs`` holds the Laplace-smoothed estimate of
        P(feature = 1 | class), shape (n_classes, n_features).
    """

    def __init__(self):
        self.class_priors = None
        self.feature_probs = None

    def fit(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Args:
            X: array-like of shape (n_samples, n_features) with 0/1 entries.
            y: array-like of non-negative integer class indices, one per row of X.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        class_counts = np.bincount(y)
        # A class index with no samples gets prior 0 and therefore log-prior -inf,
        # which simply means it is never predicted; silence the log(0) warning.
        with np.errstate(divide='ignore'):
            self.class_priors = np.log(class_counts / len(y))

        self.feature_probs = np.zeros((len(self.class_priors), X.shape[1]))
        for c in range(len(self.class_priors)):
            X_c = X[y == c]
            # Add-one smoothing over the two outcomes (present / absent) keeps every
            # probability strictly inside (0, 1), so the logs below stay finite.
            self.feature_probs[c] = (X_c.sum(axis=0) + 1) / (X_c.shape[0] + 2)

    def joint_log_likelihood(self, X):
        """Return log P(class) + log P(x | class) for every row and class.

        Args:
            X: array-like of shape (n_samples, n_features) with 0/1 entries.

        Returns:
            Float array of shape (n_samples, n_classes). The scores are not
            normalized across classes.
        """
        X = np.asarray(X)
        present = X @ np.log(self.feature_probs.T)
        absent = (1 - X) @ np.log(1 - self.feature_probs.T)
        return present + absent + self.class_priors

    def predict(self, X):
        """Return the highest-scoring class index for each row of X."""
        return np.argmax(self.joint_log_likelihood(X), axis=1)

# Train on the full training matrix and then score those same rows, so the number
# printed below is accuracy on the training data itself.
model = BernoulliNaiveBayes()
model.fit(X, Y)

predictions = model.predict(X)

# Fraction of rows whose predicted class index equals the true one.
accuracy = (predictions == Y).mean()
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 66.18%


In [115]:
# Map predicted class indices back to label strings, echo them, and save them to
# output_path with one label per line.
predicted_labels = [index_to_label[class_idx] for class_idx in predictions]
print(predicted_labels)
with open(output_path, 'w') as f:
    f.writelines(f"{label}\n" for label in predicted_labels)

['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'barely-true', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'barely-true', 'mostly-true', 'mostly-true', 'false', 'half-true', 'false', 'mostly-true', 'half-true', 'false', 'half-true', 'half-true', 'half-true', 'false', 'mostly-true', 'mostly-true', 'false', 'true', 'true', 'false', 'false', 'false', 'half-true', 'false', 'half-true', 'false', 'half-true', 'false', 'false', 'false', 'false', 'barely-true', 'mostly-true', 'false', 'half-true', 'half-true', 'mostly-true', 'half-true', 'false', 'false', 'mostly-true', 'half-true', 'mostly-true', 'true', 'mostly-true', 'false', 'mostly-true', 'true', 'false', 'barely-true', 'barely-true', 'true', 'half-true', 'mostly-true', 'half-true', 'mostly-true', 'true', 'false', 'mostly-true', 'half-true', 'half-true', 'false', 'half-true', 'false', 'false

In [116]:
# Load the array stored in the checker file and show, for every row, the label
# whose column holds the largest value.
# NOTE(review): presumably these are reference per-class scores for the training
# rows, with columns in label_to_index order -- confirm against the checker docs.
file = 'checker_files/bernoulli_probas_train.npy'
given = np.load(file)
print(given)
argmax_indices = [index_to_label[z] for z in given.argmax(axis=1)]
print(argmax_indices)

[[9.09388999e-22 3.62931681e-21 5.50739822e-22 2.11042403e-22
  1.61177398e-23 5.76501247e-23]
 [1.04894515e-34 2.63693527e-34 3.51893973e-33 6.14270411e-33
  1.48824695e-34 3.12125505e-34]
 [9.58801428e-31 1.58321946e-30 3.93897989e-29 1.45105136e-30
  2.18278473e-29 9.51556055e-31]
 ...
 [2.34626051e-36 4.16843369e-38 1.10060270e-36 1.91716371e-35
  5.40024362e-38 3.66612100e-37]
 [9.62083537e-19 8.95487796e-19 1.34765061e-19 3.13039266e-20
  7.31026202e-19 1.27370929e-19]
 [2.23113399e-38 1.49898638e-43 1.34140124e-43 2.57781179e-44
  9.45918484e-45 1.73433140e-43]]
['false', 'half-true', 'barely-true', 'pants-fire', 'pants-fire', 'true', 'true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'half-true', 'mostly-true', 'pants-fire', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'barely-true', 'mostly-true', 'mostly-true', 'false', 'barely-true', 'pants-fire', 'pants-fire', 'half-true', 'pants-fire', 'half-true', 'half-true', 'half-true', 'false

In [117]:
# Load the header-less, tab-separated test split. Text and labels are taken from
# test_df itself (same column layout as the training file), not from train_df.
test_df = pd.read_csv(test_data_path, header=None, sep='\t')
test_x = test_df[2]
test_y = test_df[1]
