In [46]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.util import ngrams

train_data_path = 'train.tsv'
test_data_path = 'valid.tsv'
output_path = 'output.txt'
stop_path = 'stopwords.txt'

In [47]:
train_df= pd.read_csv(train_data_path,header=None,sep='\t',quoting=3)
train_x= train_df[2]
train_y= train_df[1]
stop= pd.read_csv(stop_path,header=None)[0]
stop=stop.astype(str).tolist()

In [48]:
stemmer = PorterStemmer()

def preprocess_text(text):
    words = text.lower().split()
    # Filter out stopwords and apply stemming
    filtered_words = [stemmer.stem(word) for word in words if word not in stop]

    # Create unigrams (individual words)
    unigrams = filtered_words
    
    # Create bigrams (pairs of consecutive words)
    bigrams = ['_'.join(bigram) for bigram in ngrams(filtered_words, 2)]
    
    # Combine unigrams and bigrams
    return unigrams + bigrams

train_x= train_x.apply(preprocess_text)

In [49]:
label_to_index = {'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5}
index_to_label = {v: k for k, v in label_to_index.items()}


In [50]:
def create_feature_matrix(processed_texts, labels):
    vocabulary = set(word for text in processed_texts for word in text)
    vocab_size = len(vocabulary)
    
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
    
    feature_matrix = np.zeros((len(processed_texts), vocab_size), dtype=int)
    class_count = np.zeros(len(set(labels)))

    for i, text in enumerate(processed_texts):
        for word in text:
            if word in word_to_index:
                feature_matrix[i, word_to_index[word]] += 1 
    
    encoded_labels = np.array([label_to_index[label] for label in labels])
    
    for label in labels:
        class_count[label_to_index[label]] += 1

    return feature_matrix, encoded_labels, class_count, word_to_index

X, Y, class_count, word_to_index= create_feature_matrix(train_x, train_y)

In [51]:
class MultinomialNaiveBayes:
    def __init__(self):
        self.class_priors = None
        self.feature_probs = None

    def fit(self, X, y):
        self.class_priors = np.log(np.bincount(y) / len(y))
        self.feature_probs = np.zeros((len(self.class_priors), X.shape[1]))
        
        for c in range(len(self.class_priors)):
            X_c = X[y == c]
            self.feature_probs[c] = (X_c.sum(axis=0) + 1) / (X_c.sum() + X.shape[1]) 

    def predict(self, X):
        eps = 1e-10 
        log_feature_probs = np.log(self.feature_probs.T.clip(eps, 1))
        log_probs = X @ log_feature_probs + self.class_priors
        print(log_probs)
        return np.argmax(log_probs, axis=1)

model = MultinomialNaiveBayes()
model.fit(X, Y)

predictions = model.predict(X)

accuracy = np.mean(predictions == Y)
print(f'Accuracy: {accuracy * 100:.2f}%')

[[-185.45151938 -172.99983292 -181.72029755 -181.94302542 -185.22959393
  -184.65105993]
 [-301.81531162 -291.80457439 -291.10050719 -281.73105164 -293.11483803
  -295.40259494]
 [-267.46008916 -256.48833336 -257.41741076 -257.73186898 -249.01183869
  -259.4139584 ]
 ...
 [-335.98118079 -329.41300567 -327.11515991 -313.94336048 -331.97032548
  -329.65756994]
 [-145.4951275  -137.72866481 -144.74182277 -145.60474022 -142.52171391
  -144.52220173]
 [-333.80902103 -347.65448754 -349.60287578 -349.97133151 -351.62657002
  -347.95865049]]
Accuracy: 96.08%


In [52]:
predicted_labels = [index_to_label[pred] for pred in predictions]
print(predicted_labels)
print(len(predicted_labels))
with open(output_path, 'w') as f:
    for label in predicted_labels:
        f.write(f"{label}\n")

['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'false', 'mostly-true', 'barely-true', 'half-true', 'true', 'barely-true', 'half-true', 'mostly-true', 'false', 'mostly-true', 'mostly-true', 'half-true', 'true', 'false', 'mostly-true', 'half-true', 'false', 'mostly-true', 'half-true', 'barely-true', 'false', 'mostly-true', 'mostly-true', 'true', 'true', 'true', 'pants-fire', 'true', 'false', 'half-true', 'pants-fire', 'pants-fire', 'false', 'half-true', 'pants-fire', 'false', 'pants-fire', 'true', 'barely-true', 'barely-true', 'false', 'true', 'barely-true', 'half-true', 'half-true', 'barely-true', 'false', 'mostly-true', 'false', 'mostly-true', 'true', 'mostly-true', 'false', 'pants-fire', 'true', 'false', 'barely-true', 'barely-true', 'true', 'barely-true', 'mostly-true', 'mostly-true', 'mostly-true', 'true', 'false', 'barely-true', 'false', 'half-true', 'barely-true', 'half-true', 

In [53]:
file= 'checker_files/multinomial_bigrams_probas_train.npy'
given= np.load(file)
print(np.log(given))
argmax_indices = np.argmax(given, axis=1)
argmax_indices= [index_to_label[z] for z in argmax_indices]
print(argmax_indices)
print(len(argmax_indices))
print(argmax_indices==predicted_labels)
# for i in range(10269):
#     if argmax_indices[i]!= predicted_labels[i]:
#         print(argmax_indices[i])
#         print(i)
#         print(predicted_labels[i])

[[-185.45151938 -172.99983292 -181.72029755 -181.94302542 -185.22959393
  -184.65105993]
 [-301.81531162 -291.80457439 -291.10050719 -281.73105164 -293.11483803
  -295.40259494]
 [-267.46008916 -256.48833336 -257.41741076 -257.73186898 -249.01183869
  -259.4139584 ]
 ...
 [-335.98118079 -329.41300567 -327.11515991 -313.94336048 -331.97032548
  -329.65756994]
 [-145.4951275  -137.72866481 -144.74182277 -145.60474022 -142.52171391
  -144.52220173]
 [-333.80902103 -347.65448754 -349.60287578 -349.97133151 -351.62657002
  -347.95865049]]
['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'false', 'mostly-true', 'barely-true', 'half-true', 'true', 'barely-true', 'half-true', 'mostly-true', 'false', 'mostly-true', 'mostly-true', 'half-true', 'true', 'false', 'mostly-true', 'half-true', 'false', 'mostly-true', 'half-true', 'barely-true', 'false', 'mostly-true', 'mostly-true', 'true', 'true', '

  print(np.log(given))


In [54]:
test_df= pd.read_csv(test_data_path,header=None,sep='\t',quoting=3)
test_x= train_df[2]
test_y= train_df[1]

In [55]:
def create_test_feature_matrix(processed_texts, word_to_index):
    vocab_size = len(word_to_index)
    
    feature_matrix = np.zeros((len(processed_texts), vocab_size), dtype=int)
    
    for i, text in enumerate(processed_texts):
        for word in text:
            if word in word_to_index:
                feature_matrix[i, word_to_index[word]] += 1  # Count occurrences of each word
    
    return feature_matrix

test_x= test_x.apply(preprocess_text)
X_test = create_test_feature_matrix(test_x, word_to_index)

test_predictions = model.predict(X_test)
test_predictions = [index_to_label[pred] for pred in test_predictions]
test_accuracy = np.mean(test_predictions == test_y)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')


[[-185.45151938 -172.99983292 -181.72029755 -181.94302542 -185.22959393
  -184.65105993]
 [-301.81531162 -291.80457439 -291.10050719 -281.73105164 -293.11483803
  -295.40259494]
 [-267.46008916 -256.48833336 -257.41741076 -257.73186898 -249.01183869
  -259.4139584 ]
 ...
 [-335.98118079 -329.41300567 -327.11515991 -313.94336048 -331.97032548
  -329.65756994]
 [-145.4951275  -137.72866481 -144.74182277 -145.60474022 -142.52171391
  -144.52220173]
 [-333.80902103 -347.65448754 -349.60287578 -349.97133151 -351.62657002
  -347.95865049]]
Test Accuracy: 96.08%


In [56]:
print(test_predictions)

['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'false', 'mostly-true', 'barely-true', 'half-true', 'true', 'barely-true', 'half-true', 'mostly-true', 'false', 'mostly-true', 'mostly-true', 'half-true', 'true', 'false', 'mostly-true', 'half-true', 'false', 'mostly-true', 'half-true', 'barely-true', 'false', 'mostly-true', 'mostly-true', 'true', 'true', 'true', 'pants-fire', 'true', 'false', 'half-true', 'pants-fire', 'pants-fire', 'false', 'half-true', 'pants-fire', 'false', 'pants-fire', 'true', 'barely-true', 'barely-true', 'false', 'true', 'barely-true', 'half-true', 'half-true', 'barely-true', 'false', 'mostly-true', 'false', 'mostly-true', 'true', 'mostly-true', 'false', 'pants-fire', 'true', 'false', 'barely-true', 'barely-true', 'true', 'barely-true', 'mostly-true', 'mostly-true', 'mostly-true', 'true', 'false', 'barely-true', 'false', 'half-true', 'barely-true', 'half-true', 

In [57]:
print(test_y)

0              false
1          half-true
2        mostly-true
3              false
4          half-true
            ...     
10264    mostly-true
10265    mostly-true
10266      half-true
10267          false
10268     pants-fire
Name: 1, Length: 10269, dtype: object


In [58]:
valfile= 'checker_files/multinomial_bigrams_probas_test.npy'
givenval= np.load(valfile)
print(np.log(givenval))
argmax_indices2 = np.argmax(given, axis=1)
argmax_indices2= [index_to_label[z] for z in argmax_indices2]
print(argmax_indices2)
print(len(argmax_indices2))
print(argmax_indices2==test_predictions)

[[ -52.60324827  -49.68910516  -50.08264455  -46.89853261  -47.73201329
   -49.42684004]
 [ -95.5744476   -92.2779949   -92.00834707  -91.19961939  -92.63831329
   -91.18848751]
 [-198.8643237  -189.30315815 -190.39187846 -189.03333272 -188.60652883
  -191.05231262]
 ...
 [-135.20597765 -127.49292849 -128.85855218 -124.44163017 -127.08422734
  -125.63551499]
 [-285.42185396 -268.98638741 -270.02650187 -266.70150875 -271.31862751
  -271.98687459]
 [-135.787149   -120.51690182 -126.35418497 -122.18215722 -123.95478606
  -126.69968302]]
['false', 'half-true', 'mostly-true', 'false', 'half-true', 'true', 'barely-true', 'half-true', 'half-true', 'mostly-true', 'mostly-true', 'half-true', 'false', 'mostly-true', 'barely-true', 'half-true', 'true', 'barely-true', 'half-true', 'mostly-true', 'false', 'mostly-true', 'mostly-true', 'half-true', 'true', 'false', 'mostly-true', 'half-true', 'false', 'mostly-true', 'half-true', 'barely-true', 'false', 'mostly-true', 'mostly-true', 'true', 'true', '

In [59]:
for i in range(10269):
    if argmax_indices2[i]!= test_predictions[i]:
        print(i)

4206
7292
9991
