In [13]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,  StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
from analyser import Analyser

# Load and transform data

In [4]:
# Helper object from the `analyser` module; later cells call prep_liar_data() on it
# and read the prepared splits from its .train / .test attributes.
liar_analyser = Analyser()

In [5]:
# Prepare the LIAR splits from the three raw TSV files. The call is left as the
# cell's last expression, so whatever it returns is echoed as the cell output.
liar_analyser.prep_liar_data(
    'liar_dataset/train.tsv',
    'liar_dataset/valid.tsv',
    'liar_dataset/test.tsv',
)

2025-03-12 17:16:11,884 - analyser - INFO - Training data shape: (11524, 2)
2025-03-12 17:16:11,886 - analyser - INFO - Test data shape: (1267, 2)
2025-03-12 17:16:11,893 - analyser - INFO - Label distribution in training: label
1    0.557098
0    0.442902
Name: proportion, dtype: float64


(                                               statement  label
 0      says the annies list political group supports ...      0
 1      when did the decline of coal start it started ...      1
 2      hillary clinton agrees with john mccain by vot...      1
 3      health care reform legislation is likely to ma...      0
 4      the economic turnaround started at the end of ...      1
 ...                                                  ...    ...
 11519  for the first time in more than a decade impor...      1
 11520  says donald trump has bankrupted his companies...      1
 11521  john mccain and george bush have absolutely no...      1
 11522  a new poll shows 62 percent support the presid...      0
 11523  no one claims the report vindicating new jerse...      0
 
 [11524 rows x 2 columns],
                                               statement  label
 0     building a wall on the u s mexico border will ...      1
 1     wisconsin is on pace to double the number of l...      0

In [6]:
# Bind the analyser's prepared splits to short notebook-level names.
train = liar_analyser.train
test = liar_analyser.test

# Sanity-check the transformed data

In [9]:
# Row count of the training split.
len(train)

11524

In [8]:
# Row count of the test split.
len(test)

1267

In [10]:
# Missing values per column in the training split.
train.isna().sum()

statement    0
label        0
dtype: int64

In [11]:
# Missing values per column in the test split.
test.isna().sum()

statement    0
label        0
dtype: int64

In [12]:
# Class balance of each split: training counts first, then test counts.
print(
    train['label'].value_counts(),
    test['label'].value_counts(),
    sep='\n',
)

label
1    6420
0    5104
Name: count, dtype: int64
label
1    714
0    553
Name: count, dtype: int64


# Lemmatisation

In [17]:
# English stop words held in a set: lemmatise() tests every token for membership.
stop_words = set(stopwords.words('english'))
# One lemmatiser instance, created once and reused by lemmatise() for every statement.
lemmatiser = WordNetLemmatizer()

In [18]:
def lemmatise(text):
    """Reduce a statement to a space-joined string of lemmatised content words.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphanumeric, or that appear in the notebook-level
    ``stop_words`` set, are discarded. Each remaining token is passed to
    ``lemmatiser.lemmatize``.

    Args:
        text: The raw statement as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces, in their original order;
        an empty string if no token survives the filters.

    Note:
        ``lemmatize`` is called without a part-of-speech tag, so the
        lemmatiser's default applies. The ``train.head()`` output that follows
        shows inflected verb forms such as "started" and "agrees" left
        unchanged.
    """
    kept_lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation / mixed-symbol tokens and stop words before lemmatising.
        if not token.isalnum() or token in stop_words:
            continue
        kept_lemmas.append(lemmatiser.lemmatize(token))
    return " ".join(kept_lemmas)

In [19]:
# Replace the raw statements with their lemmatised form in both splits.
# NOTE(review): this overwrites the 'statement' column in place, so re-running
# this cell feeds already-lemmatised text through lemmatise() again and the
# original text is only recoverable by re-running the data-preparation cells.
train['statement'] = train['statement'].apply(lemmatise)
test['statement'] = test['statement'].apply(lemmatise)

In [20]:
# Spot-check the lemmatised statements.
train.head()

Unnamed: 0,statement,label
0,say annies list political group support third ...,0
1,decline coal start started natural gas took st...,1
2,hillary clinton agrees john mccain voting give...,1
3,health care reform legislation likely mandate ...,0
4,economic turnaround started end term,1


# Train/test split & cross-validation