In [81]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,  StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [24]:
columns = ["id", "label", "statement", "subject", "speaker", "speaker_job", "state", "party", "barely_true", "false", "half_true", "mostly_true", "pants_on_fire", "context"]

In [68]:
test = pd.read_csv('liar_dataset/test.tsv', sep='\t', header=0, names=columns)
train = pd.read_csv('liar_dataset/train.tsv', sep='\t',header=0, names=columns)
valid = pd.read_csv('liar_dataset/valid.tsv', sep='\t',header=0, names=columns)

In [26]:
train.head()

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state,party,barely_true,false,half_true,mostly_true,pants_on_fire,context
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece


In [23]:
train.loc[0][2]

'Says the Annies List political group supports third-trimester abortions on demand.'

In [8]:
len(test)

1266

In [11]:
len(train)

10239

In [17]:
len(valid)

1283

In [69]:
train = pd.concat([train, valid])

In [70]:
train.head()

Unnamed: 0,id,label,statement,subject,speaker,speaker_job,state,party,barely_true,false,half_true,mostly_true,pants_on_fire,context
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
4,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece


In [71]:
len(train)

11522

In [72]:
train = train[['statement','label']]

In [37]:
train.head()

Unnamed: 0,statement,label
0,When did the decline of coal start? It started...,half-true
1,"Hillary Clinton agrees with John McCain ""by vo...",mostly-true
2,Health care reform legislation is likely to ma...,false
3,The economic turnaround started at the end of ...,half-true
4,The Chicago Bears have had more starting quart...,true


In [73]:
label_mapping = {"true": 1, "mostly-true": 1, "half-true": 1, "barely-true": 0, "false": 0, "pants-fire": 0}

In [74]:
train.loc[:,'label'] = train['label'].map(label_mapping)

In [75]:
train.head()

Unnamed: 0,statement,label
0,When did the decline of coal start? It started...,1
1,"Hillary Clinton agrees with John McCain ""by vo...",1
2,Health care reform legislation is likely to ma...,0
3,The economic turnaround started at the end of ...,1
4,The Chicago Bears have had more starting quart...,1


In [41]:
train.isna().sum()

statement    0
label        0
dtype: int64

In [96]:
print(train['label'].value_counts())

label
1    6420
0    5102
Name: count, dtype: int64


In [60]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    text = text.strip()
    return text

In [76]:
train.loc[:,'statement'] = train['statement'].apply(clean_text)

In [86]:
train.head()

Unnamed: 0,statement,label
0,when did the decline of coal start it started ...,1
1,hillary clinton agrees with john mccain by vot...,1
2,health care reform legislation is likely to ma...,0
3,the economic turnaround started at the end of ...,1
4,the chicago bears have had more starting quart...,1


In [64]:
(1283 + 1266)/10239

0.2489500927824983

In [91]:
train['label'] = train['label'].astype(int)

In [92]:
X = train['statement']
y = train['label']

print(train['label'].dtype)
print(train['label'].unique())

int64
[1 0]


In [79]:
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [84]:
pipeline = Pipeline([
    ('tfidf',TfidfVectorizer(stop_words='english', max_features=5000)),
    ('classifier', MultinomialNB())
])

In [93]:
cv_scores = cross_val_score(pipeline,X,y,cv=cv,scoring='accuracy')

In [94]:
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")
print(f"Standard Deviation: {cv_scores.std():.2f}")

Cross-validation scores: [0.6        0.60780911 0.609375   0.59244792 0.58984375]
Mean Accuracy: 0.60
Standard Deviation: 0.01
