In [3]:
pip install pandas numpy transformers scikit-learn streamlit


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [6]:
# Download the NLTK resources required for NLP processing:
#   punkt_tab - tokenizer data (presumably what word_tokenize relies on; confirm for your NLTK version)
#   stopwords - corpus behind stopwords.words('english')
#   wordnet   - presumably the data used by WordNetLemmatizer
# The last call is the cell's final expression, so its return value is displayed.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hudas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hudas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hudas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Loading Data

In [8]:
# Column names for the header-less TSV files, in file order
columns = [
    "ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
    "Barely True Count", "False Count", "Half True Count", "Mostly True Count",
    "Pants on Fire Count", "Context",
]

# Read each tab-separated split; header=None keeps the first row as data
train_data = pd.read_csv('train.tsv', sep='\t', header=None)
valid_data = pd.read_csv('valid.tsv', sep='\t', header=None)
test_data = pd.read_csv('test.tsv', sep='\t', header=None)

# Give all three splits the same column names
train_data.columns = columns
valid_data.columns = columns
test_data.columns = columns

In [9]:
# Keep only the Statement and Label columns of each split and drop the rest.
# .copy() makes each result an independent DataFrame, so assigning to its
# columns in later cells does not raise SettingWithCopyWarning.
train_data = train_data[["Statement", "Label"]].copy()
valid_data = valid_data[["Statement", "Label"]].copy()
test_data = test_data[["Statement", "Label"]].copy()

In [None]:
# English stopword list (held as a set) and the WordNet lemmatizer used by preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
    """Clean one statement: lowercase, tokenize, drop stopwords, lemmatize.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` object.

    Args:
        text: The raw statement string.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    # Stopwords are filtered out before the surviving tokens are lemmatized
    lemmas = (lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)
    return " ".join(lemmas)

In [None]:
# Run preprocess_text over the Statement column of every split
train_data["Statement"] = train_data["Statement"].map(preprocess_text)
valid_data["Statement"] = valid_data["Statement"].map(preprocess_text)
test_data["Statement"] = test_data["Statement"].map(preprocess_text)

In [None]:
def map_labels(label):
    """Convert a categorical truthfulness label into a binary class.

    Args:
        label: A string label from the dataset, or an already-mapped 0/1 value.

    Returns:
        0 for 'false', 'barely-true' and 'pants-fire'; 1 for any other string
        label. A non-string value equal to 0 or 1 is returned unchanged (as an
        int), so re-running the mapping cell does not turn every label into 1.
    """
    # Already-mapped values pass through, which keeps the mapping idempotent.
    if not isinstance(label, str) and label in (0, 1):
        return int(label)
    return 0 if label in ('false', 'barely-true', 'pants-fire') else 1

# Replace the categorical Label column of every split with its binary class
train_data['Label'] = train_data['Label'].map(map_labels)
valid_data['Label'] = valid_data['Label'].map(map_labels)
test_data['Label'] = test_data['Label'].map(map_labels)

In [None]:
# TF-IDF vectorizer limited to 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training statements only, then reuse the fitted vectorizer
# to transform the validation and test statements
X_train_tfidf = vectorizer.fit_transform(train_data['Statement'])
X_valid_tfidf, X_test_tfidf = (
    vectorizer.transform(split['Statement']) for split in (valid_data, test_data)
)

In [None]:
# Pull the Label column out of each split as the target vectors
y_train, y_valid, y_test = (
    split['Label'] for split in (train_data, valid_data, test_data)
)

## Implementing traditional Models (Logistic Regression, SVM, Random Forests)

In [None]:
def evaluate_ml_model(model, X_train, X_valid, y_train, y_valid):
    """Fit ``model`` on the training split and print a validation report.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_valid: Validation features.
        y_train: Training labels.
        y_valid: Validation labels.

    Returns:
        None. The classification report is printed, not returned.
    """
    model.fit(X_train, y_train)
    validation_predictions = model.predict(X_valid)
    report = classification_report(y_valid, validation_predictions)
    print(report)

In [None]:
# Logistic Regression
# Print the heading first so it appears above the report it labels.
print("Logistic Regression Results:")

lr_model = LogisticRegression(max_iter=1000)  # allow up to 1000 solver iterations
# Fits on the TF-IDF training features and prints the validation report
evaluate_ml_model(lr_model, X_train_tfidf, X_valid_tfidf, y_train, y_valid)

In [None]:
# importing SVC from the sklearn 
from sklearn.svm import SVC 

In [None]:
# SVM
# Print the heading first so it appears above the report it labels.
print("SVM Results:")

svm_model = SVC()  # support vector classifier from sklearn with default settings
# X_train_tfidf and X_valid_tfidf are the numeric TF-IDF representations of the text
evaluate_ml_model(svm_model, X_train_tfidf, X_valid_tfidf, y_train, y_valid)

In [None]:
# Random Forest
# Print the heading first so it appears above the report it labels.
print("Random Forest Results:")

# Random forest classifier with 100 decision trees; random_state is fixed so the
# reported metrics are reproducible across kernel restarts.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
evaluate_ml_model(rf_model, X_train_tfidf, X_valid_tfidf, y_train, y_valid)

In [None]:
### Implementing and fine-tuning DistilBERT

In [None]:
# Load the pre-trained DistilBERT tokenizer (this line loads the tokenizer, not the model)
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') 
# "uncased" means that it is not case-sensitive

In [None]:
def encode_texts(data, tokenizer, max_length=128):
    """Tokenize a collection of texts into PyTorch-format encodings.

    Args:
        data: Object with a ``tolist()`` method that yields the texts
            (the notebook passes a pandas Series of statements).
        tokenizer: Callable tokenizer that accepts the list of texts plus the
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            keyword arguments.
        max_length: Maximum length passed to the tokenizer (default 128).

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    texts = data.tolist()
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    
# Padding and truncation make sure that all sequences in a call have the same length.
# Truncation cuts off text that is longer than max_length (128 by default) tokens.
# return_tensors="pt" makes sure the output is in PyTorch format, which is a requirement.

In [None]:
# Apply the encode function to all the datasets.
# The tokenizer loaded earlier is named distilbert_tokenizer; no variable called
# `tokenizer` is defined in the notebook's visible cells, so passing it raised NameError.

train_encodings = encode_texts(train_data['Statement'], distilbert_tokenizer)
valid_encodings = encode_texts(valid_data['Statement'], distilbert_tokenizer)
test_encodings = encode_texts(test_data['Statement'], distilbert_tokenizer)