In [1]:
pip install pandas numpy transformers scikit-learn streamlit datasets accelerate


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [22]:
# Download required resources for NLP processing
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hudas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hudas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hudas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Loading Data

In [23]:
train_data = pd.read_csv('train.tsv', sep='\t', header=None) # reads the .csv files
valid_data = pd.read_csv('valid.tsv', sep='\t', header=None)
test_data = pd.read_csv('test.tsv', sep='\t', header=None)

columns = ["ID", "Label", "Statement", "Subject", "Speaker", "Job Title", "State", "Party",
           "Barely True Count", "False Count", "Half True Count", "Mostly True Count",
           "Pants on Fire Count", "Context"]
train_data.columns = valid_data.columns = test_data.columns = columns # defines the columns that are in the dataset

In [24]:
#selects onky the statement and label colums from the datasets and removes all others, 
#this helps filter out unnessary coloumns 
train_data = train_data[["Statement", "Label"]]
valid_data = valid_data[["Statement", "Label"]]
test_data = test_data[["Statement", "Label"]]

In [25]:
# Initializing the lemmatizer and gets stopword list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [26]:
def preprocess_text(text):
    # Tokenizes the sentences
    words = word_tokenize(text.lower())  # converts to lowercase and tokenize
    
    filtered_tokens = [word for word in words if word not in stop_words]
    
    # Removes stopwords and applys lemmatization
    processed_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    # Rejoins words into a cleaned sentence
    return " ".join(processed_words)

In [27]:
# Applying the preprocessing to the datasets
train_data["Statement"] = train_data["Statement"].apply(preprocess_text)
valid_data["Statement"] = valid_data["Statement"].apply(preprocess_text)
test_data["Statement"] = test_data["Statement"].apply(preprocess_text)

In [28]:
def map_labels(label): # only choosing the label argument to convert categorical labels to numerical values
    return 0 if label in ['false', 'barely-true', 'pants-fire'] else 1 # returns 0 if the label is false otherwise return 1

train_data['Label'] = train_data['Label'].apply(map_labels)
valid_data['Label'] = valid_data['Label'].apply(map_labels)
test_data['Label'] = test_data['Label'].apply(map_labels)

In [29]:
vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to 5000 features 

# Fit and transform training data
X_train_tfidf = vectorizer.fit_transform(train_data['Statement'])
X_valid_tfidf = vectorizer.transform(valid_data['Statement'])
X_test_tfidf = vectorizer.transform(test_data['Statement'])

In [30]:
#extracts the the label coloum and adds that to the y_train y_valid and y_test 
y_train = train_data['Label']
y_valid = valid_data['Label']
y_test = test_data['Label']

In [31]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [32]:
def evaluate_ml_model(model, X_train, X_valid, y_train, y_valid, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    accuracy = accuracy_score(y_valid, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_valid, y_pred, average='weighted')

    return {

        "Model": model_name,
        "Accuracy": accuracy * 100,
        "Precision": precision * 100,
        "Recall": recall * 100,
        "F1 Score": f1 * 100
    }

## Implementing traditional Models (Logistic Regression, SVM, Random Forests)

In [33]:
ml_results = []

In [34]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000) # this ensures that it goes through enough iterations
ml_results.append(evaluate_ml_model(lr_model,
                                    X_train_tfidf,
                                    X_valid_tfidf,
                                    y_train,
                                    y_valid,
                                    "Logistic Regression"))

In [35]:
# importing SVC from the sklearn 
from sklearn.svm import SVC 

In [36]:
# SVM
svm_model = SVC() #creating a svm moddel from sklearn thats helpful in text classification
ml_results.append(evaluate_ml_model(svm_model,
                                    X_train_tfidf,
                                    X_valid_tfidf,
                                    y_train,
                                    y_valid,
                                    "SVM"))


In [37]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100) #this creates random forest classifier 
#with 100 decision trees
ml_results.append(evaluate_ml_model(rf_model,
                                    X_train_tfidf,
                                    X_valid_tfidf,
                                    y_train,
                                    y_valid,
                                    "Random Forest"))

## Implementing and fine-tunning distilBERT

In [38]:
#loading in the pre-trained distilBERT model
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') 
#the uncased means that its not case sensitive

In [39]:
def encode_texts(data, tokenizer, max_length=128): 
    return tokenizer(data.tolist(),
                     padding=True,
                     truncation=True,
                     max_length=max_length,
                     return_tensors="pt")
    
# adding padding and truncation, making sure that all sequences are same length
#trucation set to cut off text that is longer than 128 tokens 
# return_tensor="pt" is used to make sure that the output is in pytorch format whihc is a requirement

In [40]:
#applying the encode function to all the datasets 

train_encodings = encode_texts(train_data['Statement'], distilbert_tokenizer)
valid_encodings = encode_texts(valid_data['Statement'], distilbert_tokenizer)
test_encodings = encode_texts(test_data['Statement'], distilbert_tokenizer)


In [41]:
#importing torch to make sure that its up
import torch 
from torch.utils.data import TensorDataset, DataLoader 
#importing modules are are needed
#to handle datasets and batch loading 

In [42]:
#creating TensorDatasets for training, validations, and testing
train_dataset = TensorDataset(train_encodings['input_ids'],
                              train_encodings['attention_mask'],
                              torch.tensor(train_data['Label'].tolist()))

valid_dataset = TensorDataset(valid_encodings['input_ids'],
                              valid_encodings['attention_mask'],
                              torch.tensor(valid_data['Label'].tolist()))

test_dataset = TensorDataset(test_encodings['input_ids'],
                             test_encodings['attention_mask'],
                             torch.tensor(test_data['Label'].tolist()))

#attention_mask indicated which token is actual word(1) and which is not (0)

In [43]:
#creating DataLoaders for training, validations and testing 
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True) 
# shuffle is used to ensure that the data is shuffled in each epouch for different results
valid_loader = DataLoader(valid_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [44]:
#Loading the distilBERT model 
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model = model.to('cuda') #using GPU 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01) #setting the learning rate of AdamW to be 5e-5
loss_fn = torch.nn.CrossEntropyLoss() #defineing the loss function

In [46]:
import gc
import torch

gc.collect()
#torch.cuda.empty_cache()


114

In [47]:

for epoch in range(10):
    model.train() # seting the model into training mode
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to('cuda') for x in batch] # moves the data to the GPU
        optimizer.zero_grad() # clears the old weights if any 
        outputs = model(input_ids, attention_mask=attention_mask) # passes the input data through the model
        loss = loss_fn(outputs.logits, labels) # shows how accurate the predictions were from the actual labels 
        loss.backward() # learn from the former predictions and adjusts 
        optimizer.step()
    print(f"Epoch {epoch + 1} completed.")

Epoch 1 completed.
Epoch 2 completed.
Epoch 3 completed.
Epoch 4 completed.
Epoch 5 completed.
Epoch 6 completed.
Epoch 7 completed.
Epoch 8 completed.
Epoch 9 completed.
Epoch 10 completed.


In [48]:

# Evaluating the models 
all_predictions = []
all_labels = []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [x.to('cuda') for x in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# the metrics 
accuracy = accuracy_score(all_labels, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted', zero_division=0)


ml_results.append({
    "Model": "DistilBERT",
    "Accuracy": accuracy * 100,
    "Precision": precision * 100,
    "Recall": recall * 100,
    "F1 Score": f1 * 100

})

In [49]:
#Displaying results in a tabular form 

import pandas as pd 

df_results = pd.DataFrame(ml_results)
print(df_results)

                 Model   Accuracy  Precision     Recall   F1 Score
0  Logistic Regression  61.760125  62.084163  61.760125  61.013597
1                  SVM  62.538941  63.425728  62.538941  61.314431
2        Random Forest  60.280374  60.678494  60.280374  59.240332
3           DistilBERT  60.220994  60.602454  60.220994  60.345947
