In [43]:
# import youtube comment scraper
from comment_scraper import comment_scraper

# Preprocess Data
import pandas as pd
import numpy as np
import re
import string
import nltk
import torch
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download("vader_lexicon")
nltk.download("stopwords")
nltk.download("punkt")
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
# Build Model for sentiment analysis
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Instantiate Class and place Video that I want to scrape comments from

In [44]:
comment_scraper = comment_scraper(videoId = "jb_lnAvZSa4")

# Loading Data

In [45]:
df = pd.read_pickle("youtube_comments_df.pkl")

In [46]:
df.head()

Unnamed: 0,author,updated_at,like_count,text,public
0,@joeconley2023,2024-02-04T07:41:58Z,0,This game was just a pure failure by the raven...,True
1,@JustinUrgo,2024-02-04T07:35:41Z,0,How many times mahomes gotta get tackled for t...,True
2,@Jaem-ml4lx,2024-02-04T05:11:10Z,0,Taunting after somebody tackles you,True
3,@faafouinatsai8826,2024-02-04T03:45:46Z,0,Titan fan 😂,True
4,@Prettyfresh20,2024-02-04T02:37:24Z,0,"After watching this game, I will say a few thi...",True


# Preprocessing Text

In [47]:
df = df.drop(columns = ["author", "updated_at", "like_count", "public"], axis = 1)
df.head()

Unnamed: 0,text
0,This game was just a pure failure by the raven...
1,How many times mahomes gotta get tackled for t...
2,Taunting after somebody tackles you
3,Titan fan 😂
4,"After watching this game, I will say a few thi..."


In [48]:
# Removing punctuations and special characters
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text
df["text"] = df["text"].apply(lambda x: preprocess(x))

In [49]:
# Tokenize, stem, lemmatize text and remove stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(text):
    stemmer = PorterStemmer()
    tokens = text.split()
    tokens = [word for word in tokens if word.lower() not in stop_words]
    stemmed_text = [stemmer.stem(token) for token in tokens]
    lemmatized_text = [stemmer.stem(token) for token in stemmed_text]
    return lemmatized_text
df["text"] = df["text"].apply(lambda x: remove_stopwords(x))

# Labelling the data using nltk

In [50]:
sia = SIA()
def get_sentiment(tokens):
    text = " ".join(tokens)
    scores = sia.polarity_scores(text)
    if scores["compound"] >= 0.05:
        return "Positive"
    elif scores["compound"] <= 0.05:
        return "Negative"
    else:
        return "Neutral"

In [51]:
df["sentiment"] = df["text"].apply(lambda x: get_sentiment(x))

In [52]:
df["sentiment"].value_counts()

sentiment
Negative    2575
Positive    2135
Name: count, dtype: int64

# Train Test Split

In [53]:
df["text"] = df["text"].apply(lambda x: " ".join(x))

In [54]:
sentiment_replace_map = {"Negative": 0, "Positive": 1}
df["sentiment"] = df["sentiment"].replace(sentiment_replace_map)

  df["sentiment"] = df["sentiment"].replace(sentiment_replace_map)


In [55]:
df.head()

Unnamed: 0,text,sentiment
0,game pure failur raven defen play good play ma...,1
1,mani time mahom gotta get tackl ref blow whistl,0
2,taunt somebodi tackl,0
3,titan fan,1
4,watch game say thing biggest thing mahom posse...,1


In [56]:
X = df["text"]
y = df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [57]:
X_train.head()

2651                                         ref get paid
4397    lamar nutrid trash josh allen better energi bo...
910                                    tf even swift girl
124        christ sport beyond bore stop start stop start
2955      surpri raven alway lose typic raven fashion sad
Name: text, dtype: object

# Text Vectorization

In [58]:
# Encode Text To Numbers using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

# Build Baseline Model

In [59]:
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train, y_train)
y_pred = multinomial_nb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.79      0.80       760
           1       0.76      0.80      0.78       653

    accuracy                           0.79      1413
   macro avg       0.79      0.79      0.79      1413
weighted avg       0.79      0.79      0.79      1413



# Logistic Regression Model

In [60]:
# Base LogReg Model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       760
           1       0.87      0.77      0.82       653

    accuracy                           0.84      1413
   macro avg       0.85      0.84      0.84      1413
weighted avg       0.84      0.84      0.84      1413



In [61]:
# Tuned LogReg Model
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = dict(solver=solvers,penalty=penalty,C=c_values)
grid_search = GridSearchCV(estimator=logreg, param_grid=grid, n_jobs=-1, cv=5, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
print(grid_result.best_params_)

{'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}


In [62]:
logreg = LogisticRegression(C = 10, penalty = "l2", solver = "lbfgs")
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.89      0.89       760
           1       0.87      0.86      0.87       653

    accuracy                           0.88      1413
   macro avg       0.88      0.88      0.88      1413
weighted avg       0.88      0.88      0.88      1413



# Decision Tree Classifier

In [63]:
# Base Dtree Model
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.84      0.86       760
           1       0.82      0.86      0.84       653

    accuracy                           0.85      1413
   macro avg       0.85      0.85      0.85      1413
weighted avg       0.85      0.85      0.85      1413



In [64]:
# Tuned DTree Model
criterion = ["gini", "entropy"]
max_depth = np.arange(1, 21).tolist()[0::2]
min_samples_split = np.arange(2, 11).tolist()[0::2]
max_leaf_nodes = np.arange(3, 26).tolist()[0::2]
grid = dict(criterion = criterion, max_depth = max_depth, min_samples_split = min_samples_split, max_leaf_nodes = max_leaf_nodes)
grid_search = HalvingGridSearchCV(dtree, param_grid= grid, n_jobs = -1, cv = 5, scoring = "accuracy", error_score=0)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

{'criterion': 'entropy', 'max_depth': 19, 'max_leaf_nodes': 23, 'min_samples_split': 6}


In [65]:
dtree_tuned = DecisionTreeClassifier(criterion = "gini", max_depth = 19, max_leaf_nodes = 23, min_samples_split = 6)
dtree_tuned.fit(X_train, y_train)
y_pred = dtree_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.85      0.86       760
           1       0.83      0.84      0.84       653

    accuracy                           0.85      1413
   macro avg       0.85      0.85      0.85      1413
weighted avg       0.85      0.85      0.85      1413



# Random Forest Classifier

In [66]:
# Base Random Forest Classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87       760
           1       0.83      0.88      0.85       653

    accuracy                           0.86      1413
   macro avg       0.86      0.86      0.86      1413
weighted avg       0.86      0.86      0.86      1413



In [67]:
n_estimators = [25, 50, 100, 150]
max_features = ["sqrt", "log2", None]
max_depth = [3, 6, 9]
max_leaf_nodes = [3, 6, 9]
grid = dict(n_estimators=n_estimators,max_features=max_features, max_depth = max_depth, max_leaf_nodes = max_leaf_nodes)
grid_search = RandomizedSearchCV(estimator=rf, param_distributions=grid, n_iter = 20, n_jobs=-1, cv=5, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
print(grid_result.best_params_)

{'n_estimators': 25, 'max_leaf_nodes': 9, 'max_features': None, 'max_depth': 9}


In [68]:
# Tuned Random Forest Classifier
rf_tuned = RandomForestClassifier(n_estimators = 25, max_leaf_nodes = 6, max_features = None, max_depth = 6)
rf_tuned.fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.90      0.79       760
           1       0.83      0.55      0.66       653

    accuracy                           0.74      1413
   macro avg       0.77      0.73      0.73      1413
weighted avg       0.76      0.74      0.73      1413



# XGBoost Classifier

In [69]:
# Base XGB Classifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87       760
           1       0.85      0.85      0.85       653

    accuracy                           0.86      1413
   macro avg       0.86      0.86      0.86      1413
weighted avg       0.86      0.86      0.86      1413



In [70]:
min_child_weight = [1, 5, 10, 20]
gamma = [0.5, 1, 1.5]
subsample = [0.6, 0.8, 1.0]
colsample_bytree = [0.6, 0.8, 1.0]
max_depth = [3, 4, 5]
n_estimators = [50, 100, 200]
learning_rate = [0.01, 0.05, 0.1]
grid = dict(min_child_weight = min_child_weight, gamma = gamma, subsample = subsample, colsample_bytree = colsample_bytree, max_depth = max_depth, learning_rate = learning_rate, n_estimators= n_estimators)
grid_search = HalvingGridSearchCV(estimator=xgb, param_grid=grid, n_jobs=-1, cv=5, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
print(grid_result.best_params_)

{'colsample_bytree': 0.6, 'gamma': 1.5, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.6}


In [71]:
# Tuned XGBoost Classifier
xgb_tuned = XGBClassifier(colsample_bytree = 0.6, gamma = 1.5, learning_rate = 0.1, max_depth = 5, min_child_weight = 1, n_estimators = 300, subsample = 1.0)
xgb_tuned.fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       760
           1       0.86      0.85      0.86       653

    accuracy                           0.87      1413
   macro avg       0.87      0.87      0.87      1413
weighted avg       0.87      0.87      0.87      1413



# Support Vector Machine 

In [72]:
# Base Support Vector Machine
svc= SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86       760
           1       0.86      0.79      0.82       653

    accuracy                           0.84      1413
   macro avg       0.85      0.84      0.84      1413
weighted avg       0.85      0.84      0.84      1413



In [73]:
C = [0.1, 1, 10, 100, 1000]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ["rbf", "linear"]
grid = dict(C = C, gamma = gamma, kernel = kernel)
grid_search = HalvingGridSearchCV(estimator=svc, param_grid=grid, n_jobs=-1, cv=5, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
print(grid_result.best_params_)

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}


In [74]:
# Tuned Support Vector Machine
tuned_svc= SVC(C = 100, gamma = 0.1, kernel = "rbf")
tuned_svc.fit(X_train, y_train)
y_pred = tuned_svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87       760
           1       0.84      0.86      0.85       653

    accuracy                           0.86      1413
   macro avg       0.86      0.86      0.86      1413
weighted avg       0.86      0.86      0.86      1413



# Bert Model

In [33]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
X = list(df["text"])
y = list(df["sentiment"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
X_train_tokenized = tokenizer(X_train, padding = True, truncation = True, max_length = 512)
X_test_tokenized = tokenizer(X_test, padding = True, truncation = True, max_length = 512)

In [35]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels = None):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.encodings["input_ids"])

In [36]:
train_dataset = Dataset(X_train_tokenized, y_train)
test_dataset = Dataset(X_test_tokenized, y_test)

In [37]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis = 1)

    accuracy = accuracy_score(y_true = labels, y_pred = pred)
    recall = recall_score(y_true = labels, y_pred = pred)
    precision = precision_score(y_true = labels, y_pred = pred)
    f1 = f1_score(y_true = labels, y_pred = pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [38]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  

In [39]:
args = TrainingArguments(
    output_dir = "output",
    num_train_epochs = 1,
    per_device_train_batch_size = 8,
)
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    compute_metrics = compute_metrics
)

In [40]:
trainer.train()

  0%|          | 0/413 [00:00<?, ?it/s]

100%|██████████| 413/413 [1:07:41<00:00,  9.84s/it]

{'train_runtime': 4061.8934, 'train_samples_per_second': 0.812, 'train_steps_per_second': 0.102, 'train_loss': 0.4385557013042903, 'epoch': 1.0}





TrainOutput(global_step=413, training_loss=0.4385557013042903, metrics={'train_runtime': 4061.8934, 'train_samples_per_second': 0.812, 'train_steps_per_second': 0.102, 'train_loss': 0.4385557013042903, 'epoch': 1.0})

In [41]:
trainer.evaluate()

100%|██████████| 177/177 [01:52<00:00,  1.57it/s]

<class 'transformers.trainer_utils.EvalPrediction'>





{'eval_loss': 0.3409484624862671,
 'eval_accuracy': 0.8925035360678925,
 'eval_precision': 0.8653001464128843,
 'eval_recall': 0.9078341013824884,
 'eval_f1': 0.8860569715142429,
 'eval_runtime': 113.2985,
 'eval_samples_per_second': 12.48,
 'eval_steps_per_second': 1.562,
 'epoch': 1.0}

In [42]:
trainer.save_model("Model_1")