**LIBRARYS**

In [1]:
%pip install google-api-python-client #API handling
%pip install pandas #Dataframes
%pip install numpy  #Arrays
%pip install scikit-learn #ML tools
%pip install imblearn #imbalance 

from googleapiclient.discovery import build #to call API
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from imblearn.pipeline import Pipeline #chains our models tgt
from xgboost import XGBClassifier #gradient boosted model

from sklearn.feature_extraction.text import TfidfVectorizer #convert to TF-IDF features (higher importance to low occurance)
from sklearn.ensemble import RandomForestClassifier #builds multiple decision trees and combines for better results
from imblearn.over_sampling import SMOTE #Synthetic Minority Over-sampling Technique
from imblearn.under_sampling import RandomUnderSampler #randomly removes from majority

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: C:\Users\JT2ju\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#API': Expected package name at the start of dependency specifier
    #API
    ^


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: C:\Users\JT2ju\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#Dataframes': Expected package name at the start of dependency specifier
    #Dataframes
    ^


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: C:\Users\JT2ju\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#Arrays': Expected package name at the start of dependency specifier
    #Arrays
    ^


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: C:\Users\JT2ju\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#ML': Expected package name at the start of dependency specifier
    #ML
    ^

[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: C:\Users\JT2ju\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#imbalance': Expected package name at the start of dependency specifier
    #imbalance
    ^


Note: you may need to restart the kernel to use updated packages.


*DATASET* 

In [2]:
dataset = pd.read_csv('allcomments_labled.csv') #Change File location if necessary

*STAGE 1 MODEL*

In [3]:
X = dataset['text']
y = dataset['sentiment']

vectorizer = TfidfVectorizer(max_features=30000, max_df=0.7, min_df=5, ngram_range=(1, 3)) #ignores less than 5, more than 70%, and also considers trigrams
X_tfidf = vectorizer.fit_transform(X) #transforms into TF-IDF

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42) #undersample majority
oversample = SMOTE(sampling_strategy='not majority', random_state=42) #oversample minority

pipeline = Pipeline(steps=[('o', oversample), ('u', undersample)]) #chains SMOTE and RUS tgt

X_train_balanced, y_train_balanced = pipeline.fit_resample(X_train, y_train) #balance

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_balanced), y=y_train_balanced)
class_weights_dict = dict(zip(np.unique(y_train_balanced), class_weights)) #dictionary where class is mapped to weight ->adjust learning for each class

xgb_model = XGBClassifier(scale_pos_weight=class_weights_dict, random_state=42) #scale pos weight adjusts to class weights
xgb_model.fit(X_train_balanced, y_train_balanced)

# Evaluation
y_pred = xgb_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.79      0.76       823
           1       0.47      0.43      0.45       213
           2       0.66      0.49      0.56       215
           3       0.54      0.39      0.45        36
           4       0.69      0.71      0.70       219

    accuracy                           0.68      1506
   macro avg       0.62      0.56      0.58      1506
weighted avg       0.67      0.68      0.67      1506

Accuracy: 0.6759628154050464


*API FUNCTION*

In [4]:
api_key = "AIzaSyC5qnX0CLK3CYerum4WpsqopEtnlzQHN-I" #May need to change
youtube = build('youtube', 'v3', developerKey=api_key) #setup API


def fetch_all_comments(video_id, max_results=100): #get all comments for video
    comments = [] #empty array
    next_page_token = None

    while True:
        try:
            response = youtube.commentThreads().list(
                part='snippet', #details on comment
                videoId=video_id,
                pageToken=next_page_token, #next comment
                maxResults=max_results,  #get up to max_results per request
                textFormat='plainText'
            ).execute()
        except Exception as e: #error handling
            print(f"An error occurred: {e}")
            break

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']
            comments.append({
                'author': comment['authorDisplayName'],
                'text': comment['textDisplay'],
                'like_count': comment['likeCount'],
                'published_at': comment['publishedAt'] #details appended to comments and attach to empty array
            })
            #print(f"Added comment from {comment['authorDisplayName']}")

        next_page_token = response.get('nextPageToken') #updates token/node
        if not next_page_token:
            break #ends if last

    return comments


*HELPER FUNCTION*

In [5]:
def get_youtube_video_title(video_id, api_key):
    youtube = build('youtube', 'v3', developerKey=api_key) 
    request = youtube.videos().list(part="snippet", id=video_id)
    response = request.execute()
    
    if response["items"]:
        title = response["items"][0]["snippet"]["title"]
        #print("Video Title:", title)
        return title  # return title
    else:
        print("Video not found.")
        return None
    
def interpret_sentiment(predicted_sentiment):
    sentiment_mapping = {
        1: "pleased",
        2: "funny",
        3: "fear",
        4: "sad"
    }
    
    return sentiment_mapping.get(predicted_sentiment, "Unknown sentiment") #matches each number to sentiment

**FINAL CODE**


In [6]:
input_file_path = r'trainingimproved.csv' #change file location if necessary
df = pd.read_csv(input_file_path)

X = df[['Count_0', 'Count_1', 'Count_2', 'Count_3', 'Count_4']]  
y = df['Actual Sentiment'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42, class_weight='balanced') #random forest model

rf_model.fit(X_train, y_train) #training data

y_pred = rf_model.predict(X_test) #tests on testing data (from train test split)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Process video ID and use RF model
def predict_final_sentiment(video_id):
    comments = fetch_all_comments(video_id)

    comments_df = pd.DataFrame(comments)
    sorted_comments = comments_df.sort_values(by='like_count', ascending=False) #sorts comments by like value
    valid_predictions = []

    for index, row in sorted_comments.iterrows():
        if len(valid_predictions) == 30:  #only take top 30
            break
        
        text = row['text'] #extract text
        text_tfidf = vectorizer.transform([text]) #TF-IDF
        prediction = xgb_model.predict(text_tfidf)[0] #predict each comment
        if 0 <= prediction <= 4:
            valid_predictions.append(prediction) 

    prediction_counts = Counter(valid_predictions)
    counts = [prediction_counts.get(i, 0) for i in range(0, 5)] #count each and add to array
    predicted_sentiment = rf_model.predict([counts])[0]  #Final Predicted based on array
    
    return predicted_sentiment

# CHANGE THIS
video_id = 'nIHyr_fp_yI'  

title = get_youtube_video_title(video_id, api_key)
if title:
    predicted_sentiment = predict_final_sentiment(video_id)
    sentiment_label = interpret_sentiment(predicted_sentiment)
    print(f"Predicted sentiment for video '{title}': {sentiment_label}")


Predicted sentiment for video 'Mr Bean Goes Shopping... | Mr Bean Live Action | Funny Clips | Mr Bean': funny
