In [1]:
import numpy as np
import pandas as pd
import re
import csv
from googleapiclient.discovery import build


# 1.  API key & Video URL

API_KEY = "AIzaSyC_s-Das41CMY6I4c1aCopzJ53b-9kIG-s"   
video_url = "https://www.youtube.com/watch?v=TZ78SFvWGS0"


# 2. Extract video ID

video_id_match = re.search(r"v=([a-zA-Z0-9_-]+)", video_url)
if video_id_match:
    video_id = video_id_match.group(1)
else:
    raise ValueError("Invalid YouTube URL")

print("Video ID:", video_id)


# 3. Build YouTube API client

youtube = build("youtube", "v3", developerKey=API_KEY)


# 4. Fetch comments + reactions

comments = []
next_page_token = None

while True:
    request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        maxResults=100,
        pageToken=next_page_token,
        textFormat="plainText"
    )
    response = request.execute()

    for item in response["items"]:
        comment_data = item["snippet"]["topLevelComment"]["snippet"]
        comment = comment_data["textDisplay"]
        like_count = comment_data.get("likeCount", 0)  # reaction
        comments.append([comment, like_count])

    next_page_token = response.get("nextPageToken")
    if not next_page_token:
        break

print(f"Fetched {len(comments)} comments")

# ==========================
# 5. Save to CSV
# ==========================
with open("youtube_comments.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["comment", "react"])  # two columns
    writer.writerows(comments)

print("✅ Comments with reactions saved to youtube_comments.csv")


Video ID: TZ78SFvWGS0
Fetched 457 comments
✅ Comments with reactions saved to youtube_comments.csv


In [2]:
data = pd.read_csv("youtube_comments.csv")

In [3]:
#data.duplicate()

In [4]:
data.head(10)

Unnamed: 0,comment,react
0,King Kohli 🗿\nFantastic du plessis 🗿\nRaPa 🗿\n...,0
1,King Kohli k hatters ko itna bolaunga bhai log...,0
2,12:29,0
3,Isi k baad jalkutre ne haath nhi milaya tha,0
4,Thumare jit se jyada hamare har ki charcha hot...,1
5,Csk ki mkc wo bhi 7 baaaarr 😂,1
6,Rcb ❤❤❤❤❤❤,1
7,Let’s all laugh at csk and dhobi 🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣🤣...,0
8,Waara farewell for chappal chor dhobi 🤣🤣🤣🤣🤣🤣🤣🤣...,1
9,This match has more symbolic violence packed i...,1


# Start Preprocessing 

# 01 : Removing HTML tag

In [5]:
def remove_html(text):
    if not isinstance(text, str):   # if it's not a string, make it empty
        return ""
    clean_text = re.sub(r'<[^>]*>', '', text)   # remove HTML
    clean_text = clean_text.replace('\n', ' ')  # remove line breaks
    return clean_text

# Apply to your column
data['comment'] = data['comment'].apply(remove_html)

In [6]:
data['comment']=data['comment'].apply(remove_html)

In [7]:
data

Unnamed: 0,comment,react
0,King Kohli 🗿 Fantastic du plessis 🗿 RaPa 🗿 Gre...,0
1,King Kohli k hatters ko itna bolaunga bhai log...,0
2,12:29,0
3,Isi k baad jalkutre ne haath nhi milaya tha,0
4,Thumare jit se jyada hamare har ki charcha hot...,1
...,...,...
452,Never gonna forget this match ❤🥺rcb,506
453,RCB FOREVER ❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤,15
454,Thanks bto,4
455,#tzsumitrazz,1


In [8]:
data['comment'][0]

'King Kohli 🗿 Fantastic du plessis 🗿 RaPa 🗿 Green bhai 🗿 Dk 🗿  Mad max 🗿 Yash Dayal 🗿'

# 02 : Convert to Lowercase

In [9]:
data['comment']=data['comment'].str.lower()

In [10]:
data

Unnamed: 0,comment,react
0,king kohli 🗿 fantastic du plessis 🗿 rapa 🗿 gre...,0
1,king kohli k hatters ko itna bolaunga bhai log...,0
2,12:29,0
3,isi k baad jalkutre ne haath nhi milaya tha,0
4,thumare jit se jyada hamare har ki charcha hot...,1
...,...,...
452,never gonna forget this match ❤🥺rcb,506
453,rcb forever ❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤,15
454,thanks bto,4
455,#tzsumitrazz,1


# 03 : Remove Url

In [11]:
def remove_url(text):
    return re.sub(r'http\S+|www\S+', '', text)

In [12]:
data['comment']=data['comment'].apply(remove_url)

In [13]:
data['comment'][0]

'king kohli 🗿 fantastic du plessis 🗿 rapa 🗿 green bhai 🗿 dk 🗿  mad max 🗿 yash dayal 🗿'

# 04 : Remove Punctuation Marks 

In [14]:
import string
string.punctuation 

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
exclude=string.punctuation

In [16]:
def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

In [17]:
data['comment']=data['comment'].apply(remove_punc)

In [18]:
data

Unnamed: 0,comment,react
0,king kohli 🗿 fantastic du plessis 🗿 rapa 🗿 gre...,0
1,king kohli k hatters ko itna bolaunga bhai log...,0
2,1229,0
3,isi k baad jalkutre ne haath nhi milaya tha,0
4,thumare jit se jyada hamare har ki charcha hot...,1
...,...,...
452,never gonna forget this match ❤🥺rcb,506
453,rcb forever ❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤,15
454,thanks bto,4
455,tzsumitrazz,1


# 05 : Handling the Slang

In [19]:
slang = {
    "A3": "Anytime, Anywhere, Anyplace",
    "ADIH": "Another Day In Hell",
    "AFK": "Away From Keyboard",
    "AFAIK": "As Far As I Know",
    "ASAP": "As Soon As Possible",
    "ASL": "Age, Sex, Location",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "BAE": "Before Anyone Else",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRUH": "Bro",
    "BRT": "Be Right There",
    "BSAAW": "Big Smile And A Wink",
    "BTW": "By The Way",
    "BWL": "Bursting With Laughter",
    "CSL": "Can’t Stop Laughing",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "DM": "Direct Message",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FIMH": "Forever In My Heart",
    "FOMO": "Fear Of Missing Out",
    "FR": "For Real",
    "FWIW": "For What It's Worth",
    "FYP": "For You Page",
    "FYI": "For Your Information",
    "G9": "Genius",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GMTA": "Great Minds Think Alike",
    "GN": "Good Night",
    "GOAT": "Greatest Of All Time",
    "GR8": "Great!",
    "HBD": "Happy Birthday",
    "IC": "I See",
    "ICQ": "I Seek You",
    "IDC": "I Don’t Care",
    "IDK": "I Don't Know",
    "IFYP": "I Feel Your Pain",
    "ILU": "I Love You",
    "ILY": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMU": "I Miss You",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "IYKYK": "If You Know, You Know",
    "JK": "Just Kidding",
    "KISS": "Keep It Simple, Stupid",
    "L": "Loss",
    "L8R": "Later",
    "LDR": "Long Distance Relationship",
    "LMK": "Let Me Know",
    "LMAO": "Laughing My A** Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "M8": "Mate",
    "MFW": "My Face When",
    "MID": "Mediocre",
    "MRW": "My Reaction When",
    "MTE": "My Thoughts Exactly",
    "NVM": "Never Mind",
    "NRN": "No Reply Necessary",
    "NPC": "Non-Player Character",
    "OIC": "Oh I See",
    "OP": "Overpowered",
    "PITA": "Pain In The A**",
    "POV": "Point Of View",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A** Off",
    "RN": "Right Now",
    "SK8": "Skate",
    "STATS": "Your Sex And Age",
    "SUS": "Suspicious",
    "TBH": "To Be Honest",
    "TFW": "That Feeling When",
    "THX": "Thank You",
    "TIME": "Tears In My Eyes",
    "TLDR": "Too Long, Didn’t Read",
    "TNTL": "Trying Not To Laugh",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "W": "Win",
    "W8": "Wait...",
    "WB": "Welcome Back",
    "WTF": "What The F**k",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "WYD": "What You Doing?",
    "WYWH": "Wish You Were Here",
    "ZZZ": "Sleeping, Bored, Tired"
}




In [20]:
def handle_slang(text):
    new_text=[]
    for word in text.split():
        if word.upper() in slang:
            new_text.append((slang[word.upper()]).lower())
        else:
            new_text.append(word)
    return " ".join(new_text)

In [21]:
text="tbh i don't love you "
text

"tbh i don't love you "

In [22]:
handle_slang(text)

"to be honest i don't love you"

In [23]:
data['comment']= data['comment'].apply(handle_slang)

In [24]:
data

Unnamed: 0,comment,react
0,king kohli 🗿 fantastic du plessis 🗿 rapa 🗿 gre...,0
1,king kohli k hatters ko itna bolaunga bhai log...,0
2,1229,0
3,isi k baad jalkutre ne haath nhi milaya tha,0
4,thumare jit se jyada hamare har ki charcha hot...,1
...,...,...
452,never gonna forget this match ❤🥺rcb,506
453,rcb forever ❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤❤,15
454,thanks bto,4
455,tzsumitrazz,1


# 06 : Spell correction

In [25]:
from textblob import TextBlob

In [26]:
#data['comment']=data['comment'].apply(lambda x : str(TextBlob(x).correct()))

# 07 : Remove Stop word

In [27]:
from nltk.corpus import stopwords

In [28]:
def remove_stopword(text):
    new_text=[]
    for word in text.split():
        if word in stopwors.words("english"):
            new_text.append('')
        else:
            new_text.append(word)
    return " ".join(new_text)
    

In [29]:
data['comment']=data['comment'].apply(remove_stopwords)

NameError: name 'remove_stopwords' is not defined

# 08 : Keeping only english sentences 

In [None]:
from nltk.corpus import words
import nltk

In [None]:
nltk.download('words')

In [None]:
english_words = set(words.words())

In [None]:
len(english_words)

In [None]:
def is_pure_english(text, threshold=0.7):
    tokens=text.split()
    if len(tokens) == 0:
        return False
    english_count= sum(1 for w in tokens if w in english_words)
    return (english_count/len(tokens))>=threshold

In [None]:
data = data[data['comment'].apply(is_pure_english)].reset_index(drop=True)

In [None]:
data

# 09 : Handling Emoji

In [None]:
import emoji

In [None]:
def emoji_to_text(text):
    text = emoji.demojize(text)
    text = text.replace(":"," ")
    return text

In [None]:
data['comment']=data['comment'].apply(emoji_to_text)

In [None]:
data

# 10 : Tokenization

In [None]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [None]:
data['tokens'] = data['comment'].apply(lambda x : [token.text for token in nlp(x)])

In [None]:
data

# 11 : Stemming 

In [None]:
from nltk.stem import PorterStemmer



In [None]:
stemmer = PorterStemmer()

In [None]:
data['Stemmed'] = data['tokens'].apply(lambda x : [stemmer.stem(word) for word in x])

In [None]:
data

# 11 : Vectorization with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tf=TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=3, max_df=0.9)

In [None]:
data['Stemmed_text'] = data['Stemmed'].apply(lambda x : " ".join(x))

In [None]:
data

In [None]:
data['Stemmed_text'][0]

In [None]:
x = tf.fit_transform(data['Stemmed_text'])

In [None]:
x.toarray()

In [None]:
df = pd.DataFrame(x.toarray(),columns=tf.get_feature_names_out())

In [None]:
X = df.copy()

X

In [None]:
# Add React_count as a new feature
X['React_count'] = data['react']

In [None]:
X

# 12 : Scaling 

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
X_scaled=pd.DataFrame(X_scaled,columns=scaler.get_feature_names_out())

In [None]:
X_scaled

# 13 : ML Model Building

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Finding the best value for K


In [None]:
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X_scaled)

# Cluster labels
data['cluster'] = kmeans.labels_

print(data[['Stemmed_text', 'react', 'cluster']].head())

In [None]:
data['cluster'].value_counts()

In [None]:
data[data['cluster']==0]

In [None]:
data[data['cluster']==3]

In [None]:
data[data['cluster']==1]

In [None]:
data[data['cluster']==2]

In [None]:
data[data['cluster']==5]

In [None]:
data[data['cluster']==4]