<a href="https://colab.research.google.com/github/Manavbangotra/YT_VIDEO_Sentiment_Analysis/blob/main/YT_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
import os
import json
import csv
from google.oauth2 import service_account
from googleapiclient.discovery import build

# Path to your service account JSON key file
credentials_path = '/content/solar-modem-352814-474d4df85ee5.json'

# Define the YouTube video ID you want to scrape comments from
video_id = 'EIT6U6j4zN8'

# Authenticate using the JSON key file
credentials = service_account.Credentials.from_service_account_file(
    credentials_path, scopes=['https://www.googleapis.com/auth/youtube.force-ssl']
)

# Create a YouTube Data API client
youtube = build('youtube', 'v3', credentials=credentials)

# Function to retrieve comments from a video
def get_video_comments(youtube, **kwargs):
    comments = []
    results = youtube.commentThreads().list(**kwargs).execute()

    while results:
        for item in results['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)

        # Retrieve the next page of comments, if available
        kwargs['pageToken'] = results.get('nextPageToken')
        if kwargs['pageToken']:
            results = youtube.commentThreads().list(**kwargs).execute()
        else:
            break

    return comments

# Call the function to retrieve comments
video_comments = get_video_comments(
    youtube, part='snippet', videoId=video_id, textFormat='plainText'
)

# Specify the CSV file path where you want to save the comments
csv_file_path = 'comments.csv'

# Save the comments to the CSV file
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Comment']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for comment in video_comments:
        writer.writerow({'Comment': comment})

print(f'Comments have been saved to {csv_file_path}')


Comments have been saved to comments.csv


In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os

# Import functions for data preprocessing & data preparation
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string
from string import punctuation
import nltk
import re

In [58]:
data1 = pd.read_csv('/content/comments.csv')
data.columns

Index(['Comment'], dtype='object')

In [59]:
nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()
data1["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in data1["Comment"]]
data1["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in data1["Comment"]]
data1["Neutral"] = [sentiments.polarity_scores(i)["neu"] for i in data1["Comment"]]
data1['Compound'] = [sentiments.polarity_scores(i)["compound"] for i in data1["Comment"]]
score = data1["Compound"].values
sentiment = []
for i in score:
    if i >= 0.05 :
        sentiment.append('Positive')
    elif i <= -0.05 :
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')
data1["Sentiment"] = sentiment
data1.head()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Comment,Positive,Negative,Neutral,Compound,Sentiment
0,This video is actually very close to my heart....,0.133,0.031,0.837,0.7181,Positive
1,We miss you legend,0.0,0.348,0.652,-0.1531,Negative
2,REST IN PEACE OUR LEGEND SUSHANT SINGH RAJPUT❤,0.333,0.0,0.667,0.5423,Positive
3,This is not edited by pen its edited by pain 💔,0.0,0.268,0.732,-0.5106,Negative
4,Sry bro from starting I saw his every film as ...,0.156,0.154,0.69,-0.2382,Negative


In [60]:
data2=data1.drop(['Positive','Negative','Neutral','Compound'],axis=1)
data2.head()

Unnamed: 0,Comment,Sentiment
0,This video is actually very close to my heart....,Positive
1,We miss you legend,Negative
2,REST IN PEACE OUR LEGEND SUSHANT SINGH RAJPUT❤,Positive
3,This is not edited by pen its edited by pain 💔,Negative
4,Sry bro from starting I saw his every film as ...,Negative


In [64]:
stop_words = stopwords.words('english')
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemer = SnowballStemmer(language="english")
lzr = WordNetLemmatizer()

In [68]:

def text_processing(text):
    # convert text into lowercase
    text = text.lower()

    # remove new line characters in text
    text = re.sub(r'\n',' ', text)

    # remove punctuations from text
    text = re.sub('[%s]' % re.escape(punctuation), "", text)

    # remove references and hashtags from text
    text = re.sub("^a-zA-Z0-9$,.", "", text)

    # remove multiple spaces from text
    text = re.sub(r'\s+', ' ', text, flags=re.I)

    # remove special characters from text
    text = re.sub(r'\W', ' ', text)

    text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])

    # stemming using porter stemmer from nltk package - msh a7sn 7aga - momken: lancaster, snowball
    # text=' '.join([porter_stemmer.stem(word) for word in word_tokenize(text)])
    # text=' '.join([lancaster_stemmer.stem(word) for word in word_tokenize(text)])
    # text=' '.join([snowball_stemer.stem(word) for word in word_tokenize(text)])

    # lemmatizer using WordNetLemmatizer from nltk package
    text=' '.join([lzr.lemmatize(word) for word in word_tokenize(text)])

    return text


In [71]:
nltk.download('omw-1.4')
data_copy = data2.copy()
data_copy.Comment = data_copy.Comment.apply(lambda text: text_processing(text))

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [72]:
le = LabelEncoder()
data_copy['Sentiment'] = le.fit_transform(data_copy['Sentiment'])

In [73]:
processed_data = {
    'Sentence':data_copy.Comment,
    'Sentiment':data_copy['Sentiment']
}

processed_data = pd.DataFrame(processed_data)
processed_data.head()

Unnamed: 0,Sentence,Sentiment
0,video actually close heart editor felt emotion...,2
1,miss legend,0
2,rest peace legend sushant singh rajput,2
3,edited pen edited pain,0
4,sry bro starting saw every film normal hero en...,0


In [74]:
processed_data['Sentiment'].value_counts()

1    690
2    469
0    221
Name: Sentiment, dtype: int64

In [76]:
df_neutral = processed_data[(processed_data['Sentiment']==1)]
df_negative = processed_data[(processed_data['Sentiment']==0)]
df_positive = processed_data[(processed_data['Sentiment']==2)]

# upsample minority classes
df_negative_upsampled = resample(df_negative,
                                 replace=True,
                                 n_samples= 205,
                                 random_state=42)

df_neutral_upsampled = resample(df_neutral,
                                 replace=True,
                                 n_samples= 205,
                                 random_state=42)


# Concatenate the upsampled dataframes with the neutral dataframe
final_data = pd.concat([df_negative_upsampled,df_neutral_upsampled,df_positive])

In [77]:
final_data['Sentiment'].value_counts()

2    469
0    205
1    205
Name: Sentiment, dtype: int64

In [78]:
corpus = []
for sentence in final_data['Sentence']:
    corpus.append(sentence)
corpus[0:5]

['beat drop english lyric dat smthng damn',
 'bhai dislike ka option hta de bro',
 'miss',
 'one replace shushant singh sir',
 '120 drop put word feeling']

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = final_data.iloc[:, -1].values

In [80]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [81]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[48,  2, 10],
       [ 4, 48, 18],
       [19, 17, 98]])

In [82]:
nb_score = accuracy_score(y_test, y_pred)
print('accuracy',nb_score)

accuracy 0.7348484848484849
