In [1]:
#Libraries 
import numpy as np
import pandas as pd 
import os

# Import functions for data preprocessing & data preparation
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from string import punctuation
import nltk
import re

In [2]:
# Reading the dataset
data = pd.read_csv(r"Datasets\\commentscraperfile.csv")
data.columns

Index(['Comment'], dtype='object')

In [3]:
data.head()

Unnamed: 0,Comment
0,Hahaha😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂...
1,He is moaning like his butt got fucked 😂😂😂😂
2,So not cool! Btw he tapped out twice and you s...
3,Best part. There there. Pat's on the back.
4,he sounds like a bata-male. or a women.


In [4]:
# #Removing the unwanted columns
# data1=data.drop(['Unnamed: 0','Likes','Time','user','UserLink'],axis=1)
# data1

In [5]:
data['Comment'][0]

'Hahaha😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂'

In [6]:
data1=data.copy()

In [7]:
data1.head()

Unnamed: 0,Comment
0,Hahaha😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂...
1,He is moaning like his butt got fucked 😂😂😂😂
2,So not cool! Btw he tapped out twice and you s...
3,Best part. There there. Pat's on the back.
4,he sounds like a bata-male. or a women.


In [8]:
#Labeling the dataset
nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()
data1["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in data1["Comment"]]
data1["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in data1["Comment"]]
data1["Neutral"] = [sentiments.polarity_scores(i)["neu"] for i in data1["Comment"]]
data1['Compound'] = [sentiments.polarity_scores(i)["compound"] for i in data1["Comment"]]
score = data1["Compound"].values
sentiment = []
for i in score:
    if i >= 0.05 :
        sentiment.append('Positive')
    elif i <= -0.05 :
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')
data1["Sentiment"] = sentiment
data1.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\shibi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Comment,Positive,Negative,Neutral,Compound,Sentiment
0,Hahaha😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂...,0.0,0.0,1.0,0.0,Neutral
1,He is moaning like his butt got fucked 😂😂😂😂,0.175,0.406,0.42,-0.5106,Negative
2,So not cool! Btw he tapped out twice and you s...,0.107,0.222,0.671,-0.4464,Negative
3,Best part. There there. Pat's on the back.,0.375,0.0,0.625,0.6369,Positive
4,he sounds like a bata-male. or a women.,0.333,0.0,0.667,0.3612,Positive


In [9]:
#Droping the unwanted columns
data2=data1.drop(['Positive','Negative','Neutral','Compound'],axis=1)
data2.head()

Unnamed: 0,Comment,Sentiment
0,Hahaha😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂...,Neutral
1,He is moaning like his butt got fucked 😂😂😂😂,Negative
2,So not cool! Btw he tapped out twice and you s...,Negative
3,Best part. There there. Pat's on the back.,Positive
4,he sounds like a bata-male. or a women.,Positive


In [10]:
stop_words = stopwords.words('english')
porter_stemmer = PorterStemmer()
lzr = WordNetLemmatizer()

In [11]:
st = [
    'no', 'very', "don't", "aren't", "couldn't", "didn't",
    "doesn't", "hadn't", "hasn't", "haven't", "isn't", "mightn't",
    "mustn't", "needn't", "shan't", "shouldn't", "wasn't", "weren't",
    'won', "won't", "wouldn't"
]

In [12]:
stop_words=[word for word in stop_words if word not in st]

In [13]:
def text_processing(text):   
    # convert text into lowercase
    text = text.lower()

    # remove new line characters in text
    text = re.sub(r'\n',' ', text)
    
    # remove punctuations from text
    text = re.sub('[%s]' % re.escape(punctuation), "", text)
    
    # remove references and hashtags from text
    text = re.sub("^a-zA-Z0-9$,.", "", text)
    
    # remove multiple spaces from text
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    # remove special characters from text
    text = re.sub(r'\W', ' ', text)

    text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])
    
    # stemming using porter stemmer from nltk package - msh a7sn 7aga - momken: lancaster, snowball
    text=' '.join([porter_stemmer.stem(word) for word in word_tokenize(text)])
    
    # lemmatizer using WordNetLemmatizer from nltk package
    text=' '.join([lzr.lemmatize(word) for word in word_tokenize(text)])

    return text

In [14]:
#copying the data 
nltk.download('omw-1.4')
data_copy = data2.copy()
data_copy.Comment = data_copy.Comment.apply(lambda text: text_processing(text))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shibi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [15]:
#Encoding the text values to numeric
le = LabelEncoder()
data_copy['Sentiment'] = le.fit_transform(data_copy['Sentiment'])

In [16]:
processed_data = {
    'Sentence':data_copy.Comment,
    'Sentiment':data_copy['Sentiment']
}

processed_data = pd.DataFrame(processed_data)
processed_data.head()

Unnamed: 0,Sentence,Sentiment
0,hahaha,1
1,moan like butt got fuck,0
2,cool btw tap twice still help beyond danger ev...,0
3,best part pat back,2
4,sound like batamal woman,2


In [17]:
processed_data['Sentiment'].value_counts()

Sentiment
2    2955
1    2433
0    2339
Name: count, dtype: int64

In [18]:
#resmapleing teh dataset
df_neutral = processed_data[(processed_data['Sentiment']==1)] 
df_negative = processed_data[(processed_data['Sentiment']==0)]
df_positive = processed_data[(processed_data['Sentiment']==2)]

# # upsample minority classes
# df_negative_upsampled = resample(df_negative, 
#                                  replace=True,    
#                                  n_samples= 205, 
#                                  random_state=42)  

# df_neutral_upsampled = resample(df_neutral, 
#                                  replace=True,    
#                                  n_samples= 205, 
#                                  random_state=42)  


# Concatenate the upsampled dataframes with the neutral dataframe
# final_data = pd.concat([df_negative_upsampled,df_neutral_upsampled,df_positive])
final_data = pd.concat([df_negative,df_neutral,df_positive])

In [19]:
final_data['Sentiment'].value_counts()

Sentiment
2    2955
1    2433
0    2339
Name: count, dtype: int64

In [20]:
final_data.head()

Unnamed: 0,Sentence,Sentiment
1,moan like butt got fuck,0
2,cool btw tap twice still help beyond danger ev...,0
6,omg felt bad liter want hug,0
7,didnt bring fact exact thing put hot sauc hamb...,0
9,could die,0


In [21]:
corpus = []
for sentence in final_data['Sentence']:
    corpus.append(sentence)
corpus[0:5]

['moan like butt got fuck',
 'cool btw tap twice still help beyond danger even prank',
 'omg felt bad liter want hug',
 'didnt bring fact exact thing put hot sauc hamburg ultim reveng',
 'could die']

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = final_data.iloc[:, -1].values

In [23]:
import pickle
# Save the vectorizer to a file
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(cv, f)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [25]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Create the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

In [27]:
# from sklearn.naive_bayes import GaussianNB
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# classifier = GaussianNB()
# classifier.fit(X_train, y_train)

In [28]:
# Predictions
y_pred = rf_model.predict(X_test)

In [29]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate a classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8272962483829237
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.74      0.79       486
           1       0.82      0.88      0.85       475
           2       0.82      0.86      0.84       585

    accuracy                           0.83      1546
   macro avg       0.83      0.83      0.83      1546
weighted avg       0.83      0.83      0.83      1546



In [30]:
 import pickle

In [31]:
filename='trained_model.sav'
pickle.dump(rf_model,open(filename,'wb'))

In [32]:
#loading the model
loaded_model = pickle.load(open('trained_model.sav','rb'))

In [33]:
import pandas as pd

# Prompt the user for input
s = input("Enter comments : ")
user_input=text_processing(s)
# Split the input into a list of comments
comments = user_input.split(',')

# Create a dictionary with the 'comment' column
data = {'Comment': comments}

# Create a DataFrame
df = pd.DataFrame(data)

# Print the DataFrame
print("DataFrame:")
print(df)


DataFrame:
  Comment
0        


In [34]:
df["Comment"][0]

''

In [35]:
X = cv.transform(df["Comment"]).toarray()
Y= rf_model.predict(X)

In [36]:
if(Y==0):
    print("Negative")
elif(Y==1):
    print("Neutral")
else:
    print("Positive")

Neutral


In [37]:
# from sklearn.metrics import confusion_matrix, accuracy_score
# y_pred = classifier.predict(X_test)
# cm = confusion_matrix(y_test, y_pred)
# cm

In [38]:
# nb_score = accuracy_score(y_test, y_pred)
# print('accuracy',nb_score)

In [39]:
import pandas as pd
from googleapiclient.discovery import build
import re

# Prompt the user for the YouTube API key
API_KEY = 'YOUR - API - KEY'

def extract_video_id(url):
    # Regular expression to extract the video ID from various YouTube URL formats
    video_id_match = re.search(r"(?:v=|v\/|vi\/|videos\/|embed\/|youtu.be\/|watch\?v=|\?v=|&v=|\?id=)([a-zA-Z0-9_-]+)", url)

    if video_id_match:
        return video_id_match.group(1)
    else:
        return None

def get_youtube_comments(api_key, video_id, max_comments):
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Retrieve video comments
    comments = []
    nextPageToken = None
    total_comments_retrieved = 0

    while True and total_comments_retrieved < max_comments:
        request = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            textFormat='plainText',
            pageToken=nextPageToken
        )
        response = request.execute()

        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
            comments.append(comment)
            total_comments_retrieved += 1

            if total_comments_retrieved >= max_comments:
                break

        nextPageToken = response.get('nextPageToken')

        if not nextPageToken or total_comments_retrieved >= max_comments:
            break

    return comments

# Prompt the user for the YouTube video URL
youtube_url = input("Enter the YouTube video URL: ")

# Extract the video ID from the URL
video_id = extract_video_id(youtube_url)

if video_id:
    MAX_COMMENTS = 2000  # Maximum comments to retrieve
    # Fetch YouTube comments for the specified video
    comments = get_youtube_comments(API_KEY, video_id, MAX_COMMENTS)

    # Create a DataFrame from the comments
    df = pd.DataFrame(comments, columns=['Comment'])

    # Display the DataFrame
    print(df)
else:
    print("Invalid YouTube video URL. Please provide a valid URL.")


Invalid YouTube video URL. Please provide a valid URL.


In [40]:
df.rename(columns={'text': 'Comment'}, inplace=True)
df

Unnamed: 0,Comment
0,


In [41]:
df.head()

Unnamed: 0,Comment
0,


In [42]:
df["Comment"] = df["Comment"].apply(lambda x : text_processing(x))

In [43]:
df

Unnamed: 0,Comment
0,


In [44]:
X = cv.transform(df["Comment"]).toarray()

In [46]:
Y= rf_model.predict(X)

In [47]:
Y

array([1])

In [48]:
nega = np.sum(Y == 0)
neu=np.sum(Y==1)
pos=np.sum(Y==2)

In [49]:
nega

0

In [50]:
neu

1

In [51]:
pos

0

In [52]:
pos+neu+nega

1