<h1>YouTube Comments Spam Detection</h1>

In [567]:
import pandas as pd
import numpy as np
import re

from googleapiclient.discovery import build
from dotenv import load_dotenv
import os

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [493]:
# Downloading Stopwords
# Fetches the NLTK 'stopwords' corpus that process_text() reads via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/kevin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [45]:
# Getting api_key from environment variable (load_dotenv() also reads a local .env file)
load_dotenv()
api_key = os.getenv('api_key')
if not api_key:
    # Fail fast with a clear message instead of handing None to build()
    raise RuntimeError(
        "Environment variable 'api_key' is not set; "
        "define it in the shell or in a .env file before running this notebook."
    )

# Creating build object for Youtube
youtube = build('youtube', 'v3', developerKey=api_key)

In [75]:
def get_video_ids(youtube, playlist_id):
    """Return the video ids of every item in a playlist.

    Pages through ``playlistItems().list`` 50 items at a time, following
    ``nextPageToken`` until the response no longer carries one.

    Parameters
    ----------
    youtube : API client exposing ``playlistItems().list(...).execute()``.
    playlist_id : id of the playlist to enumerate.

    Returns
    -------
    list
        Video ids in the order the API returned them.
    """
    collected = []
    page_token = None

    while True:
        params = {
            'part': 'contentDetails',
            'playlistId': playlist_id,
            'maxResults': 50,
        }
        # The very first request is sent without a pageToken argument.
        if page_token is not None:
            params['pageToken'] = page_token

        page = youtube.playlistItems().list(**params).execute()
        collected.extend(entry['contentDetails']['videoId'] for entry in page['items'])

        page_token = page.get('nextPageToken')
        if page_token is None:
            break

    return collected

In [71]:
def get_playlists(youtube, channel_ids):
    """Return the uploads-playlist id of each channel in the API response.

    Parameters
    ----------
    youtube : API client exposing ``channels().list(...).execute()``.
    channel_ids : iterable of channel id strings; they are comma-joined
        into a single request.

    Returns
    -------
    list
        ``contentDetails.relatedPlaylists.uploads`` of every returned item,
        in response order.
    """
    joined_ids = ','.join(channel_ids)
    reply = youtube.channels().list(part='contentDetails', id=joined_ids).execute()

    return [
        channel['contentDetails']['relatedPlaylists']['uploads']
        for channel in reply['items']
    ]

In [273]:
def get_comments_in_videos(youtube, video_ids, max_comments=10000):
    """Collect the unique top-level comment texts of the given videos.

    Pages through ``commentThreads().list`` (100 threads per page, ordered by
    time) for each video and keeps the ``textOriginal`` of every top-level
    comment. Replies are not collected.

    Parameters
    ----------
    youtube : API client exposing ``commentThreads().list(...).execute()``.
    video_ids : iterable of video ids to read comments from.
    max_comments : soft cap on the number of unique comments. Collection
        stops (for all videos) as soon as more than this many unique comments
        have been gathered; the page that crosses the cap is kept whole, so
        the result may be slightly longer than ``max_comments``.

    Returns
    -------
    list of str
        Unique comment texts in first-seen order.
    """
    # A dict deduplicates in O(1) per comment while remembering insertion
    # order, so the result is deterministic for a given API response.
    unique_comments = {}

    for video_id in video_ids:
        page_token = None

        while True:
            params = {
                'part': 'snippet,replies',
                'videoId': video_id,
                'order': 'time',
                'maxResults': 100,
            }
            if page_token is not None:
                params['pageToken'] = page_token

            response = youtube.commentThreads().list(**params).execute()

            for thread in response.get('items', []):
                text = thread['snippet']['topLevelComment']['snippet']['textOriginal']
                unique_comments.setdefault(text, None)

            # Check the cap before requesting another page so no fetched
            # page is thrown away, and stop the outer loop as well.
            if len(unique_comments) > max_comments:
                return list(unique_comments)

            page_token = response.get('nextPageToken')
            if page_token is None:
                break

    return list(unique_comments)

In [530]:
# Processes and tokenizes the text
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def process_text(text):
    """Strip punctuation from ``text`` and return its non-stopword tokens.

    Every character that is neither a word character nor whitespace is
    removed, the remainder is split on whitespace, and tokens whose
    lower-cased form is an NLTK English stopword are dropped. Kept tokens
    retain their original casing.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        Remaining tokens, in order of appearance.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)  # Removing Punctuation
    # Looked up once and held as a set: the original re-read the stopword
    # list and scanned it linearly for every single word.
    stop_words = _english_stopwords()
    return [word for word in without_punctuation.split() if word.lower() not in stop_words]

In [373]:
# Look up the uploads playlist of each channel we want to sample comments from
MR_BEAST_CHANNEL_ID = 'UCX6OQ3DkcsbYNE6H8uQQuVA'
channel_ids = [MR_BEAST_CHANNEL_ID]
playlists = get_playlists(youtube, channel_ids)

In [374]:
# Getting all video ids from MrBeast
# `playlists` comes from get_playlists(); the previous name `mr_beast_playlist`
# was never defined and raised NameError on a fresh kernel.
mr_beast_video_ids = get_video_ids(youtube, playlists[0])

# Only getting comments from the first video id returned for the playlist
# NOTE(review): presumably this is MrBeast's latest video - confirm the playlist ordering.
mr_beast_comments = get_comments_in_videos(youtube, [mr_beast_video_ids[0]])

In [375]:
# Number of MrBeast Videos
# (i.e. how many video ids get_video_ids() returned for the uploads playlist)
len(mr_beast_video_ids)

723

In [418]:
# Initial Setup of Comments DataFrame
# NOTE(review): the first collected comment is skipped by the [1:] slice - reason not recorded, confirm.
unlabeled_comments = mr_beast_comments[1:]
youtube_comments = pd.DataFrame(
    data = {
        'comment': unlabeled_comments,
        # NaN marks a comment that has not been labeled yet
        'is_spam': np.full(len(unlabeled_comments), np.nan),
    }
)

# The labeling cell stores manual labels in this same file, so only create it
# when it does not exist yet; re-running this cell must not wipe the labels.
if not os.path.exists('youtube_comments.csv'):
    youtube_comments.to_csv('youtube_comments.csv', index = False)
youtube_comments

Unnamed: 0,comment,is_spam
0,"Mr beast, create the backrooms in real life",
1,Mr. Beast is the world's famous as well as ide...,
2,Hello sir \nI am Sarvesh From India\nI need 1k...,
3,Only 1M please guys you can do it,
4,i love you mrbeast😻😻,
...,...,...
10051,"Bro i send some, direct message on your legit ...",
10052,Hi mr beast you are my favorite YouTuber,
10053,Where’s karl,
10054,You basically got a haircut no need for a hat,


In [485]:
# Labeling Comments if Spam or Not
# For every comment without a label, print it and read one answer:
#   0 = not spam, 1 = spam, 2 = invalid; any other input ends the session.
youtube_comment_temp = pd.read_csv('youtube_comments.csv')
is_spam_col = youtube_comment_temp.columns.get_loc('is_spam')

try:
    # Iterate over the frame that is actually being labeled, not over a
    # different DataFrame that merely happens to have the same length.
    for i in range(youtube_comment_temp.shape[0]):
        if pd.isna(youtube_comment_temp.iat[i, is_spam_col]):
            print(youtube_comment_temp['comment'].iat[i])
            is_spam = input()
            if is_spam not in ('0', '1', '2'):
                break
            # Single positional write (no chained indexing) with a numeric
            # value, so the column stays float and the write always lands.
            youtube_comment_temp.iat[i, is_spam_col] = float(is_spam)
finally:
    # Persist whatever was labeled, even if the session is interrupted.
    youtube_comment_temp.to_csv('youtube_comments.csv', index = False)

Ammogabs blog
h


In [486]:
# Reload the comments file, now including the manual is_spam labels written by the labeling cell
labeled_data = pd.read_csv('youtube_comments.csv')
labeled_data

Unnamed: 0,comment,is_spam
0,"Mr beast, create the backrooms in real life",0.0
1,Mr. Beast is the world's famous as well as ide...,0.0
2,Hello sir \nI am Sarvesh From India\nI need 1k...,1.0
3,Only 1M please guys you can do it,0.0
4,i love you mrbeast😻😻,0.0
...,...,...
10051,"Bro i send some, direct message on your legit ...",
10052,Hi mr beast you are my favorite YouTuber,
10053,Where’s karl,
10054,You basically got a haircut no need for a hat,


In [487]:
# Count of rows carrying a usable label (is_spam below 2; unlabeled NaN rows compare False)
(labeled_data['is_spam'] < 2).sum()

695

In [488]:
# Count of rows that were marked invalid during labeling (is_spam == 2)
len(labeled_data.loc[labeled_data['is_spam'] == 2])

59

In [489]:
# Count of rows labeled as not spam (is_spam == 0)
len(labeled_data.loc[labeled_data['is_spam'] == 0])

628

In [490]:
# Count of rows labeled as spam (is_spam == 1)
len(labeled_data.loc[labeled_data['is_spam'] == 1])

67

Creating Spam Detection ML Model

In [547]:
# Keep only rows labeled 0 or 1; unlabeled (NaN) and invalid (2) rows are dropped
filtered_data = labeled_data.loc[labeled_data['is_spam'].lt(2)]
filtered_data

Unnamed: 0,comment,is_spam
0,"Mr beast, create the backrooms in real life",0.0
1,Mr. Beast is the world's famous as well as ide...,0.0
2,Hello sir \nI am Sarvesh From India\nI need 1k...,1.0
3,Only 1M please guys you can do it,0.0
4,i love you mrbeast😻😻,0.0
...,...,...
749,You look good bald,0.0
750,Fianlly got a Verifted Bagdge,0.0
751,ME hungy,0.0
752,bruh i dont have a dog and a cat,0.0


In [551]:
# Convert text to matrix of token counts
# The fitted vectorizer is kept in its own variable so that unseen comments can
# later be encoded with the same vocabulary via vectorizer.transform(...).
# NOTE(review): the vocabulary is learned from every labeled row, including the
# rows that the later split assigns to the test set.
vectorizer = CountVectorizer(analyzer=process_text)
bag_of_words = vectorizer.fit_transform(filtered_data['comment'])

In [553]:
# Hold out 30% of the labeled rows for testing; the seed is fixed so the split is repeatable
# NOTE(review): the split is not stratified although spam is the minority class - consider `stratify=`.
split_options = {'test_size': 0.3, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words, filtered_data['is_spam'], **split_options
)

In [556]:
# Build a multinomial Naive Bayes model, then fit it on the training split
classifier = MultinomialNB()
classifier.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

Evaluating Model on Training Data Set

In [575]:
# Predict on the training split and print each metric under its heading
pred_train = classifier.predict(X_train)
for heading, metric in (
    ('Classification Report: \n', classification_report),
    ('Confusion matrix: \n', confusion_matrix),
    ('Accuracy: \n', accuracy_score),
):
    print(heading, metric(y_train, pred_train))

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       433
         1.0       0.98      0.87      0.92        53

    accuracy                           0.98       486
   macro avg       0.98      0.93      0.96       486
weighted avg       0.98      0.98      0.98       486

Confusion matrix: 
 [[432   1]
 [  7  46]]
Accuracy: 
 0.9835390946502057


Evaluating Model on Test Data Set

In [579]:
# Predict on the held-out test split and print each metric under its heading
pred_test = classifier.predict(X_test)
for heading, metric in (
    ('Classification Report: \n', classification_report),
    ('Confusion matrix: \n', confusion_matrix),
    ('Accuracy: \n', accuracy_score),
):
    print(heading, metric(y_test, pred_test))

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.97      0.94      0.96       195
         1.0       0.42      0.57      0.48        14

    accuracy                           0.92       209
   macro avg       0.69      0.76      0.72       209
weighted avg       0.93      0.92      0.92       209

Confusion matrix: 
 [[184  11]
 [  6   8]]
Accuracy: 
 0.9186602870813397
