<h1>Youtube Comments Spam Detection</h1>

In [655]:
import pandas as pd
import numpy as np
import re

from googleapiclient.discovery import build
from dotenv import load_dotenv
import os

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [656]:
# Downloading Stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/kevin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [45]:
# Getting api_key from environment variable
load_dotenv()
api_key = os.getenv('api_key')

# Creating build object for Youtube
youtube = build('youtube', 'v3', developerKey=api_key)

In [75]:
def get_video_ids(youtube, playlist_id):
    video_ids = []
    
    request = youtube.playlistItems().list(
        part='contentDetails',
        playlistId=playlist_id,
        maxResults = 50
    )
    
    response = request.execute()
    
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    
    while next_page_token is not None:
        request = youtube.playlistItems().list(
            part='contentDetails',
            playlistId = playlist_id,
            maxResults = 50,
            pageToken = next_page_token
        )
        response = request.execute()
        
        for item in response['items']:
            video_ids.append(item['contentDetails']['videoId'])
            
        next_page_token = response.get('nextPageToken')
        
    return video_ids

In [71]:
def get_playlists(youtube, channel_ids):
    upload_playlists = []
    
    request = youtube.channels().list(
        part='contentDetails',
        id=','.join(channel_ids)
    )
    
    response = request.execute()
    
    for item in response['items']:
        upload_playlists.append(item['contentDetails']['relatedPlaylists']['uploads'])
        
    return upload_playlists

In [715]:
def get_comments_in_videos(youtube, video_ids):
    comments = np.array([])
    
    for video_id in video_ids:
        request = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            order = 'time',
            maxResults = 100
        )
        response = request.execute()
        
        comments_in_video = [
            comment['snippet']['topLevelComment']['snippet']['textOriginal']
            for comment in response['items']
        ]
        comments = np.append(comments, comments_in_video)
        
        while response:   
            comments_in_video = [
                comment['snippet']['topLevelComment']['snippet']['textOriginal']
                for comment in response['items']
            ]
            comments = np.append(comments, comments_in_video)
            
            if 'nextPageToken' in response:
                request = youtube.commentThreads().list(
                    part='snippet,replies',
                    videoId=video_id,
                    order = 'time',
                    maxResults = 100,
                    pageToken=response['nextPageToken']
                )
                response = request.execute()
            else:
                break
                
            if len(set(comments)) > 2000:
                break
                
    return list(set(comments))

In [530]:
# Processes and tokenizes the text
def process_text(text): 
    out = re.sub(r'[^\w\s]', '', text) # Removing Punctuation
    out = [word for word in out.split() if word.lower() not in stopwords.words('english')] # Removes Stopwords
    return out

In [373]:
# Getting upload playlist from channel_id
channel_ids = [
    'UCX6OQ3DkcsbYNE6H8uQQuVA', # MrBeast
    'UCoOjH8D2XAgjzQlneM2W0EQ', # JakeTran
    'UCgv4dPk_qZNAbUW9WkuLPSA', # Atrioc
    'UCYzPXprvl5Y-Sf0g4vX-m6g', # Jacksepticeye
    'UCBJycsmduvYEL83R_U4JriQ'  # Marques Brownlee
]
playlists = get_playlists(youtube, channel_ids)

In [685]:
# Getting all video ids for each Youtuber
jacksepticeye_video_ids = get_video_ids(youtube, playlists[0])
mr_beast_video_ids = get_video_ids(youtube, playlists[1])
marques_brownlee_video_ids = get_video_ids(youtube, playlists[2])
jake_tran_video_ids = get_video_ids(youtube, playlists[3])
atrioc_video_ids = get_video_ids(youtube, playlists[4])

In [688]:
# Number of videos for each Youtuber
print('Number of Jacksepticeye Videos: ', len(jacksepticeye_video_ids))
print('Number of MrBeast Videos: ', len(mr_beast_video_ids))
print('Number of Marques Brownlee Videos: ', len(marques_brownlee_video_ids))
print('Number of Jake Tran Videos: ', len(jake_tran_video_ids))
print('Number of Atrioc Videos: ', len(atrioc_video_ids))

Number of Jacksepticeye Videos:  4979
Number of MrBeast Videos:  723
Number of Marques Brownlee Videos:  1449
Number of Jake Tran Videos:  203
Number of Atrioc Videos:  615


In [716]:
# Only getting roughly at most 2000 comments from Youtuber's Latest Video
jacksepticeye_comments = get_comments_in_videos(youtube, [jacksepticeye_video_ids[0]])
mr_beast_comments = get_comments_in_videos(youtube, [mr_beast_video_ids[0]])
marques_brownlee_comments = get_comments_in_videos(youtube, [marques_brownlee_video_ids[0]])
jake_tran_comments = get_comments_in_videos(youtube, [jake_tran_video_ids[1]])
atrioc_comments = get_comments_in_videos(youtube, [atrioc_video_ids[1]])
# At the time, Jake Tran and Atrioc's Latest Video just came out so decided to use second latest video

In [717]:
# Combing all comments
combined_comments = (
    jacksepticeye_comments +
    mr_beast_comments + 
    marques_brownlee_comments + 
    jake_tran_comments +
    atrioc_comments
)

In [724]:
# Initial Setup of Comments DataFrame
youtube_comments = pd.DataFrame(
    data = {'comment': combined_comments, 'is_spam': [np.nan for i in range(len(combined_comments))]}
)
youtube_comments.to_csv('youtube_comments.csv', index = False)
youtube_comments

Unnamed: 0,comment,is_spam
0,Cats can eat grass it helps them when there st...,
1,I can't wait for Part 2 on this :D look's awes...,
2,"Its okay about the poncho Jack, you return to ...",
3,Jack you forgot to find the first few memory s...,
4,"I really hope Jack plays Sky some day, I reall...",
...,...,...
7982,"Forgot Paper Mario, classic mistake",
7983,what?!?,
7984,I was not ready for him stabbing himself,
7985,I love the sound of falling onto a window in M...,


In [582]:
# Labeling Comments if Spam or Not
youtube_comment_temp = pd.read_csv('youtube_comments.csv')
for i in range(youtube_comments.shape[0]):
    if np.isnan(youtube_comment_temp['is_spam'].iloc[i]):
        print(youtube_comment_temp.iloc[i]['comment'])
        is_spam = input()
        if is_spam in ['0', '1']:
            youtube_comment_temp['is_spam'].iloc[i] = is_spam
        elif is_spam in ['2']:
            youtube_comment_temp['is_spam'].iloc[i] = 2
        else:
            break
youtube_comment_temp.to_csv('youtube_comments.csv', index = False)

Ammogabs blog
2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


SO CLOSE TO 100M!
0
9:05
0
Guess what I shaved my head to weeks agai and I’m now just seeing this
0
100M SUSCRIPTORES SUSCRIBE A MRBEAST 🙏😈🤙😊😊
0
I'm that guy.
0
عقبال 100مليون مستر بيست انا انتظر ان اوفيك ال100مليون بنفسي مستر بيستIlove♥♥♥
2
Can we have one thousand to pay off a canon camrea
0
Today is my birthday:)
0
18
0
Jimmy, your taking this to far
0
It's so cool, because you can kinda see the behind the scenes of Jimmy's life lol
0
What you’re saying is that you used to come and see your family
0
Visit me
0
day one of asking for money i need 1000$ to get a new dirt bike
1
😎👍👍👍👍👍
0
Is this a world record
0
That surfing pikachu hoodie was so cool
0
Of you go too my birthday I will give you food ☺️
0
the odds of being picked fora vid is so rare i will never get to meet you im subbed
0
You should buy a private island and make a micro nation (boost this so he can see)
0
Cmon pepple subscribe we are almost at 100m
0
Subscribe to mrbeast the king himself I will kidnap you if you don’t 🥱

In [583]:
labeled_data = pd.read_csv('youtube_comments.csv')
labeled_data

Unnamed: 0,comment,is_spam
0,"Mr beast, create the backrooms in real life",0.0
1,Mr. Beast is the world's famous as well as ide...,0.0
2,Hello sir \nI am Sarvesh From India\nI need 1k...,1.0
3,Only 1M please guys you can do it,0.0
4,i love you mrbeast😻😻,0.0
...,...,...
10051,"Bro i send some, direct message on your legit ...",
10052,Hi mr beast you are my favorite YouTuber,
10053,Where’s karl,
10054,You basically got a haircut no need for a hat,


In [584]:
# Number of valid data
sum(labeled_data['is_spam'] < 2)

780

In [585]:
# Number of invalid data
labeled_data[labeled_data['is_spam'] == 2].shape[0]

67

In [586]:
# Number of Not-Spam
labeled_data[labeled_data['is_spam'] == 0].shape[0]

710

In [587]:
# Number of Spam
labeled_data[labeled_data['is_spam'] == 1].shape[0]

70

To get more data for spam comments, I utilized the UCI Machine Learning Repository to gather more spam comments.
https://archive.ics.uci.edu/ml/datasets/YouTube+Spam+Collection

In [677]:
# Importing data from UCI Machine Learning Repository
psy_data = pd.read_csv('YouTube-Spam-Collection-v1/Youtube01-Psy.csv')[['CONTENT', 'CLASS']]
katy_perry_data = pd.read_csv('YouTube-Spam-Collection-v1/Youtube02-KatyPerry.csv')[['CONTENT', 'CLASS']]
lmfao_data = pd.read_csv('YouTube-Spam-Collection-v1/Youtube03-LMFAO.csv')[['CONTENT', 'CLASS']]
eminem_data = pd.read_csv('YouTube-Spam-Collection-v1/Youtube04-Eminem.csv')[['CONTENT', 'CLASS']]
shakira_data = pd.read_csv('YouTube-Spam-Collection-v1/Youtube05-Shakira.csv')[['CONTENT', 'CLASS']]

In [682]:
# Concat all spam comments, standardizing column names, filter only spam comments
spam_data = pd.concat([psy_data, katy_perry_data, lmfao_data, eminem_data, shakira_data])
spam_comments = spam_data.rename(columns={'CONTENT':'comments', 'CLASS':'is_spam'})[spam_data['CLASS'] == 1]
spam_comments

Unnamed: 0,comments,is_spam
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1
...,...,...
357,********OMG Facebook is OLD! Check out ------...,1
358,Hey Music Fans I really appreciate all of you ...,1
359,**CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...,1
360,**CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...,1


<h2>Creating Spam Detection ML Model</h2>

In [588]:
filtered_data = labeled_data[labeled_data['is_spam'] < 2]
filtered_data

Unnamed: 0,comment,is_spam
0,"Mr beast, create the backrooms in real life",0.0
1,Mr. Beast is the world's famous as well as ide...,0.0
2,Hello sir \nI am Sarvesh From India\nI need 1k...,1.0
3,Only 1M please guys you can do it,0.0
4,i love you mrbeast😻😻,0.0
...,...,...
841,How is meatlover,0.0
842,I’d die doing this,0.0
843,Please go to Colorado Springs and help the hom...,0.0
844,Bro I honestly love the bald it looks great,0.0


In [589]:
# Convert text to matrix of token counts
bag_of_words = CountVectorizer(analyzer=process_text).fit_transform(filtered_data['comment'])

In [590]:
# Splitting data into 70% train and 30% test
X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words,
    filtered_data['is_spam'],
    test_size = 0.3,
    random_state = 0
)

In [591]:
# Creating and training Naive Bayes Classifier
classifier = MultinomialNB().fit(X_train, y_train)

<h2>Evaluating Model on Training Data Set</h2>

In [592]:
pred_train = classifier.predict(X_train)
print('Classification Report: \n', classification_report(y_train, pred_train))
print('Confusion matrix: \n', confusion_matrix(y_train, pred_train), '\n')
print('Accuracy: \n', accuracy_score(y_train, pred_train))

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       501
         1.0       1.00      0.73      0.85        45

    accuracy                           0.98       546
   macro avg       0.99      0.87      0.92       546
weighted avg       0.98      0.98      0.98       546

Confusion matrix: 
 [[501   0]
 [ 12  33]] 

Accuracy: 
 0.978021978021978


<h2>Evaluating Model on Test Data Set</h2>

In [593]:
# Evaluating Model on training data set
pred_test = classifier.predict(X_test)
print('Classification Report: \n', classification_report(y_test, pred_test))
print('Confusion matrix: \n', confusion_matrix(y_test, pred_test), '\n')
print('Accuracy: \n', accuracy_score(y_test, pred_test))

Classification Report: 
               precision    recall  f1-score   support

         0.0       0.93      0.93      0.93       209
         1.0       0.44      0.44      0.44        25

    accuracy                           0.88       234
   macro avg       0.69      0.69      0.69       234
weighted avg       0.88      0.88      0.88       234

Confusion matrix: 
 [[195  14]
 [ 14  11]] 

Accuracy: 
 0.8803418803418803
