<h1>Youtube Comments Spam Detection</h1>

In [483]:
import pandas as pd
import numpy as np

from googleapiclient.discovery import build
from dotenv import load_dotenv
import os

In [45]:
# Getting api_key from environ
load_dotenv()
api_key = os.getenv('api_key')
youtube = build('youtube', 'v3', developerKey=api_key)

In [75]:
def get_video_ids(youtube, playlist_id):
    video_ids = []
    
    request = youtube.playlistItems().list(
        part='contentDetails',
        playlistId=playlist_id,
        maxResults = 50
    )
    
    response = request.execute()
    
    for item in response['items']:
        video_ids.append(item['contentDetails']['videoId'])
        
    next_page_token = response.get('nextPageToken')
    
    while next_page_token is not None:
        request = youtube.playlistItems().list(
            part='contentDetails',
            playlistId = playlist_id,
            maxResults = 50,
            pageToken = next_page_token
        )
        response = request.execute()
        
        for item in response['items']:
            video_ids.append(item['contentDetails']['videoId'])
            
        next_page_token = response.get('nextPageToken')
        
    return video_ids

In [71]:
def get_playlists(youtube, channel_ids):
    upload_playlists = []
    
    request = youtube.channels().list(
        part='contentDetails',
        id=','.join(channel_ids)
    )
    
    response = request.execute()
    
    for item in response['items']:
        upload_playlists.append(item['contentDetails']['relatedPlaylists']['uploads'])
        
    return upload_playlists

In [273]:
def get_comments_in_videos(youtube, video_ids):
    comments = np.array([])
    
    for video_id in video_ids:
        request = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            order = 'time',
            maxResults = 100
        )
        response = request.execute()
        
        comments_in_video = [
            comment['snippet']['topLevelComment']['snippet']['textOriginal']
            for comment in response['items']
        ]
        comments = np.append(comments, comments_in_video)
        
        while response:   
            comments_in_video = [
                comment['snippet']['topLevelComment']['snippet']['textOriginal']
                for comment in response['items']
            ]
            comments = np.append(comments, comments_in_video)
            
            if 'nextPageToken' in response:
                request = youtube.commentThreads().list(
                    part='snippet,replies',
                    videoId=video_id,
                    order = 'time',
                    maxResults = 100,
                    pageToken=response['nextPageToken']
                )
                response = request.execute()
            else:
                break
                
            if len(set(comments)) > 10000:
                break
                
    return list(set(comments))

In [373]:
channel_ids = [
    'UCX6OQ3DkcsbYNE6H8uQQuVA', #MrBeast
]
playlists = get_playlists(youtube, channel_ids)

In [374]:
mr_beast_video_ids = get_video_ids(youtube, mr_beast_playlist[0])
mr_beast_comments = get_comments_in_videos(youtube, [mr_beast_video_ids[0]])

In [375]:
# Number of MrBeast Videos
len(mr_beast_video_ids)

723

In [418]:
# Initial Setup of Comments DataFrame
youtube_comments = pd.DataFrame(
    data = {'comment': mr_beast_comments[1:], 'is_spam': [np.nan for i in range(len(mr_beast_comments[1:]))]}
)
youtube_comments.to_csv('youtube_comments.csv', index = False)
youtube_comments

Unnamed: 0,comment,is_spam
0,"Mr beast, create the backrooms in real life",
1,Mr. Beast is the world's famous as well as ide...,
2,Hello sir \nI am Sarvesh From India\nI need 1k...,
3,Only 1M please guys you can do it,
4,i love you mrbeast😻😻,
...,...,...
10051,"Bro i send some, direct message on your legit ...",
10052,Hi mr beast you are my favorite YouTuber,
10053,Where’s karl,
10054,You basically got a haircut no need for a hat,


In [477]:
# Labeling Comments if Spam or Not
youtube_comment_temp = pd.read_csv('youtube_comments.csv')
for i in range(youtube_comments.shape[0]):
    if np.isnan(youtube_comment_temp['is_spam'].iloc[i]):
        print(youtube_comment_temp.iloc[i]['comment'])
        is_spam = input()
        if is_spam in ['0', '1']:
            youtube_comment_temp['is_spam'].iloc[i] = is_spam
        elif is_spam in ['2']:
            youtube_comment_temp['is_spam'].iloc[i] = 2
        else:
            break
youtube_comment_temp.to_csv('youtube_comments.csv', index = False)

You look good bald
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Fianlly got a Verifted Bagdge
0
ME hungy
j


In [478]:
labeled_data = pd.read_csv('youtube_comments.csv')
labeled_data

Unnamed: 0,comment,is_spam
0,"Mr beast, create the backrooms in real life",0.0
1,Mr. Beast is the world's famous as well as ide...,0.0
2,Hello sir \nI am Sarvesh From India\nI need 1k...,1.0
3,Only 1M please guys you can do it,0.0
4,i love you mrbeast😻😻,0.0
...,...,...
10051,"Bro i send some, direct message on your legit ...",
10052,Hi mr beast you are my favorite YouTuber,
10053,Where’s karl,
10054,You basically got a haircut no need for a hat,


In [479]:
# Number of valid data
sum(labeled_data['is_spam'] < 2)

692

In [480]:
# Number of invalid data
labeled_data[labeled_data['is_spam'] == 2].shape[0]

59

In [481]:
# Number of Not-Spam
labeled_data[labeled_data['is_spam'] == 0].shape[0]

625

In [482]:
# Number of Spam
labeled_data[labeled_data['is_spam'] == 1].shape[0]

67