Required Installations

In [None]:
# Install the library to handle emoji removal and manipulation in text
!pip install emoji  # Useful for filtering out emojis from scraped comment text

YouTube Threat Detection Script


In [None]:
# Importing necessary libraries for YouTube API interaction, data handling, and text processing
import html
import re

import emoji
import pandas as pd
from google.colab import files
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# Setting up the YouTube API key and building the YouTube Data API v3 service object.
# NOTE(review): 'API Key' is a placeholder; avoid committing a real key with the notebook.
yt_api_key = 'API Key'  # Replace with your actual YouTube API key
youtube = build('youtube', 'v3', developerKey=yt_api_key)

# Upper bound on the total number of comments collected across all videos.
# NOTE(review): this name shadows the built-in max() for the rest of the module;
# the scraping loop and the DataFrame slice further down read it under this name.
max = 1000

# Function to fetch YouTube comments for a specific video
def get_youtube_comments(youtube, video_id, max_comments, current_total):
    """Fetch top-level comment texts for one video, respecting a global cap.

    Args:
        youtube: YouTube Data API service object (as returned by ``build``).
        video_id: ID of the video whose comment threads are listed.
        max_comments: Overall cap on comments across all videos.
        current_total: Number of comments already collected elsewhere; only
            ``max_comments - current_total`` comments are fetched here.

    Returns:
        A list of comment strings (``textDisplay`` of each top-level comment),
        at most ``max_comments - current_total`` long. Empty when the cap is
        already reached.
    """
    comments = []  # Collected comment texts
    next_page_token = None  # Token for pagination
    remaining = max_comments - current_total  # How many more comments we may take

    while remaining > 0:
        # Ask only for what is still needed (the API allows at most 100 per page)
        response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            maxResults=min(100, remaining),
            pageToken=next_page_token,
        ).execute()

        # Tolerate a response without 'items' and never take more than the cap allows
        page_items = response.get('items', [])[:remaining]
        comments.extend(
            item['snippet']['topLevelComment']['snippet']['textDisplay']
            for item in page_items
        )
        remaining -= len(page_items)

        # Stop when there are no further pages
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return comments

# Function to clean comments by removing unwanted characters and emojis
def clean_comment(comment):
    """Normalise a raw comment to lowercase ASCII letters and single spaces.

    The YouTube API's ``textDisplay`` field is HTML, so markup and character
    entities are handled before the letter filter; otherwise fragments such
    as "amp", "quot" or "a href" would end up inside the cleaned text.

    Args:
        comment: Raw comment string.

    Returns:
        The cleaned comment: HTML tags and entities resolved, emojis and all
        non-letter characters removed, lowercased, whitespace collapsed.
    """
    # Line breaks separate words, so they become spaces (also matches "<br />")
    comment = re.sub(r'<br\s*/?>|</br>', ' ', comment, flags=re.IGNORECASE)
    # Drop any other tags (e.g. <a href=...>, <b>) but keep their inner text
    comment = re.sub(r'</?[a-zA-Z][^<>]*>', '', comment)
    # Decode entities only after tag removal so an escaped "&lt;" typed by the
    # user is never mistaken for markup
    comment = html.unescape(comment)
    comment = emoji.replace_emoji(comment, replace='')  # Remove emojis
    comment = re.sub(r'[^a-zA-Z\s]', '', comment).lower()  # Keep only letters and whitespace
    # Collapse whitespace runs so multi-word keywords can still match
    return ' '.join(comment.split())

# Function to label comments as 'threat' or 'non-threat' based on keywords
def label_comment(comment, threat_keywords):
    """Label a comment 'threat' if it contains any keyword, else 'non-threat'.

    Matching is case-insensitive substring matching. Each keyword is also
    tried in a normalised form (letters and whitespace only), because cleaned
    comments have punctuation stripped and a keyword such as "be-sharam"
    could otherwise never match "besharam".

    Args:
        comment: Comment text (normally already cleaned).
        threat_keywords: Iterable of keyword strings.

    Returns:
        'threat' or 'non-threat'.
    """
    haystack = comment.lower()
    for keyword in threat_keywords:
        needle = keyword.lower()
        if needle in haystack:
            return 'threat'
        # Retry with punctuation removed; skip if nothing meaningful is left
        normalized = re.sub(r'[^a-z\s]', '', needle)
        if normalized.strip() and normalized != needle and normalized in haystack:
            return 'threat'
    return 'non-threat'

# Function to clean and label a batch of comments
def append_comments(comments, platform, post_id, threat_keywords):
    """Turn raw comments into (platform, post_id, cleaned_text, label) tuples.

    Each comment is passed through ``clean_comment`` and the cleaned text is
    then labelled with ``label_comment`` using ``threat_keywords``.
    """
    rows = []
    for raw_text in comments:
        cleaned = clean_comment(raw_text)  # Normalise the text first
        label = label_comment(cleaned, threat_keywords)  # Then label the cleaned text
        rows.append((platform, post_id, cleaned, label))
    return rows

# List of threat keywords (Roman Urdu and English) used to flag comments.
# Each entry is matched as a substring of the cleaned comment text.
# Duplicate entries have been removed; the order of first occurrence is kept.
threat_keywords = [
    "mar dunga", "khatra", "maut", "ghalat", "beghairat", "kharab", "gandi", "kafir", "kaat",
    "qatal", "thok dunga", "dhamki", "taqreeb", "harassment", "be-sharam", "churi", "mujhse door raho",
    "tumhare liye acha nahi hoga", "saza", "zinda nahi bachoge", "mardud", "kaam tamaam", "zakhm",
    "fauj", "ghalti ki saza", "zalim", "khud kash", "nashon ki talashi", "tashadud", "saatan",
    "dushman", "mazak udata hoon", "kya karoge", "marne do", "toda", "beef", "baddua", "khudai inteqaam",
    "kaarawani", "chakkar", "sanjeev", "aatank", "gaddar", "dhoond le", "langra", "jhoot", "bura hoga", "mujhse nafrat",
    "khatarnaak", "badtameezi", "sala", "zindagi khatam", "dafa ho ja", "saara din galiyaan", "badmaash",
    "bechara", "bandook", "lutf uthao", "dara", "katil", "chaal", "dharna", "katne do", "ghasait",
    "beadabi", "humla", "zakhm dena", "band kar", "rokne do", "aalim", "churi maarna", "chori",
    "dhamka", "maaf nahi karunga", "kuch nahi bacha", "takkar", "maaf kardoonga nahi", "kharabiyat",
    "tadbeer", "gandh", "saza milne wali hai", "dekh lo", "dukh", "marne ke liye tayyar", "mujhse mat khelegi",
    "karne do", "barbaadi", "kuch nahi kar sakte", "pagal", "kaam bura", "anban", "bhagwan ka inteqam",
    "ghasaitna", "kamzor", "jhooti baatein", "kamzori", "shikaar", "hamla", "kaise jeeoge", "koi nahi bacha",
    "buri harkat", "behad be-sharam", "kash", "saazish", "kaam khatam", "doob jao", "nazar lagana",
    "dushwar", "hatao", "bura", "die", "death",
    "israel", "nafrat", "jinn", "ganje", "gun", "shoot", "takleef",
    "hamas", "ejaz", "goli", "halakat", "bla", "khoon",
    "taliban", "tea was fantastic",
]

# List of YouTube video IDs to scrape comments from.
# Each ID appears once so that no video's comments are collected twice.
# NOTE(review): 'nBFq4vb3Ln' has 10 characters while the other IDs have 11 -
# possibly truncated; confirm against the intended video.
yt_ids = ['D9ZRcRIHjo0', '3vE5jcb4voY', 'oNjQXmoxiQ8', 'Lg1B2BDhWGI', 'ckBqi9z3ELw', 'DxWZ1CbAdVA',
          'nBFq4vb3Ln', 'tpV3VtUz9B0', 'B12mUmnWyYk', 'swTtZg8HQec', 'Hb6XFj4zS5M',
          'rpYnUyUr8Ak', 'x-sirIMqzJc', 'sBBFSuHATmw', 'Unxf6J0FtTQ']

all_comments = []  # (platform, post_id, cleaned_text, label) tuples for every comment
current_max = 0  # Total number of comments retrieved so far across all videos

# Loop over video IDs to fetch comments until the global cap is reached
for video_id in yt_ids:
    if current_max >= max:  # Stop if the total comments reach the max limit
        break
    try:
        youtube_comments = get_youtube_comments(youtube, video_id, max_comments=max, current_total=current_max)
    except HttpError as err:
        # One unavailable video (bad ID, comments disabled, ...) should not
        # discard everything collected so far - report it and move on.
        print(f"Skipping video {video_id}: {err}")
        continue
    all_comments.extend(append_comments(youtube_comments, 'YouTube', video_id, threat_keywords))  # Clean, label, and store
    current_max += len(youtube_comments)  # Update the count of total comments

# Create a DataFrame from the collected comments (capped at the global limit)
comments_df = pd.DataFrame(all_comments[:max], columns=['Platform', 'Post_ID', 'Content', 'Label'])

# Sort descending by label so that 'threat' rows come before 'non-threat' rows
comments_df.sort_values(by='Label', ascending=False, inplace=True)

# Save the DataFrame as a CSV file and download it from the Colab runtime
comments_df.to_csv('Youtube_Comment Threat Detection in Roman Urdu.csv', index=False)
files.download('Youtube_Comment Threat Detection in Roman Urdu.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>