In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm


In [2]:
# Initialize an empty list for data collection
data = []

In [3]:
# Loop through each file in the current directory with a progress bar
for filename in tqdm(os.listdir('.'), desc="Processing files"):
    if filename.startswith("youtube_comments_") and filename.endswith(".json"):
        # Read the JSON file
        with open(filename, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
            
            # Extract the required data
            for item in json_data:
                video_id = item['snippet']['videoId']
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
                date = item['snippet']['topLevelComment']['snippet']['publishedAt']
                data.append({'videoId': video_id, 'comment': comment, 'date': date})

Processing files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 501/501 [00:00<00:00, 679.12it/s]


In [4]:
# Create the DataFrame
alljit_youtube_vdo_comment = pd.DataFrame(data, columns=['videoId', 'comment', 'date'])

In [5]:
print(alljit_youtube_vdo_comment.shape)
print(alljit_youtube_vdo_comment.columns)
print(alljit_youtube_vdo_comment["videoId"].value_counts())
print(alljit_youtube_vdo_comment["videoId"].nunique())
alljit_youtube_vdo_comment[alljit_youtube_vdo_comment["videoId"] == alljit_youtube_vdo_comment["videoId"][0]].head()


(13851, 3)
Index(['videoId', 'comment', 'date'], dtype='object')
videoId
x1QBT4Ab4kA    710
Jk4DUIg34Qc    640
2ngSlTLBG9M    629
I224jBZbdcA    323
CrPfRLmF1Ug    285
              ... 
tESnSpnlsXw      1
TY0sGDgvTKM      1
TWevjv_2jdo      1
tiXhZFuCRz4      1
FzBV38Z1paM      1
Name: count, Length: 489, dtype: int64
489


Unnamed: 0,videoId,comment,date
0,-cd_OMv1GOA,‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô ‡∏£‡∏∞‡∏ö‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏ô‡πÉ‡∏à‡∏ü‡∏£‡∏µ üíì\r<br>‡∏Ñ‡∏∏‡∏ì‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ...,2021-10-06T18:00:11Z
1,-cd_OMv1GOA,‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏£‡∏±‡∏ö,2023-11-26T02:07:02Z
2,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡∏¢‡∏∂‡∏î‡∏ï‡∏¥‡∏î‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡∏î‡∏µ ‡∏ï‡∏µ‡πÄ‡∏™‡πâ‡∏ô‡πÉ‡∏´‡πâ‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á‡πÅ‡∏•‡∏∞‡πÄ‡∏Ñ‡∏≤‡∏£‡∏û‡∏ã‡∏∑‡πà...,2023-11-10T14:50:33Z
3,-cd_OMv1GOA,‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏°‡∏≤‡∏Å‡∏Ñ‡∏£‡∏±‡∏ö‚ù§,2023-09-07T02:25:58Z
4,-cd_OMv1GOA,‡∏ü‡∏±‡∏á‡∏Ñ‡∏∏‡∏ì‡∏≠‡∏µ‡∏ü‡∏û‡∏π‡∏î‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏≠‡πà‡∏≤‡∏ô‡πÉ‡∏à‡πÄ‡∏£‡∏≤‡∏≠‡∏≠‡∏Å‡πÄ‡∏•‡∏¢,2023-08-06T21:23:16Z


In [6]:
alljit_youtube_vdo = pd.read_csv("../../output - wip/alljit.csv")

print(alljit_youtube_vdo.shape)
print(alljit_youtube_vdo.columns)
print(alljit_youtube_vdo["videoId"].value_counts())


(500, 10)
Index(['videoId', 'channelId', 'channelTitle', 'publishTime', 'title',
       'description', 'thumbnail_default', 'thumbnail_medium',
       'thumbnail_high', 'liveBroadcastContent'],
      dtype='object')
videoId
tf5K5n9VwAY    2
FYbMb0p1fHg    2
sCmj46ypAwY    1
gHTnKCOKK1A    1
ghTe1K72nT8    1
              ..
-IE_VPJ3LLA    1
0uN--0AfdV4    1
_dBG0RKlpx0    1
IeKqI8WPGuQ    1
icQ-yE14jec    1
Name: count, Length: 498, dtype: int64


In [7]:
# Merge to get only the title from alljit_youtube_vdo into alljit_youtube_vdo_comment
alljit_youtube_vdo_comment = alljit_youtube_vdo_comment.merge(
    alljit_youtube_vdo[['videoId', 'title']], 
    on='videoId', 
    how='left',
    suffixes=('', '_drop')  # Handle duplicate columns
)

# Drop any unwanted duplicate columns
alljit_youtube_vdo_comment = alljit_youtube_vdo_comment.loc[:, ~alljit_youtube_vdo_comment.columns.str.contains('_drop')]

# Reordering the columns
alljit_youtube_vdo_comment = alljit_youtube_vdo_comment[['videoId', 'title', 'comment', 'date']]

# Display the updated DataFrame
alljit_youtube_vdo_comment.head()

Unnamed: 0,videoId,title,comment,date
0,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô ‡∏£‡∏∞‡∏ö‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏ô‡πÉ‡∏à‡∏ü‡∏£‡∏µ üíì\r<br>‡∏Ñ‡∏∏‡∏ì‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ...,2021-10-06T18:00:11Z
1,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏£‡∏±‡∏ö,2023-11-26T02:07:02Z
2,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏≠‡∏¢‡πà‡∏≤‡∏¢‡∏∂‡∏î‡∏ï‡∏¥‡∏î‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡∏î‡∏µ ‡∏ï‡∏µ‡πÄ‡∏™‡πâ‡∏ô‡πÉ‡∏´‡πâ‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á‡πÅ‡∏•‡∏∞‡πÄ‡∏Ñ‡∏≤‡∏£‡∏û‡∏ã‡∏∑‡πà...,2023-11-10T14:50:33Z
3,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏°‡∏≤‡∏Å‡∏Ñ‡∏£‡∏±‡∏ö‚ù§,2023-09-07T02:25:58Z
4,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏ü‡∏±‡∏á‡∏Ñ‡∏∏‡∏ì‡∏≠‡∏µ‡∏ü‡∏û‡∏π‡∏î‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏≠‡πà‡∏≤‡∏ô‡πÉ‡∏à‡πÄ‡∏£‡∏≤‡∏≠‡∏≠‡∏Å‡πÄ‡∏•‡∏¢,2023-08-06T21:23:16Z


In [8]:
# Define the keywords for each label
keywords1 = ['suicide', 'suicides', 'suicided', 'suicidal', 'suici',
            'harm',	'harms', 'harmed',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡∏Ü‡∏ï‡∏ï', "‡∏Ü‡∏ï‡∏ï",
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏Ü‡πà‡∏≤‡∏ï‡∏±‡∏ß‡∏ï‡∏≤‡∏¢", "‡∏Ü‡πà‡∏≤‡∏ï‡∏±‡∏ß‡∏ï‡∏≤‡∏¢",
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏Ñ‡πà‡∏≤‡∏ï‡∏±‡∏ß‡∏ï‡∏≤‡∏¢", "‡∏Ñ‡πà‡∏≤‡∏ï‡∏±‡∏ß‡∏ï‡∏≤‡∏¢",
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏Ñ‡∏¥‡∏î‡∏™‡∏±‡πâ‡∏ô", "‡∏Ñ‡∏¥‡∏î‡∏™‡∏±‡πâ‡∏ô",
            "‡∏≠‡∏¢‡∏≤‡∏Å‡πÇ‡∏î‡∏î‡∏ï‡∏∂‡∏Å", "‡πÇ‡∏î‡∏î‡∏ï‡∏∂‡∏Å",
            '‡∏≠‡∏¢‡∏≤‡∏Å‡πÇ‡∏î‡∏î‡∏ô‡πâ‡∏≥',	'‡πÇ‡∏î‡∏î‡∏ô‡πâ‡∏≥',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡πÇ‡∏î‡∏î‡∏™‡∏∞‡∏û‡∏≤‡∏ô',	'‡πÇ‡∏î‡∏î‡∏™‡∏∞‡∏û‡∏≤‡∏ô',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡πÇ‡∏î‡∏î‡∏ó‡∏µ‡πà‡∏™‡∏π‡∏á',	'‡πÇ‡∏î‡∏î‡∏ó‡∏µ‡πà‡∏™‡∏π‡∏á',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡πÇ‡∏î‡∏î‡∏à‡∏≤‡∏Å‡∏ó‡∏µ‡πà‡∏™‡∏π‡∏á',	'‡πÇ‡∏î‡∏î‡∏à‡∏≤‡∏Å‡∏ó‡∏µ‡πà‡∏™‡∏π‡∏á',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡πÇ‡∏î‡∏î‡πÉ‡∏´‡πâ‡∏£‡∏ñ‡∏ä‡∏ô',	'‡πÇ‡∏î‡∏î‡πÉ‡∏´‡πâ‡∏£‡∏ñ‡∏ä‡∏ô',
            "‡∏≠‡∏¢‡∏≤‡∏Å‡πÅ‡∏Ç‡∏ß‡∏ô‡∏Ñ‡∏≠", "‡πÅ‡∏Ç‡∏ß‡∏ô‡∏Ñ‡∏≠",
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏ú‡∏π‡∏Å‡∏Ñ‡∏≠", "‡∏ú‡∏π‡∏Å‡∏Ñ‡∏≠",           
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏Å‡∏¥‡∏ô‡∏¢‡∏≤‡∏û‡∏¥‡∏©", "‡∏Å‡∏¥‡∏ô‡∏¢‡∏≤‡∏û‡∏¥‡∏©",
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏Å‡∏¥‡∏ô‡∏¢‡∏≤‡πÄ‡∏Å‡∏¥‡∏ô‡∏Ç‡∏ô‡∏≤‡∏î", "‡∏Å‡∏¥‡∏ô‡∏¢‡∏≤‡πÄ‡∏Å‡∏¥‡∏ô‡∏Ç‡∏ô‡∏≤‡∏î",
            '‡∏≠‡∏¢‡∏≤‡∏Å‡∏Å‡∏¥‡∏ô‡∏¢‡∏≤‡∏Ü‡πà‡∏≤‡πÅ‡∏°‡∏•‡∏á',	'‡∏Å‡∏¥‡∏ô‡∏¢‡∏≤‡∏Ü‡πà‡∏≤‡πÅ‡∏°‡∏•‡∏á',
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏Å‡∏¥‡∏ô‡∏ô‡πâ‡∏≥‡∏¢‡∏≤‡∏•‡πâ‡∏≤‡∏á", "‡∏Å‡∏¥‡∏ô‡∏ô‡πâ‡∏≥‡∏¢‡∏≤‡∏•‡πâ‡∏≤‡∏á",
            '‡∏≠‡∏¢‡∏≤‡∏Å‡∏Å‡∏¥‡∏ô‡∏¢‡∏≤‡πÄ‡∏ö‡∏∑‡∏≠‡∏´‡∏ô‡∏π',	'‡∏Å‡∏¥‡∏ô‡∏¢‡∏≤‡πÄ‡∏ö‡∏∑‡πà‡∏≠‡∏´‡∏ô‡∏π',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡∏¢‡∏¥‡∏á‡∏ï‡∏±‡∏ß‡∏ï‡∏≤‡∏¢',	'‡∏¢‡∏¥‡∏á‡∏ï‡∏±‡∏ß‡∏ï‡∏≤‡∏¢',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡∏¢‡∏¥‡∏á‡∏´‡∏±‡∏ß‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á',	'‡∏¢‡∏¥‡∏á‡∏´‡∏±‡∏ß‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏≤‡∏¢‡∏ö‡πà‡∏≠‡∏¢',	'‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏≤‡∏¢‡∏ï‡∏•‡∏≠‡∏î', '‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏≤‡∏¢‡∏ó‡∏∏‡∏Å‡∏ß‡∏±‡∏ô', '‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏≤‡∏¢‡∏°‡∏≤‡∏Å', '‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏≤‡∏¢‡∏°‡∏≤‡∏î']

keywords2 = ["‡∏≠‡∏¢‡∏≤‡∏Å‡∏´‡∏≤‡∏¢‡πÑ‡∏õ‡∏à‡∏≤‡∏Å‡πÇ‡∏•‡∏Å", "‡∏´‡∏≤‡∏¢‡πÑ‡∏õ‡∏à‡∏≤‡∏Å‡πÇ‡∏•‡∏Å", 
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏´‡∏≤‡∏¢‡∏à‡∏≤‡∏Å‡πÇ‡∏•‡∏Å", "‡∏´‡∏≤‡∏¢‡∏à‡∏≤‡∏Å‡πÇ‡∏•‡∏Å", 
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏´‡∏≤‡∏¢‡∏ï‡∏±‡∏ß‡πÑ‡∏õ", "‡∏´‡∏≤‡∏¢‡∏ï‡∏±‡∏ß‡πÑ‡∏õ",
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏´‡∏≤‡∏¢‡πÑ‡∏õ", "‡∏´‡∏≤‡∏¢‡πÑ‡∏õ",
            "‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡∏°‡∏µ‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï", "‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï", 
            "‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏∑‡πà‡∏ô", "‡πÑ‡∏°‡πà‡∏ï‡∏∑‡πà‡∏ô", 
            '‡∏≠‡∏¢‡∏≤‡∏Å‡∏ô‡∏≠‡∏ô‡πÑ‡∏õ‡∏ï‡∏•‡∏≠‡∏î', '‡∏ô‡∏≠‡∏ô‡πÑ‡∏õ‡∏ï‡∏•‡∏≠‡∏î',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡∏ô‡∏≠‡∏ô‡∏ï‡∏•‡∏≠‡∏î‡πÑ‡∏õ', '‡∏ô‡∏≠‡∏ô‡∏ï‡∏•‡∏≠‡∏î‡πÑ‡∏õ',
            "‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡∏≠‡∏¢‡∏π‡πà‡∏ï‡πà‡∏≠", "‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏π‡πà‡∏ï‡πà‡∏≠", 
            '‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡∏≠‡∏¢‡∏π‡πà‡∏ö‡∏ô‡πÇ‡∏•‡∏Å',	'‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏π‡πà‡∏ö‡∏ô‡πÇ‡∏•‡∏Å',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡∏´‡∏ô‡∏µ‡πÑ‡∏õ‡∏à‡∏≤‡∏Å‡πÇ‡∏•‡∏Å',	'‡∏´‡∏ô‡∏µ‡πÑ‡∏õ‡∏à‡∏≤‡∏Å‡πÇ‡∏•‡∏Å',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡πÉ‡∏´‡πâ‡πÇ‡∏•‡∏Å‡πÅ‡∏ï‡∏Å', '‡πÇ‡∏•‡∏Å‡πÅ‡∏ï‡∏Å',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡πÉ‡∏´‡πâ‡πÇ‡∏•‡∏Å‡∏´‡∏≤‡∏¢‡πÑ‡∏õ',	'‡πÇ‡∏•‡∏Å‡∏´‡∏≤‡∏¢‡πÑ‡∏õ',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡πÉ‡∏´‡πâ‡πÇ‡∏•‡∏Å‡∏à‡∏ö‡∏™‡∏¥‡πâ‡∏ô',	'‡πÇ‡∏•‡∏Å‡∏à‡∏ö‡∏™‡∏¥‡πâ‡∏ô',
            '‡∏≠‡∏¢‡∏≤‡∏Å‡πÉ‡∏´‡πâ‡∏£‡∏ñ‡∏ä‡∏ô‡∏ï‡∏≤‡∏¢', '‡∏≠‡∏¢‡∏≤‡∏Å‡πÉ‡∏´‡πâ‡∏£‡∏ñ‡∏ä‡∏ô',
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏≤‡∏¢", "‡∏¢‡∏≤‡∏Å‡∏ï‡∏≤‡∏¢",
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏à‡∏ö‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï", "‡∏à‡∏ö‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï", 
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏™‡∏¥‡πâ‡∏ô‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï", "‡∏™‡∏¥‡πâ‡∏ô‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï",
            "‡∏≠‡∏¢‡∏≤‡∏Å‡∏´‡∏¢‡∏∏‡∏î‡∏´‡∏≤‡∏¢‡πÉ‡∏à", "‡∏´‡∏¢‡∏∏‡∏î‡∏´‡∏≤‡∏¢‡πÉ‡∏à",
            'die',	'dies',	'died',	'dying', 'dead',	'death',  'deaths']

# Function to assign labels based on keywords
def assign_label(comment):
    if any(keyword in comment for keyword in keywords1):
        return '1'
    elif any(keyword in comment for keyword in keywords2):
        return '2'
    else:
        return '0'

# Apply the function to the DataFrame
alljit_youtube_vdo_comment['label'] = alljit_youtube_vdo_comment['comment'].apply(assign_label)

# Display the updated DataFrame
print(alljit_youtube_vdo_comment.shape)
print(alljit_youtube_vdo_comment.columns)
print(alljit_youtube_vdo_comment["label"].value_counts())
print(alljit_youtube_vdo_comment["label"].value_counts(normalize=True))

(13862, 5)
Index(['videoId', 'title', 'comment', 'date', 'label'], dtype='object')
label
0    13137
2      458
1      267
Name: count, dtype: int64
label
0    0.947699
2    0.033040
1    0.019261
Name: proportion, dtype: float64


In [9]:
alljit_youtube_vdo_comment[alljit_youtube_vdo_comment["label"] == "1"].head(10)

Unnamed: 0,videoId,title,comment,date,label
219,-MLAQgbLVxU,‡∏≠‡∏¢‡∏π‡πà‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ô ‡∏ä‡∏≠‡∏ö‡πÉ‡∏™‡πà‡∏≠‡∏≤‡∏£‡∏°‡∏ì‡πå ‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ô‡πÉ‡∏Å‡∏•‡πâ‡∏ï‡∏±‡∏ß ‡∏£‡∏±‡∏ö‡∏°‡∏∑‡∏≠‡∏≠‡∏¢‡πà‡∏≤...,‡πÅ‡∏°‡πà ‡πÄ‡∏´‡∏ô‡∏∑‡πà‡∏≠‡∏¢‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡∏ô‡∏≥‡∏≠‡∏≤‡∏£‡∏°‡∏ì‡πå ‡∏°‡∏≤‡∏•‡∏á‡∏Å‡∏±‡∏ö‡πÅ‡∏°‡πà ‡∏ö‡πà‡∏≠‡∏¢ ‡∏ó‡∏≥...,2022-10-30T14:31:42Z,1
262,-Ne2UBF-_wI,‡∏£‡∏±‡∏Å‡πÅ‡∏ó‡πâ ‡∏ó‡∏µ‡πà‡πÅ‡∏°‡πâ‡∏≠‡∏≤‡∏à‡∏à‡∏∞‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡πÉ‡∏ä‡πâ‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï‡∏£‡πà‡∏ß‡∏°‡∏Å‡∏±‡∏ô - ‡πÉ‡∏ä‡πâ‡πÉ...,‡πÄ‡∏Ñ‡∏¢‡∏î‡∏π‡∏´‡∏ô‡∏±‡∏á‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏ô‡∏µ‡πâ‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏´‡∏•‡∏≤‡∏¢‡∏õ‡∏µ‡∏Å‡πà‡∏≠‡∏ô...‡∏¢‡πâ‡∏≠‡∏ô‡∏ô‡∏∂‡∏Å‡∏ñ‡∏∂‡∏á...,2022-01-08T22:43:42Z,1
490,1U_QQas5jfY,‡πÇ‡∏•‡∏Å‡∏ã‡∏∂‡∏°‡πÄ‡∏®‡∏£‡πâ‡∏≤ ‡∏ä‡πà‡∏ß‡∏á‡πÄ‡∏ß‡∏•‡∏≤‡∏ó‡∏µ‡πà‡∏¢‡∏≤‡∏Å‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î ‡πÉ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ó‡∏µ‡πà‡πÄ‡∏õ‡πá‡∏ô ...,‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏≤‡∏¢‡∏°‡∏≤‡∏î ‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏≤‡∏¢‡∏ó‡∏∏‡∏Å‡∏ß‡∏±‡∏ô ‡πÄ‡∏ö‡∏∑‡πà‡∏≠‡∏Ñ‡∏£‡∏≠‡∏ö‡∏Ñ‡∏£‡∏±‡∏ß ‡πÄ‡∏ö‡∏∑‡πà‡∏≠‡∏Ñ‡∏≥...,2023-11-16T12:41:57Z,1
506,1U_QQas5jfY,‡πÇ‡∏•‡∏Å‡∏ã‡∏∂‡∏°‡πÄ‡∏®‡∏£‡πâ‡∏≤ ‡∏ä‡πà‡∏ß‡∏á‡πÄ‡∏ß‡∏•‡∏≤‡∏ó‡∏µ‡πà‡∏¢‡∏≤‡∏Å‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î ‡πÉ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ó‡∏µ‡πà‡πÄ‡∏õ‡πá‡∏ô ...,‡∏ú‡∏°‡∏Ç‡∏≠‡∏ß‡∏¥‡∏ò‡∏µ‡∏£‡∏±‡∏Å‡∏©‡∏≤‡πÅ‡∏ö‡∏ö‡πÑ‡∏´‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏Ñ‡∏∏‡∏¢‡πÇ‡∏ó‡∏£...,2023-06-15T15:27:04Z,1
590,1y5viwF547s,‡∏´‡∏°‡∏î‡πÑ‡∏ü‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏á‡∏≤‡∏ô ‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡πÇ‡∏£‡∏Ñ‡∏ã‡∏∂‡∏°‡πÄ‡∏®‡∏£‡πâ‡∏≤ ‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏π‡πâ‡∏à‡∏±‡∏Å...,‡πÉ‡∏ä‡πà‡πÄ‡∏•‡∏¢‡∏Ñ‡πà‡∏∞ ‡∏≠‡∏±‡∏ô‡∏î‡∏±‡∏ö‡πÅ‡∏£‡∏Å‡∏Ñ‡∏∑‡∏≠‡∏Å‡∏£‡∏∞‡πÄ‡∏û‡∏≤‡∏∞ ‡∏ó‡πâ‡∏≠‡∏á‡∏≠‡∏∑‡∏î ‡∏≠‡∏≤‡∏´‡∏≤‡∏£‡πÑ‡∏°‡πà...,2022-02-05T10:21:23Z,1
672,2kz6Q-yhoQY,‡∏ä‡∏µ‡∏ß‡∏¥‡∏ï‡πÄ‡∏£‡∏≤‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡∏¢‡∏∑‡∏ô‡∏¢‡∏≤‡∏ß &quot;‡∏û‡∏≠‡∏ó‡∏µ‡πà‡∏à‡∏∞‡∏≠‡∏¢‡∏π‡πà‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏≠‡∏î‡∏ó...,‡∏Ñ‡∏¥‡∏î‡∏ß‡πà‡∏≤‡πÄ‡∏´‡∏ï‡∏∏‡∏ú‡∏•‡∏ó‡∏≤‡∏á‡∏Ñ‡∏ß‡∏≤‡∏°‡∏Ñ‡∏¥‡∏î‡∏Ç‡∏≠‡∏á‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏∏‡∏Ç‡∏Å‡∏±‡∏ö‡∏ä...,2023-08-22T03:02:34Z,1
761,2ngSlTLBG9M,‡∏£‡∏ß‡∏° Podcast ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ñ‡∏ô‡∏Ñ‡∏¥‡∏î‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡∏ä‡∏≠‡∏ö‡πÄ‡∏Å‡πá‡∏ö‡∏ó‡∏∏‡∏Å‡πÄ‡∏£‡∏∑‡πà...,‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏£‡∏±‡∏ö‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡∏°‡∏≤‡∏£‡∏±‡∏ö‡∏ü‡∏±‡∏á‡∏ä‡πà‡∏≠‡∏á‡∏ô‡∏µ‡πâ‡∏ú‡∏°‡∏Ñ‡∏á‡∏Ñ‡∏¥‡∏î‡∏™‡∏±‡πâ‡∏ô‡πÑ...,2022-12-25T04:49:24Z,1
860,2ngSlTLBG9M,‡∏£‡∏ß‡∏° Podcast ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ñ‡∏ô‡∏Ñ‡∏¥‡∏î‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡∏ä‡∏≠‡∏ö‡πÄ‡∏Å‡πá‡∏ö‡∏ó‡∏∏‡∏Å‡πÄ‡∏£‡∏∑‡πà...,‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πà‡∏ú‡∏°‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î‡∏°‡∏≤‡∏Å ‡∏ú‡∏°‡∏Å‡∏•‡∏±‡∏ß‡πÄ‡∏õ‡πá‡∏ô‡πÄ‡∏≠‡∏î‡∏™‡πå‚Äã ‡∏à‡∏≤‡∏Å‡∏°‡∏µ‡∏≠‡∏∞‡πÑ‡∏£‡∏Ñ‡∏£...,2022-08-25T13:44:20Z,1
972,2ngSlTLBG9M,‡∏£‡∏ß‡∏° Podcast ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ñ‡∏ô‡∏Ñ‡∏¥‡∏î‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡∏ä‡∏≠‡∏ö‡πÄ‡∏Å‡πá‡∏ö‡∏ó‡∏∏‡∏Å‡πÄ‡∏£‡∏∑‡πà...,‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡∏°‡∏µ‡∏ú‡∏•‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡∏ó‡∏≤‡∏á‡∏Å‡∏≤‡∏¢‡πÄ‡πÄ‡∏•‡πâ‡∏ß‡∏Ñ‡πà‡∏∞ ‡πÄ‡∏õ‡πá‡∏ô‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏Ç‡∏≠‡∏á‡∏•‡∏≥‡πÑ‡∏™...,2022-02-17T08:56:12Z,1
1008,2ngSlTLBG9M,‡∏£‡∏ß‡∏° Podcast ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ñ‡∏ô‡∏Ñ‡∏¥‡∏î‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡∏ä‡∏≠‡∏ö‡πÄ‡∏Å‡πá‡∏ö‡∏ó‡∏∏‡∏Å‡πÄ‡∏£‡∏∑‡πà...,‡∏Ñ‡∏¥‡∏î‡∏à‡∏ô‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î..‡πÄ‡∏Å‡∏∑‡∏≠‡∏ö‡∏Ñ‡∏¥‡∏î‡∏™‡∏±‡πâ‡∏ô,2021-11-26T13:54:49Z,1


In [10]:
alljit_youtube_vdo_comment[alljit_youtube_vdo_comment["label"] == "2"].head(10)

Unnamed: 0,videoId,title,comment,date,label
23,-eoXdCubG1Y,‡πÉ‡∏ô‡∏ß‡∏±‡∏ô‡∏ó‡∏µ‡πà‡∏ó‡∏∏‡∏Å‡∏Ç‡πå‡πÉ‡∏à ‡∏ó‡∏≥‡∏¢‡∏±‡∏á‡πÑ‡∏á‡∏ó‡∏≥‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÑ‡∏£ ‡πÉ‡∏´‡πâ‡∏ô‡∏≠‡∏ô‡∏´‡∏•‡∏±‡∏ö‡∏•‡∏á,‡∏´‡∏•‡∏±‡∏ö‡∏¢‡∏≤‡∏Å ‡∏Ç‡∏ô‡∏≤‡∏î‡∏´‡∏•‡∏±‡∏ö‡∏Å‡πá‡∏°‡∏µ‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏°‡∏≤‡∏Å‡∏°‡∏≤‡∏¢‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ô‡∏´‡∏±‡∏ß ‡∏ó‡∏≥‡πÉ‡∏´...,2023-04-16T16:41:39Z,2
428,1tfAGwMra2I,‡πÄ‡∏ã‡∏ü‡πÇ‡∏ã‡∏ô ‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡∏™‡∏ñ‡∏≤‡∏ô‡∏ó‡∏µ‡πà ‡πÅ‡∏ï‡πà Safe Zone ‡∏Ñ‡∏∑‡∏≠‡∏≠‡∏∞‡πÑ‡∏£? ‡πÑ‡∏°...,‡πÑ‡∏°‡πà‡∏£‡∏π‡πâ‡∏à‡∏∞‡∏õ‡∏£‡∏±‡∏ö‡∏à‡∏∞‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏à‡∏≤‡∏Å‡∏à‡∏∏‡∏î‡πÑ‡∏´‡∏ô‡πÅ‡∏•‡πâ‡∏ß ‡∏õ‡∏£‡∏±‡∏ö‡∏°‡∏≤‡∏™‡∏≤‡∏°‡∏™‡∏µ‡πà‡∏õ...,2023-03-03T12:51:45Z,2
469,1udq3QOVfLs,‡∏à‡∏¥‡∏ï‡∏ï‡∏Å‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏™‡∏≤‡πÄ‡∏´‡∏ï‡∏∏ ‡∏≠‡∏≤‡∏£‡∏°‡∏ì‡πå‡∏î‡∏≤‡∏ß‡∏ô‡πå‡πÄ‡∏Å‡∏¥‡∏î‡∏Ç‡∏∂‡πâ‡∏ô‡πÄ‡∏≠‡∏á‡∏ö‡πà‡∏≠‡∏¢‡πÜ ‡∏£...,‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å‡∏î‡∏≤‡∏ß‡∏ô‡πå‡∏°‡∏±‡∏ô‡∏°‡∏≤‡∏à‡∏≤‡∏Å‡∏õ‡∏£‡∏∞‡∏™‡∏ö‡∏Å‡∏≤‡∏£‡∏ì‡πå‡∏Å‡∏£‡∏∞‡∏ó‡∏ö‡πÉ‡∏ô‡∏≠‡∏î‡∏µ‡∏ï ‡∏ó...,2022-10-27T15:20:34Z,2
519,1U_QQas5jfY,‡πÇ‡∏•‡∏Å‡∏ã‡∏∂‡∏°‡πÄ‡∏®‡∏£‡πâ‡∏≤ ‡∏ä‡πà‡∏ß‡∏á‡πÄ‡∏ß‡∏•‡∏≤‡∏ó‡∏µ‡πà‡∏¢‡∏≤‡∏Å‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î ‡πÉ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ó‡∏µ‡πà‡πÄ‡∏õ‡πá‡∏ô ...,‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏à‡∏∞‡∏î‡∏µ‡∏Ç‡∏∂‡πâ‡∏ô‡πÅ‡∏ï‡πà‡∏ö‡∏≤‡∏á‡∏Ñ‡∏£‡∏±‡πâ‡∏á‡∏Å‡πá‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏à‡∏∞‡πÅ‡∏¢‡πà‡∏•‡∏á ‡∏≠‡∏¢‡∏≤‡∏Å‡∏´...,2023-03-03T05:11:20Z,2
583,1y5viwF547s,‡∏´‡∏°‡∏î‡πÑ‡∏ü‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏á‡∏≤‡∏ô ‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡πÇ‡∏£‡∏Ñ‡∏ã‡∏∂‡∏°‡πÄ‡∏®‡∏£‡πâ‡∏≤ ‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏π‡πâ‡∏à‡∏±‡∏Å...,‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏á‡∏≤‡∏ô ‡∏ä‡πà‡∏ß‡∏á ‡∏ï‡∏±‡πâ‡∏á‡πÅ‡∏ï‡πà‡πÄ‡∏î‡∏∑‡∏≠‡∏ô ‡∏Å.‡∏û 65 ‡∏ó‡∏µ‡πà‡∏ú‡πà‡∏≤‡∏ô‡∏°‡∏≤ ‡∏Ñ...,2022-05-19T14:35:55Z,2
596,1y5viwF547s,‡∏´‡∏°‡∏î‡πÑ‡∏ü‡πÉ‡∏ô‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏á‡∏≤‡∏ô ‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πà‡πÇ‡∏£‡∏Ñ‡∏ã‡∏∂‡∏°‡πÄ‡∏®‡∏£‡πâ‡∏≤ ‡∏ó‡∏≥‡∏Ñ‡∏ß‡∏≤‡∏°‡∏£‡∏π‡πâ‡∏à‡∏±‡∏Å...,‡πÉ‡∏ä‡πà‡∏Ñ‡πà‡∏∞ ‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡∏ï‡∏∑‡πà‡∏ô‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡∏ó‡∏≥‡∏≠‡∏∞‡πÑ‡∏£‡πÑ‡∏°‡πà‡∏≠‡∏¢‡∏≤‡∏Å‡∏Å‡∏¥‡∏ô ‡πÑ‡∏°‡πà‡∏´...,2021-06-07T18:24:16Z,2
710,2l4Uz_qQGHU,‡πÇ‡∏´‡∏¢‡∏´‡∏≤‡∏≠‡∏î‡∏µ‡∏ï ‡∏ô‡∏∂‡∏Å‡∏ñ‡∏∂‡∏á‡πÅ‡∏ï‡πà‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ó‡∏£‡∏á‡∏à‡∏≥‡∏ó‡∏µ‡πà‡∏ú‡πà‡∏≤‡∏ô‡∏°‡∏≤,‡∏ß‡∏±‡∏ô‡∏ô‡∏µ‡πâ‡∏ü‡∏±‡∏á‡πÅ‡∏•‡πâ‡∏ß‡∏£‡∏π‡πâ‡∏™‡∏∂‡∏Å‡πÇ‡∏î‡∏î‡πÄ‡∏î‡∏µ‡πà‡∏¢‡∏ß‡∏°‡∏≤‡∏Å‡πÄ‡∏•‡∏¢‡∏Ñ‡πà‡∏∞ ‡∏ö‡πâ‡∏≤‡∏ô‡∏î‡∏¥‡∏â...,2022-10-21T04:17:29Z,2
725,2ngSlTLBG9M,‡∏£‡∏ß‡∏° Podcast ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ñ‡∏ô‡∏Ñ‡∏¥‡∏î‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡∏ä‡∏≠‡∏ö‡πÄ‡∏Å‡πá‡∏ö‡∏ó‡∏∏‡∏Å‡πÄ‡∏£‡∏∑‡πà...,‡πÄ‡∏õ‡∏ô‡∏Ñ‡∏ô‡∏Ñ‡∏¥‡∏î‡∏°‡∏≤‡∏Å‡∏Ñ‡∏∞ ‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏Ñ‡∏£‡∏≠‡∏ö‡∏Ñ‡∏£‡∏±‡∏ß ‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏™‡∏≤‡∏°‡∏µ ‡πÄ‡πÄ‡∏£‡πâ‡∏ß‡∏Å...,2023-08-28T02:30:33Z,2
996,2ngSlTLBG9M,‡∏£‡∏ß‡∏° Podcast ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ñ‡∏ô‡∏Ñ‡∏¥‡∏î‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡∏ä‡∏≠‡∏ö‡πÄ‡∏Å‡πá‡∏ö‡∏ó‡∏∏‡∏Å‡πÄ‡∏£‡∏∑‡πà...,‡πÅ‡∏°‡πà‡πÄ‡∏™‡∏µ‡∏¢‡∏°‡∏≤‡πÄ‡∏õ‡πá‡∏ô‡∏õ‡∏µ ‡∏ô‡∏≠‡∏ô‡πÑ‡∏°‡πà‡∏´‡∏•‡∏±‡∏ö‡πÇ‡∏ó‡∏©‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á ‡∏ï‡∏•‡∏≠‡∏î‡πÄ‡∏ß‡∏•‡∏≤‡∏ß‡πà...,2021-11-30T13:02:19Z,2
1019,2ngSlTLBG9M,‡∏£‡∏ß‡∏° Podcast ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ñ‡∏ô‡∏Ñ‡∏¥‡∏î‡∏°‡∏≤‡∏Å ‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡∏ä‡∏≠‡∏ö‡πÄ‡∏Å‡πá‡∏ö‡∏ó‡∏∏‡∏Å‡πÄ‡∏£‡∏∑‡πà...,‡πÅ‡∏•‡πâ‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á‡πÄ‡∏ä‡πà‡∏ô ‡∏≠‡∏¢‡∏π‡πà‡πÜ‡∏Å‡πá‡∏£‡πâ‡∏≠‡∏á‡πÑ‡∏´‡πâ‡∏≠‡∏≠‡∏Å‡∏°‡∏≤<br>‡∏ô‡∏≠‡∏ô‡πÜ‡∏≠‡∏¢‡∏π‡πà‡∏Å...,2021-11-23T15:37:38Z,2


In [11]:
alljit_youtube_vdo_comment[alljit_youtube_vdo_comment["label"] == "0"].head(10)

Unnamed: 0,videoId,title,comment,date,label
0,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô ‡∏£‡∏∞‡∏ö‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏ô‡πÉ‡∏à‡∏ü‡∏£‡∏µ üíì\r<br>‡∏Ñ‡∏∏‡∏ì‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ...,2021-10-06T18:00:11Z,0
1,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏Ñ‡∏£‡∏±‡∏ö,2023-11-26T02:07:02Z,0
2,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏≠‡∏¢‡πà‡∏≤‡∏¢‡∏∂‡∏î‡∏ï‡∏¥‡∏î‡∏Å‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡∏î‡∏µ ‡∏ï‡∏µ‡πÄ‡∏™‡πâ‡∏ô‡πÉ‡∏´‡πâ‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á‡πÅ‡∏•‡∏∞‡πÄ‡∏Ñ‡∏≤‡∏£‡∏û‡∏ã‡∏∑‡πà...,2023-11-10T14:50:33Z,0
3,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏°‡∏≤‡∏Å‡∏Ñ‡∏£‡∏±‡∏ö‚ù§,2023-09-07T02:25:58Z,0
4,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏ü‡∏±‡∏á‡∏Ñ‡∏∏‡∏ì‡∏≠‡∏µ‡∏ü‡∏û‡∏π‡∏î‡πÅ‡∏•‡πâ‡∏ß‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏≠‡πà‡∏≤‡∏ô‡πÉ‡∏à‡πÄ‡∏£‡∏≤‡∏≠‡∏≠‡∏Å‡πÄ‡∏•‡∏¢,2023-08-06T21:23:16Z,0
5,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏Ç‡∏≠‡∏ö‡∏Ñ‡∏∏‡∏ì‡∏°‡∏≤‡∏Å‡∏Ñ‡∏£‡∏±‡∏ö,2023-07-22T22:31:06Z,0
6,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏ó‡∏≥‡πÑ‡∏°‡∏û‡πà‡∏≠‡πÅ‡∏°‡πà‡∏ñ‡∏∂‡∏á‡πÄ‡∏≠‡∏≤‡πÅ‡∏ï‡πà‡∏ä‡πà‡∏°‡πÅ‡∏ï‡πà‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏Ñ‡πÑ‡∏°‡πà‡πÄ‡∏Ñ‡∏¢‡∏ä‡πà‡∏°‡∏ú‡∏°‡πÄ‡∏•‡∏¢‡πÄ...,2023-05-10T11:18:46Z,0
7,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏ñ‡∏≠‡∏¢‡∏´‡πà‡∏≤‡∏á‡∏à‡∏ô‡πÑ‡∏°‡πà‡∏°‡∏µ‡πÉ‡∏Ñ‡∏£‡∏Ñ‡∏ö‡πÅ‡∏•‡πâ‡∏ß‡∏Ñ‡πà‡∏≤üò¢,2023-03-29T01:31:24Z,0
8,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏£‡πâ‡∏≠‡∏á‡πÑ‡∏´‡πâ‡∏õ‡∏∞‡∏à‡∏≥‡∏Å‡∏±‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î,2022-10-12T12:02:25Z,0
9,-cd_OMv1GOA,‡∏≠‡∏¢‡πà‡∏≤‡πÄ‡∏Å‡πá‡∏ö‡∏Ñ‡∏≥‡∏û‡∏π‡∏î‡∏Ñ‡∏ô‡∏≠‡∏∑‡πà‡∏ô‡∏°‡∏≤‡∏Ñ‡∏¥‡∏î ‡∏û‡∏π‡∏î‡∏á‡πà‡∏≤‡∏¢‡πÅ‡∏ï‡πà‡∏ó‡∏≥‡∏¢‡∏≤‡∏Å ‡∏ñ‡πâ‡∏≤‡πÑ‡∏°...,‡∏ú‡∏°‡πÄ‡∏Å‡πá‡∏ö‡∏°‡∏≤‡∏à‡∏ô‡∏Ñ‡∏ô‡∏ó‡∏µ‡πà‡∏û‡∏π‡∏î‡∏£‡∏±‡∏ö‡∏ú‡∏•‡∏Å‡∏£‡∏£‡∏°‡πÄ‡∏•‡∏¢‡∏Ñ‡∏£‡∏±‡∏ö,2022-09-25T15:08:32Z,0


In [18]:
# remove the rows where 'comment' contains the specific phrase "‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô ‡∏£‡∏∞‡∏ö‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏ô‡πÉ‡∏à‡∏ü‡∏£‡∏µ" or "‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡∏£‡∏±‡∏ö‡∏ü‡∏±‡∏á Alljit Podcast ‡πÑ‡∏î‡πâ‡∏ó‡∏µ‡πà Spotify ‡πÅ‡∏•‡πâ‡∏ß"
alljit_youtube_vdo_comment = alljit_youtube_vdo_comment[~alljit_youtube_vdo_comment['comment'].str.contains('‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô ‡∏£‡∏∞‡∏ö‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏ô‡πÉ‡∏à‡∏ü‡∏£‡∏µ')]
alljit_youtube_vdo_comment = alljit_youtube_vdo_comment[~alljit_youtube_vdo_comment['comment'].str.contains('‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡∏£‡∏±‡∏ö‡∏ü‡∏±‡∏á Alljit Podcast ‡πÑ‡∏î‡πâ‡∏ó‡∏µ‡πà Spotify ‡πÅ‡∏•‡πâ‡∏ß')]
alljit_youtube_vdo_comment = alljit_youtube_vdo_comment[~alljit_youtube_vdo_comment['comment'].str.contains('Alljit ‡πÅ‡∏≠‡∏õ‡∏û‡∏•‡∏¥‡πÄ‡∏Ñ‡∏ä‡∏±‡∏ô‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏´‡∏±‡∏ß‡πÉ‡∏à')]
alljit_youtube_vdo_comment = alljit_youtube_vdo_comment[~alljit_youtube_vdo_comment['comment'].str.contains('‡∏™‡∏≤‡∏°‡∏≤‡∏£‡∏ñ‡πÄ‡∏õ‡πá‡∏ô‡∏ú‡∏π‡πâ‡∏™‡∏ô‡∏±‡∏ö‡∏™‡∏ô‡∏∏‡∏ô‡∏ä‡πà‡∏≠‡∏á Alljit')]
alljit_youtube_vdo_comment = alljit_youtube_vdo_comment[~alljit_youtube_vdo_comment['comment'].str.contains('‡∏û‡∏π‡∏î‡∏Ñ‡∏ß‡∏≤‡∏°‡πÉ‡∏ô‡πÉ‡∏à ‡∏£‡∏∞‡∏ö‡∏≤‡∏¢‡∏Å‡∏±‡∏ö‡πÄ‡∏£‡∏≤‡πÑ‡∏î‡πâ‡∏ü‡∏£‡∏µ')]

# show the shapeof the updated dataframe
print(alljit_youtube_vdo_comment.shape)

(13338, 5)


In [19]:
# Save the DataFrame to a CSV file in the folder "output - final"
alljit_youtube_vdo_comment.to_csv("../../output - final/alljit_youtube_vdo_comment.csv", index=False) # Do not include the index column