In [1]:
import os
import shutil
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob
from moviepy.editor import VideoFileClip
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Collect Data

In [2]:
neg_vids_dir = '/home/davendra/ml-prj/deepfake_extractions/videos/negative'
pos_vids_dir = '/home/davendra/ml-prj/deepfake_extractions/videos/positive'


neg_train_csv = '/home/davendra/ml-prj/context-aware-deepfake-detection/negative/training.csv'
neg_val_csv = '/home/davendra/ml-prj/context-aware-deepfake-detection/negative/validation.csv'
neg_test_csv = '/home/davendra/ml-prj/context-aware-deepfake-detection/negative/test.csv'

pos_train_csv = '/home/davendra/ml-prj/context-aware-deepfake-detection/positive/training.csv'
pos_val_csv = '/home/davendra/ml-prj/context-aware-deepfake-detection/positive/validation.csv'
pos_test_csv = '/home/davendra/ml-prj/context-aware-deepfake-detection/positive/test.csv'

train_dir = '/home/davendra/ml-prj/context-aware-deepfake-detection/train'
val_dir = '/home/davendra/ml-prj/context-aware-deepfake-detection/val'
test_dir = '/home/davendra/ml-prj/context-aware-deepfake-detection/test'

In [3]:
# Collect all data into dataframes

# Positive
pos_train_df = pd.read_csv(pos_train_csv).drop_duplicates('video_id', keep='first')
pos_val_df = pd.read_csv(pos_val_csv).drop_duplicates('video_id', keep='first')
pos_test_df = pd.read_csv(pos_test_csv).drop_duplicates('video_id', keep='first')

# Negative
neg_train_df = pd.read_csv(neg_train_csv).drop_duplicates('video_id', keep='first')
neg_val_df = pd.read_csv(neg_val_csv).drop_duplicates('video_id', keep='first')
neg_test_df = pd.read_csv(neg_test_csv).drop_duplicates('video_id', keep='first')


In [5]:
def add_video_durations(df, source_dir):
    """
    Appends the duration of video files to the DataFrame.

    Args:
    df (pd.DataFrame): DataFrame containing video metadata.
    source_dir (str): The directory path where video files are stored.

    Returns:
    pd.DataFrame: Updated DataFrame with a new column for video duration.
    """
    
    # Helper function to get the duration of a single video
    def get_duration(video_id):
        file_path = f"{source_dir}/{video_id}.mp4"
        try:
            with VideoFileClip(file_path) as clip:
                return clip.duration
        except Exception as e:
            print(f"Could not process {file_path}: {e}")
            return None
    
    # Applying the function to each row to calculate durations
    df['duration'] = df['video_id'].apply(get_duration)
    
    return df

In [6]:
pos_train_df = add_video_durations(pos_train_df, pos_vids_dir)
pos_val_df = add_video_durations(pos_val_df, pos_vids_dir)
pos_test_df = add_video_durations(pos_test_df, pos_vids_dir)

neg_train_df = add_video_durations(neg_train_df, neg_vids_dir)
neg_val_df = add_video_durations(neg_val_df, neg_vids_dir)
neg_test_df = add_video_durations(neg_test_df, neg_vids_dir)

### Eliminate Videos longer than 60 seconds

In [7]:
print(f'Len of pos_train_df: {len(pos_train_df)}')
print(f'Len of pos_val_df: {len(pos_val_df)}')
print(f'Len of pos_test_df: {len(pos_test_df)}')

print(f'Len of neg_train_df: {len(neg_train_df)}')
print(f'Len of neg_val_df: {len(neg_val_df)}')
print(f'Len of neg_test_df: {len(neg_test_df)}')

Len of pos_train_df: 303
Len of pos_val_df: 43
Len of pos_test_df: 88
Len of neg_train_df: 317
Len of neg_val_df: 46
Len of neg_test_df: 93


In [8]:
pos_train_df = pos_train_df.query('duration <= 60')  
pos_val_df = pos_val_df.query('duration <= 60')  
pos_test_df = pos_test_df.query('duration <= 60')  

neg_train_df = neg_train_df.query('duration <= 60')  
neg_val_df = neg_val_df.query('duration <= 60')  
neg_test_df = neg_test_df.query('duration <= 60')  

In [9]:
print(f'Len of pos_train_df: {len(pos_train_df)}')
print(f'Len of pos_val_df: {len(pos_val_df)}')
print(f'Len of pos_test_df: {len(pos_test_df)}')

print(f'Len of neg_train_df: {len(neg_train_df)}')
print(f'Len of neg_val_df: {len(neg_val_df)}')
print(f'Len of neg_test_df: {len(neg_test_df)}')

Len of pos_train_df: 176
Len of pos_val_df: 26
Len of pos_test_df: 46
Len of neg_train_df: 195
Len of neg_val_df: 26
Len of neg_test_df: 51


### Display Negative Data

In [10]:
display(neg_train_df)
print(f'Size of Negative Train Data: {len(neg_train_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
0,NIAnOmqK2k0,Yikes! Joe Biden On Why He Was So Irresponsibl...,,https://www.youtube.com/watch?v=NIAnOmqK2k0,idk should I feel sad or mad at him<sep>&quot;...,0,15.05
3,PNKWjqW8kvI,"Biden Gets Confused (Again): ""Middle Class Did...",Comments made on 4/17/24,https://www.youtube.com/watch?v=PNKWjqW8kvI,What?<sep>This is funny but dueable. All th...,0,8.71
4,iKKRU3ALjb0,Republicans want a national ban on abortion,Follow Joe!Joe’s Twitter: https://twitter.com/...,https://www.youtube.com/watch?v=iKKRU3ALjb0,FJB.<br>He&#39;s to old and his brain doesn&#3...,0,15.23
5,N-fELwFvM64,Haley ends Republican presidential campaign,Former U.N. ambassador Nikki Haley ended her b...,https://www.youtube.com/watch?v=N-fELwFvM64,Your voice is horrible.😂🎉<sep>That congratulat...,0,44.77
6,r94vuvwUSkY,What? Meme,Sub and Like,https://www.youtube.com/watch?v=r94vuvwUSkY,Lol<sep>Is that Russel westbrook?<sep>6 sec ad...,0,5.57
...,...,...,...,...,...,...,...
314,TLw64MCra3U,Rishi Sunak frees up millions of GP appointments,9 in 10 pharmacies across England are now offe...,https://www.youtube.com/watch?v=TLw64MCra3U,"If I need to see the GP, it&#39;s because it&#...",0,37.96
315,liII_ikKCIo,Qui menace ?,,https://www.youtube.com/watch?v=liII_ikKCIo,P. A. F<sep>P. A. F<sep>Monsieur Macron pour ê...,0,59.58
316,yyaEuBNhpqY,Courteney Cox Wants Zac Efron as a Love Intere...,SUBSCRIBE: https://ellen.tv/3D6Sewq The place ...,https://www.youtube.com/watch?v=yyaEuBNhpqY,I saw a short video of Ellen that was with Nev...,0,34.85
318,rs3vQ49mfrY,"""I'm IGN"" #KeanuReeves #johnwick #redcarpet #m...",,https://www.youtube.com/watch?v=rs3vQ49mfrY,Tan bello y humilde ❤<sep>They can shoot their...,0,16.90


Size of Negative Train Data: 195


In [11]:
display(neg_val_df)
print(f'Size of Negative Val Data: {len(neg_val_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
2,xzm_IlrpK_Y,"ראש הממשלה נתניהו במערך 9900: ""אנחנו אומרים למ...",ראש הממשלה בנימין נתניהו ביקר היום (ג׳) במערך ...,https://www.youtube.com/watch?v=xzm_IlrpK_Y,أبو عبيدة الرايق :<br>باقي ماشفنا منك حتى شيكل...,0,25.38
3,R1DaTvQXwpY,Impeached DHS Sec. Alejandro Mayorkas: The sou...,,https://www.youtube.com/watch?v=R1DaTvQXwpY,More hot air.<sep>Genital head is lying<sep>Su...,0,44.14
5,_I9vGD4j1_U,BREAKING: Joe Biden Says One Of The Classified...,,https://www.youtube.com/watch?v=_I9vGD4j1_U,Why isn&#39;t he arrested for treason<sep>Put ...,0,26.45
6,s_zva6pf3oA,Donald Trump Predicts his Victory in 1988,💰 Welcome to WealthTeachers – your hub for lon...,https://www.youtube.com/watch?v=s_zva6pf3oA,And he is one of those making fools fool!<sep>...,0,24.09
8,YIWdxO2J-Tw,The Top 5 Moments of Taylor Swift’s Life 🖐 #sh...,SUBSCRIBE: https://ellen.tv/3D6Sewq The place ...,https://www.youtube.com/watch?v=YIWdxO2J-Tw,I ❤ Taylor swift<sep>They weren&#39;t talking ...,0,59.16
10,527-1MrJYUE,Laxmi Raai 😍 Spotted at Kromakay Salon in Juhu...,,https://www.youtube.com/watch?v=527-1MrJYUE,Aunty I love you<sep>beautiful<sep>En logo ko ...,0,54.61
15,DKB4Mf5PVFY,Trump being a sigma… #alphamale #masculinity #...,,https://www.youtube.com/watch?v=DKB4Mf5PVFY,🇺🇸MEN - HAD ENOUGH ?!? 🛩️ Exit The 🇺🇸 🏍 🏌🏽‍♂️🏎...,0,16.9
16,wswxrDiSiHI,Obama's Quick Response To Sarcastic Republican...,Watch Barack Obama's response when Republicans...,https://www.youtube.com/watch?v=wswxrDiSiHI,Votor fraudulent wins.<sep>ISIS and 65k dollar...,0,24.96
17,whnY9-ugr2w,Trump sells Bibles after years of touting his ...,Former president Donald Trump trumpeted his lo...,https://www.youtube.com/watch?v=whnY9-ugr2w,Trump shampoo<sep>WHAT A JOKE!!!<sep>Why didn’...,0,59.88
18,t7xDb82TqiI,Nvidia's Ceo On Greatness | Jensen Huang,Original Video: https://www.youtube.com/watch?...,https://www.youtube.com/watch?v=t7xDb82TqiI,Wow<sep>His analogy and wording comes off as h...,0,33.95


Size of Negative Val Data: 26


In [12]:
display(neg_test_df)
print(f'Size of Negative Test Data: {len(neg_test_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
0,xAPIjhkXF-0,Tom Cruise gives the most epic answer ever.,,https://www.youtube.com/watch?v=xAPIjhkXF-0,"Over +150,000 views! Thank you 🙌 <br><br>Wat...",0,12.89
3,zOh1-WHonds,Shehnaaz Kaur Gill at Lakme Fashion Week 😍💖📸 #...,,https://www.youtube.com/watch?v=zOh1-WHonds,Shehnaz ❤😊<sep>Real❤<sep>❤❤❤❤❤<sep>Pure soul b...,0,40.61
4,zotkJwZ9PVY,These are young adults,,https://www.youtube.com/watch?v=zotkJwZ9PVY,I use to umpire baseball and this one guy was ...,0,55.4
5,JZV3aZfextU,Come with me,via YouTube Capture,https://www.youtube.com/watch?v=JZV3aZfextU,Yeah! I wanna <b>LIFT</b><sep>Hell yeah<sep>ID...,0,4.25
7,xdbvjb9-fpg,I wish I was exaggerating,,https://www.youtube.com/watch?v=xdbvjb9-fpg,Big fan<sep>Even the roaches said u gotta chec...,0,50.02
9,bm_YV-WorZ8,Anant Ambani conversation with Meta CEO Mark Z...,Anant Ambani with Meta CEO zuck#ambani #ambani...,https://www.youtube.com/watch?v=bm_YV-WorZ8,NPC Lag Rahay Hein<sep>300 crore watch?<sep>me...,0,43.68
10,myS3A6p5xuA,"Vladimir Putin, Kim Jong Un Toast to Peace at ...",Kim Jong Un sought Vladimir Putin’s help in re...,https://www.youtube.com/watch?v=myS3A6p5xuA,Putin dan kim sbetulnya mereka tdk suka mafia ...,0,19.09
11,z05Bk3hNVWE,The Trump Cabinet members refusing to endorse him,"Over the past two years, half a dozen former T...",https://www.youtube.com/watch?v=z05Bk3hNVWE,They will still vote for him...😂<sep>Strong wo...,0,56.22
13,pJpJLiMAV-8,"""Who Will Keep You Safer As President?"" | Joe ...",Join our campaign: http://www.joebiden.comFoll...,https://www.youtube.com/watch?v=pJpJLiMAV-8,,0,43.31
15,CovaEMP2pBY,Hear what Biden said about Putin during surpri...,President Joe Biden made a surprise visit to U...,https://www.youtube.com/watch?v=CovaEMP2pBY,Πιστευω οτι το πολιτικο επιπεδο του Πουτιν ειν...,0,45.93


Size of Negative Test Data: 51


### Display Positive Data

In [13]:
display(pos_train_df)
print(f'Size of Positive Train Data: {len(pos_train_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
0,KabnUV5luJ8,Cersei Lannister is Melania Trump [ Deepfake ],Enjoy :) And PLEASE Like & Subscribe if you wa...,https://www.youtube.com/watch?v=KabnUV5luJ8,Please name of the software ?<sep>Deep fake<se...,1,50.90
2,6aSoP0mDM0g,Obama ACTUALLY Does the Obamehameha (REAL),Sleepy Joe's Biden Blast is one thing.Big Mike...,https://www.youtube.com/watch?v=6aSoP0mDM0g,Now we need Trumps card to block the attacks a...,1,22.27
3,H_RFyF-tgZ4,DEEPFAKE | THOR Selling Phones in INDIA | Aven...,Deepfake | Thor at Mobile Shop in India 😂😂What...,https://www.youtube.com/watch?v=H_RFyF-tgZ4,How do you guys not have millions of subscribe...,1,36.13
4,E_hjqpXgUO0,Donald Trump gets all Fat and Sassy [ Deepfake ],ENJOY!Check out our community forums:https://o...,https://www.youtube.com/watch?v=E_hjqpXgUO0,Same. 😒<sep>Lol the way the head moves.,1,10.38
5,pQuS5Pmrtnk,Mark Zuckerberg flirting with Trump [Deepfake],New deepfake featuring Mark Zuckerberg and Don...,https://www.youtube.com/watch?v=pQuS5Pmrtnk,"<a href=""https://shutr.bz/3H7OcGG"">https://shu...",1,38.45
...,...,...,...,...,...,...,...
298,B68GxCrs5rI,Problem Сhild / Macaulay Culkin (DeepFake) Tik...,Do not click 😎 https://www.youtube.com/c/AnyFa...,https://www.youtube.com/watch?v=B68GxCrs5rI,,1,23.27
299,OF2Hrq2aK5k,Presidential Tales: Kidnappers Shoes !,"Presidential Tales: Biden, Trump, Clinton, Oba...",https://www.youtube.com/watch?v=OF2Hrq2aK5k,Funny! 👍<sep>That Blank expression of Biden af...,1,21.73
300,UkjmTzUZbdQ,Flying Cars [RIFE AI Slow-Mo],"This a test of new neural network called RIFE,...",https://www.youtube.com/watch?v=UkjmTzUZbdQ,"<a href=""https://shutr.bz/3H7OcGG"">https://shu...",1,30.30
301,jxxzi4es0Ko,Ace Ventura / Ben Stiller (DeepFake) TikTok #S...,Do not click 😎 https://www.youtube.com/c/AnyF...,https://www.youtube.com/watch?v=jxxzi4es0Ko,,1,48.65


Size of Positive Train Data: 176


In [14]:
display(pos_val_df)
print(f'Size of Positive Val Data: {len(pos_val_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
1,AcQdS9M7P5I,Ghost Caught On Traffic CCTV Camera,Scary Ghost CCTV FootagePatreon: https://www.p...,https://www.youtube.com/watch?v=AcQdS9M7P5I,Very cool.<sep>Omg is this really may I take t...,1,30.86
2,9J54CMpQO1A,Presidential Tales: My Wife Birthday...,"Presidential Tales: Biden, Trump, Clinton, Oba...",https://www.youtube.com/watch?v=9J54CMpQO1A,"So, I bought her nothing 😂😂😂😂😂<sep>So fake... ...",1,24.5
4,lDR2lmCO6iM,Scarlett Johansson eating Spaghetti,This video has been created by an AI for exper...,https://www.youtube.com/watch?v=lDR2lmCO6iM,She is beautiful and saucy! 😃❤<sep>everyone mu...,1,39.82
8,4Qwyb-JZQuA,Men in Black / Kevin Hart (DeepFake) TikTok #S...,Do not click 😎 https://www.youtube.com/c/AnyF...,https://www.youtube.com/watch?v=4Qwyb-JZQuA,,1,24.68
10,m6teLPZLMyI,C3PO's looks at his REAL FRIENDS before he die...,This is an edit from the Star Wars Rise of Sky...,https://www.youtube.com/watch?v=m6teLPZLMyI,If this doesn&#39;t make you feel something in...,1,16.04
11,vyFzuGF_kmM,Kevin Spacey is back to acting - as Donald Tru...,ENJOY!Check out our community forums:https://o...,https://www.youtube.com/watch?v=vyFzuGF_kmM,Who is that lol<sep>He looks Asian now lmao<se...,1,57.4
12,jbvlVo8WLOU,Aladdin / Amber Heard (DeepFake) TikTok #Shorts,Do not click 😎 https://www.youtube.com/c/AnyF...,https://www.youtube.com/watch?v=jbvlVo8WLOU,,1,23.41
14,320yMfXjHd4,Presidential Tales: I Accidentally Swallowed,"Presidential Tales: Biden, Trump, Clinton, Oba...",https://www.youtube.com/watch?v=320yMfXjHd4,A script to die for !<sep>it was actualy funny...,1,25.63
15,htYo_eAcwHQ,Presidential Tales: Pregnant Barbie Doll,"Presidential Tales: Biden, Trump, Clinton, Oba...",https://www.youtube.com/watch?v=htYo_eAcwHQ,"So many people hate GW Bush, but i would reall...",1,25.38
17,F4DrMNC23uw,The Voyage of Doctor Dolittle / Leonardo DiCap...,Do not click 😎 https://www.youtube.com/c/AnyF...,https://www.youtube.com/watch?v=F4DrMNC23uw,,1,11.98


Size of Positive Val Data: 26


In [15]:
display(pos_test_df)
print(f'Size of Positive Test Data: {len(pos_test_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
1,Xj0vm2n90SU,The Hangover /Chris Evans (DeepFake) TikTok #S...,Do not click 😎 https://www.youtube.com/c/AnyF...,https://www.youtube.com/watch?v=Xj0vm2n90SU,,1,32.02
2,0M7XvqtWUWc,Interview with the Vampire / Robert Pattinson ...,Do not click 😎 https://www.youtube.com/c/AnyF...,https://www.youtube.com/watch?v=0M7XvqtWUWc,,1,33.11
3,8eMnD0QSZqs,"Donald Trump with Mike ""Kathy Griffin"" Pence [...",Nice makeup job Mike!ENJOY!Check out our commu...,https://www.youtube.com/watch?v=8eMnD0QSZqs,That was so disturbing dude lol. It didn&#39;t...,1,40.38
4,Hli_IGInsZc,Terminator T-800,This video has been created by an AI for exper...,https://www.youtube.com/watch?v=Hli_IGInsZc,Creepinator T-800<sep>Does anyone know what ai...,1,42.28
5,mC-lE6_MrfI,Deepfake | Sanjay Dutt as Arnold | Terminator,Hey ! Guys we have come up with another Deepfa...,https://www.youtube.com/watch?v=mC-lE6_MrfI,Hollywood - Arnold Schwarzenegger<br>Bollywood...,1,59.68
6,VOBulYr3swc,Chloe from Detroit Become Human talks about Cr...,This video has been created by an AI for exper...,https://www.youtube.com/watch?v=VOBulYr3swc,can you make an AI of my smile?<sep>Is that de...,1,44.88
7,9utpya9UZwU,Mr Beast and PewDiePie at the playground,This video has been created by an AI for exper...,https://www.youtube.com/watch?v=9utpya9UZwU,XDDDDDDDDDD<sep>Nice<sep>They looks like disgu...,1,49.62
8,Ktg3uLCcRX8,Michael Jackson Demo (private video) [ Deepfa...,,https://www.youtube.com/watch?v=Ktg3uLCcRX8,,1,15.77
12,a15h8twMfqs,Dr. Phil sings Dama Da Ne (Baka Mitai Meme),https://www.instagram.com/burritoglasses/https...,https://www.youtube.com/watch?v=a15h8twMfqs,I love technology. Bring on the A.I overloads....,1,27.89
13,qCVJprRg3xI,Presidential Tales: Living In China...,"Presidential Tales: Biden, Trump, Clinton, Oba...",https://www.youtube.com/watch?v=qCVJprRg3xI,Karmala looks like she has had Adam&#39;s appl...,1,28.91


Size of Positive Test Data: 46


### Some Cleaning Functions

Cleaning Text
1. Remove Punctuations and stopwords
2. Remove <seps>
3. Remove HTML tags and url
4. Demojize
5. Expand abbreviations and contractions (use python contraction library)
6. Expand colloquialisms
7. Translate Foreign Languages to english
8. Convert to Lower case
9. Fix grammatical errors
10. Lemmatization

In [16]:
import re
import string
import emoji

def clean_text(text):
    # Convert input to string
    text = str(text)
    
    # Remove <sep> tags
    text = re.sub(r'<sep>', ' ', text)
    
    # Remove HTML tags
    text = re.sub(r'<[^>]*>', ' ', text)
    
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    
    # Remove punctuation marks
    text = text.translate(str.maketrans(' ', ' ', string.punctuation))
    
    # Remove emojis
    text = emoji.replace_emoji(text, replace='')
    
    # Convert text to lowercase
    text = text.lower()
    
    # Remove extra whitespace
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    
    return text

In [17]:
text = '<sep>Lol the way the head moves'
clean_text(text)

'lol the way the head moves'

### Youtube Scraping

In [161]:
# api_key = 'AIzaSyDyCuxE5dBSUkUypRMU4OojLn__g7-Wujo'
# videoID = ''
# get_comments(apikey=api_key, videoID=videoID)

In [162]:
from googleapiclient.discovery import build

def get_comments(apikey=None, videoID=None):
    # Replace with your actual API key
    API_KEY = apikey

    # Create a YouTube Data API client
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    # Replace with the ID of the video you want to retrieve comments from
    VIDEO_ID = videoID

    # Retrieve the comments for the video
    response = youtube.commentThreads().list(
        part='snippet',
        videoId=VIDEO_ID,
        textFormat='plainText'
    ).execute()

    # Process the comments
    for item in response['items']:
        comment = item['snippet']['topLevelComment']['snippet']
        author = comment['authorDisplayName']
        text = comment['textDisplay']
        print(f'Author: {author}')
        print(f'Comment: {text}')
        print('---')

    # Check if there are more pages of comments
    while 'nextPageToken' in response:
        response = youtube.commentThreads().list(
            part='snippet',
            videoId=VIDEO_ID,
            pageToken=response['nextPageToken'],
            textFormat='plainText'
        ).execute()

        # Process the comments on the next page
        for item in response['items']:
            comment = item['snippet']['topLevelComment']['snippet']
            author = comment['authorDisplayName']
            text = comment['textDisplay']
            print(f'Author: {author}')
            print(f'Comment: {text}')
            print('---')

### Cleaning Positives

In [21]:
# Create a new DataFrame with the cleaned text for positive
cleaned_pos_train_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_pos_val_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleand_pos_test_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])

In [22]:
cleaned_pos_train_df['video_id'] = pos_train_df['video_id']
cleaned_pos_train_df['url'] = pos_train_df['url']
cleaned_pos_train_df['label'] = pos_train_df['label']
cleaned_pos_train_df['duration'] = pos_train_df['duration']
cleaned_pos_train_df['title'] = pos_train_df['title'].apply(clean_text)
cleaned_pos_train_df['description'] = pos_train_df['description'].apply(clean_text)
cleaned_pos_train_df['comments'] = pos_train_df['comments'].apply(clean_text)

In [23]:
display(cleaned_pos_train_df)
print(f'Size of Cleaned Positive Train Data: {len(cleaned_pos_train_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
0,KabnUV5luJ8,cersei lannister is melania trump deepfake,enjoy and please like subscribe if you want to...,https://www.youtube.com/watch?v=KabnUV5luJ8,please name of the software deep fake try putt...,1,50.90
2,6aSoP0mDM0g,obama actually does the obamehameha real,sleepy joes biden blast is one thingbig mikes ...,https://www.youtube.com/watch?v=6aSoP0mDM0g,now we need trumps card to block the attacks a...,1,22.27
3,H_RFyF-tgZ4,deepfake thor selling phones in india avengers,deepfake thor at mobile shop in india what if ...,https://www.youtube.com/watch?v=H_RFyF-tgZ4,how do you guys not have millions of subscribe...,1,36.13
4,E_hjqpXgUO0,donald trump gets all fat and sassy deepfake,enjoycheck out our community forums helping th...,https://www.youtube.com/watch?v=E_hjqpXgUO0,same lol the way the head moves,1,10.38
5,pQuS5Pmrtnk,mark zuckerberg flirting with trump deepfake,new deepfake featuring mark zuckerberg and don...,https://www.youtube.com/watch?v=pQuS5Pmrtnk,unlock the power of stock footage today nathan...,1,38.45
...,...,...,...,...,...,...,...
298,B68GxCrs5rI,problem сhild macaulay culkin deepfake tiktok ...,do not click a help of neural networks anyone ...,https://www.youtube.com/watch?v=B68GxCrs5rI,,1,23.27
299,OF2Hrq2aK5k,presidential tales kidnappers shoes,presidential tales biden trump clinton obama p...,https://www.youtube.com/watch?v=OF2Hrq2aK5k,funny that blank expression of biden after the...,1,21.73
300,UkjmTzUZbdQ,flying cars rife ai slowmo,this a test of new neural network called rife ...,https://www.youtube.com/watch?v=UkjmTzUZbdQ,unlock the power of stock footage today cool c...,1,30.30
301,jxxzi4es0Ko,ace ventura ben stiller deepfake tiktok shorts,do not click a help of neural networks anyone ...,https://www.youtube.com/watch?v=jxxzi4es0Ko,,1,48.65


Size of Cleaned Positive Train Data: 176


In [24]:
cleaned_pos_val_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_pos_val_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_pos_val_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])

cleaned_pos_val_df['video_id'] = pos_val_df['video_id']
cleaned_pos_val_df['url'] = pos_val_df['url']
cleaned_pos_val_df['label'] = pos_val_df['label']
cleaned_pos_val_df['duration'] = pos_val_df['duration']
cleaned_pos_val_df['title'] = pos_val_df['title'].apply(clean_text)
cleaned_pos_val_df['description'] = pos_val_df['description'].apply(clean_text)
cleaned_pos_val_df['comments'] = pos_val_df['comments'].apply(clean_text)

In [25]:
display(cleaned_pos_val_df)
print(f'Size of Cleaned Positive Val Data: {len(cleaned_pos_val_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
1,AcQdS9M7P5I,ghost caught on traffic cctv camera,scary ghost cctv footagepatreon copyright is c...,https://www.youtube.com/watch?v=AcQdS9M7P5I,very cool omg is this really may i take this v...,1,30.86
2,9J54CMpQO1A,presidential tales my wife birthday,presidential tales biden trump clinton obama p...,https://www.youtube.com/watch?v=9J54CMpQO1A,so i bought her nothing so fake the movement o...,1,24.5
4,lDR2lmCO6iM,scarlett johansson eating spaghetti,this video has been created by an ai for exper...,https://www.youtube.com/watch?v=lDR2lmCO6iM,she is beautiful and saucy everyone must eat s...,1,39.82
8,4Qwyb-JZQuA,men in black kevin hart deepfake tiktok shorts,do not click a help of neural networks anyone ...,https://www.youtube.com/watch?v=4Qwyb-JZQuA,,1,24.68
10,m6teLPZLMyI,c3pos looks at his real friends before he dies...,this is an edit from the star wars rise of sky...,https://www.youtube.com/watch?v=m6teLPZLMyI,if this doesn39t make you feel something insid...,1,16.04
11,vyFzuGF_kmM,kevin spacey is back to acting as donald trump...,enjoycheck out our community forums helping th...,https://www.youtube.com/watch?v=vyFzuGF_kmM,who is that lol he looks asian now lmao the be...,1,57.4
12,jbvlVo8WLOU,aladdin amber heard deepfake tiktok shorts,do not click a help of neural networks anyone ...,https://www.youtube.com/watch?v=jbvlVo8WLOU,,1,23.41
14,320yMfXjHd4,presidential tales i accidentally swallowed,presidential tales biden trump clinton obama p...,https://www.youtube.com/watch?v=320yMfXjHd4,a script to die for it was actualy funny excep...,1,25.63
15,htYo_eAcwHQ,presidential tales pregnant barbie doll,presidential tales biden trump clinton obama p...,https://www.youtube.com/watch?v=htYo_eAcwHQ,so many people hate gw bush but i would really...,1,25.38
17,F4DrMNC23uw,the voyage of doctor dolittle leonardo dicapri...,do not click a help of neural networks anyone ...,https://www.youtube.com/watch?v=F4DrMNC23uw,,1,11.98


Size of Cleaned Positive Val Data: 26


In [26]:
cleaned_pos_test_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_pos_test_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_pos_test_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])

cleaned_pos_test_df['video_id'] = pos_test_df['video_id']
cleaned_pos_test_df['url'] = pos_test_df['url']
cleaned_pos_test_df['label'] = pos_test_df['label']
cleaned_pos_test_df['duration'] = pos_test_df['duration']
cleaned_pos_test_df['title'] = pos_test_df['title'].apply(clean_text)
cleaned_pos_test_df['description'] = pos_test_df['description'].apply(clean_text)
cleaned_pos_test_df['comments'] = pos_test_df['comments'].apply(clean_text)

In [27]:
display(cleaned_pos_test_df)
print(f'Size of Cleaned Positive Test Data: {len(cleaned_pos_test_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
1,Xj0vm2n90SU,the hangover chris evans deepfake tiktok shorts,do not click a help of neural networks anyone ...,https://www.youtube.com/watch?v=Xj0vm2n90SU,,1,32.02
2,0M7XvqtWUWc,interview with the vampire robert pattinson de...,do not click a help of neural networks anyone ...,https://www.youtube.com/watch?v=0M7XvqtWUWc,,1,33.11
3,8eMnD0QSZqs,donald trump with mike kathy griffin pence dee...,nice makeup job mikeenjoycheck out our communi...,https://www.youtube.com/watch?v=8eMnD0QSZqs,that was so disturbing dude lol it didn39t loo...,1,40.38
4,Hli_IGInsZc,terminator t800,this video has been created by an ai for exper...,https://www.youtube.com/watch?v=Hli_IGInsZc,creepinator t800 does anyone know what ai mode...,1,42.28
5,mC-lE6_MrfI,deepfake sanjay dutt as arnold terminator,hey guys we have come up with another deepfake...,https://www.youtube.com/watch?v=mC-lE6_MrfI,hollywood arnold schwarzenegger bollywood sanj...,1,59.68
6,VOBulYr3swc,chloe from detroit become human talks about cr...,this video has been created by an ai for exper...,https://www.youtube.com/watch?v=VOBulYr3swc,can you make an ai of my smile is that detroit...,1,44.88
7,9utpya9UZwU,mr beast and pewdiepie at the playground,this video has been created by an ai for exper...,https://www.youtube.com/watch?v=9utpya9UZwU,xdddddddddd nice they looks like disguising al...,1,49.62
8,Ktg3uLCcRX8,michael jackson demo private video deepfake,,https://www.youtube.com/watch?v=Ktg3uLCcRX8,,1,15.77
12,a15h8twMfqs,dr phil sings dama da ne baka mitai meme,phildeepfacelab meme drphilthis type of video ...,https://www.youtube.com/watch?v=a15h8twMfqs,i love technology bring on the ai overloads th...,1,27.89
13,qCVJprRg3xI,presidential tales living in china,presidential tales biden trump clinton obama p...,https://www.youtube.com/watch?v=qCVJprRg3xI,karmala looks like she has had adam39s apple s...,1,28.91


Size of Cleaned Positive Test Data: 46


### Cleaning Negatives

In [28]:
# Create a new DataFrame with the cleaned text for negative
cleaned_neg_train_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_neg_val_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleand_neg_test_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])

In [29]:
cleaned_neg_train_df['video_id'] = neg_train_df['video_id']
cleaned_neg_train_df['url'] = neg_train_df['url']
cleaned_neg_train_df['label'] = neg_train_df['label']
cleaned_neg_train_df['duration'] = neg_train_df['duration']
cleaned_neg_train_df['title'] = neg_train_df['title'].apply(clean_text)
cleaned_neg_train_df['description'] = neg_train_df['description'].apply(clean_text)
cleaned_neg_train_df['comments'] = neg_train_df['comments'].apply(clean_text)

In [30]:
display(cleaned_neg_train_df)
print(f'Size of Cleaned negative Train Data: {len(cleaned_neg_train_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
0,NIAnOmqK2k0,yikes joe biden on why he was so irresponsible...,,https://www.youtube.com/watch?v=NIAnOmqK2k0,idk should i feel sad or mad at him quottheyqu...,0,15.05
3,PNKWjqW8kvI,biden gets confused again middle class didnt b...,comments made on 41724,https://www.youtube.com/watch?v=PNKWjqW8kvI,what this is funny but dueable all these illeg...,0,8.71
4,iKKRU3ALjb0,republicans want a national ban on abortion,follow joejoe’s twitter facebook instagram bid...,https://www.youtube.com/watch?v=iKKRU3ALjb0,fjb he39s to old and his brain doesn39t work k...,0,15.23
5,N-fELwFvM64,haley ends republican presidential campaign,former un ambassador nikki haley ended her bid...,https://www.youtube.com/watch?v=N-fELwFvM64,your voice is horrible that congratulations to...,0,44.77
6,r94vuvwUSkY,what meme,sub and like,https://www.youtube.com/watch?v=r94vuvwUSkY,lol is that russel westbrook 6 sec ad to watch...,0,5.57
...,...,...,...,...,...,...,...
314,TLw64MCra3U,rishi sunak frees up millions of gp appointments,9 in 10 pharmacies across england are now offe...,https://www.youtube.com/watch?v=TLw64MCra3U,if i need to see the gp it39s because it39s so...,0,37.96
315,liII_ikKCIo,qui menace,,https://www.youtube.com/watch?v=liII_ikKCIo,p a f p a f monsieur macron pour être un bon p...,0,59.58
316,yyaEuBNhpqY,courteney cox wants zac efron as a love intere...,subscribe the place for laughs joy stars surpr...,https://www.youtube.com/watch?v=yyaEuBNhpqY,i saw a short video of ellen that was with nev...,0,34.85
318,rs3vQ49mfrY,im ign keanureeves johnwick redcarpet movies s...,,https://www.youtube.com/watch?v=rs3vQ49mfrY,tan bello y humilde they can shoot their way i...,0,16.90


Size of Cleaned negative Train Data: 195


In [31]:
cleaned_neg_val_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_neg_val_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_neg_val_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])

cleaned_neg_val_df['video_id'] = neg_val_df['video_id']
cleaned_neg_val_df['url'] = neg_val_df['url']
cleaned_neg_val_df['label'] = neg_val_df['label']
cleaned_neg_val_df['duration'] = neg_val_df['duration']
cleaned_neg_val_df['title'] = neg_val_df['title'].apply(clean_text)
cleaned_neg_val_df['description'] = neg_val_df['description'].apply(clean_text)
cleaned_neg_val_df['comments'] = neg_val_df['comments'].apply(clean_text)

In [32]:
display(cleaned_neg_val_df)
print(f'Size of Cleaned negative Val Data: {len(cleaned_neg_val_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
2,xzm_IlrpK_Y,ראש הממשלה נתניהו במערך 9900 אנחנו אומרים למחב...,ראש הממשלה בנימין נתניהו ביקר היום ג׳ במערך 99...,https://www.youtube.com/watch?v=xzm_IlrpK_Y,أبو عبيدة الرايق باقي ماشفنا منك حتى شيكل باش ...,0,25.38
3,R1DaTvQXwpY,impeached dhs sec alejandro mayorkas the south...,,https://www.youtube.com/watch?v=R1DaTvQXwpY,more hot air genital head is lying sure seems ...,0,44.14
5,_I9vGD4j1_U,breaking joe biden says one of the classified ...,,https://www.youtube.com/watch?v=_I9vGD4j1_U,why isn39t he arrested for treason put him in ...,0,26.45
6,s_zva6pf3oA,donald trump predicts his victory in 1988,welcome to wealthteachers – your hub for long ...,https://www.youtube.com/watch?v=s_zva6pf3oA,and he is one of those making fools fool trump...,0,24.09
8,YIWdxO2J-Tw,the top 5 moments of taylor swift’s life shorts,subscribe the place for laughs joy stars surpr...,https://www.youtube.com/watch?v=YIWdxO2J-Tw,i taylor swift they weren39t talking about fel...,0,59.16
10,527-1MrJYUE,laxmi raai spotted at kromakay salon in juhu l...,,https://www.youtube.com/watch?v=527-1MrJYUE,aunty i love you beautiful en logo ko ur koi k...,0,54.61
15,DKB4Mf5PVFY,trump being a sigma… alphamale masculinity fun...,,https://www.youtube.com/watch?v=DKB4Mf5PVFY,men had enough exit the and last… don’t be a l...,0,16.9
16,wswxrDiSiHI,obamas quick response to sarcastic republican ...,watch barack obamas response when republicans ...,https://www.youtube.com/watch?v=wswxrDiSiHI,votor fraudulent wins isis and 65k dollars of ...,0,24.96
17,whnY9-ugr2w,trump sells bibles after years of touting his ...,former president donald trump trumpeted his lo...,https://www.youtube.com/watch?v=whnY9-ugr2w,trump shampoo what a joke why didn’t he just t...,0,59.88
18,t7xDb82TqiI,nvidias ceo on greatness jensen huang,original video this thoughtprovoking video jen...,https://www.youtube.com/watch?v=t7xDb82TqiI,wow his analogy and wording comes off as hurtf...,0,33.95


Size of Cleaned negative Val Data: 26


In [33]:
cleaned_neg_test_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_neg_test_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])
cleaned_neg_test_df = pd.DataFrame(columns=['video_id', 'title', 'description', 'url', 'comments', 'label', 'duration'])

cleaned_neg_test_df['video_id'] = neg_test_df['video_id']
cleaned_neg_test_df['url'] = neg_test_df['url']
cleaned_neg_test_df['label'] = neg_test_df['label']
cleaned_neg_test_df['duration'] = neg_test_df['duration']
cleaned_neg_test_df['title'] = neg_test_df['title'].apply(clean_text)
cleaned_neg_test_df['description'] = neg_test_df['description'].apply(clean_text)
cleaned_neg_test_df['comments'] = neg_test_df['comments'].apply(clean_text)


In [34]:
display(cleaned_neg_test_df)
print(f'Size of Cleaned negative Test Data: {len(cleaned_neg_test_df)}')

Unnamed: 0,video_id,title,description,url,comments,label,duration
0,xAPIjhkXF-0,tom cruise gives the most epic answer ever,,https://www.youtube.com/watch?v=xAPIjhkXF-0,over 150000 views thank you watch the full int...,0,12.89
3,zOh1-WHonds,shehnaaz kaur gill at lakme fashion week shehn...,,https://www.youtube.com/watch?v=zOh1-WHonds,shehnaz real pure soul bachcha shehnaazgill th...,0,40.61
4,zotkJwZ9PVY,these are young adults,,https://www.youtube.com/watch?v=zotkJwZ9PVY,i use to umpire baseball and this one guy was ...,0,55.4
5,JZV3aZfextU,come with me,via youtube capture,https://www.youtube.com/watch?v=JZV3aZfextU,yeah i wanna lift hell yeah idolo even if i di...,0,4.25
7,xdbvjb9-fpg,i wish i was exaggerating,,https://www.youtube.com/watch?v=xdbvjb9-fpg,big fan even the roaches said u gotta check in...,0,50.02
9,bm_YV-WorZ8,anant ambani conversation with meta ceo mark z...,anant ambani with meta ceo zuckambani ambanis ...,https://www.youtube.com/watch?v=bm_YV-WorZ8,npc lag rahay hein 300 crore watch meri 250 ki...,0,43.68
10,myS3A6p5xuA,vladimir putin kim jong un toast to peace at f...,kim jong un sought vladimir putin’s help in re...,https://www.youtube.com/watch?v=myS3A6p5xuA,putin dan kim sbetulnya mereka tdk suka mafia ...,0,19.09
11,z05Bk3hNVWE,the trump cabinet members refusing to endorse him,over the past two years half a dozen former tr...,https://www.youtube.com/watch?v=z05Bk3hNVWE,they will still vote for him strong words but ...,0,56.22
13,pJpJLiMAV-8,who will keep you safer as president joe biden...,join our campaign joejoe’s twitter facebook in...,https://www.youtube.com/watch?v=pJpJLiMAV-8,,0,43.31
15,CovaEMP2pBY,hear what biden said about putin during surpri...,president joe biden made a surprise visit to u...,https://www.youtube.com/watch?v=CovaEMP2pBY,πιστευω οτι το πολιτικο επιπεδο του πουτιν ειν...,0,45.93


Size of Cleaned negative Test Data: 51


### Change 'nan' to NaN

In [35]:
cleaned_neg_train_df = cleaned_neg_train_df.replace('nan', np.nan)
cleaned_neg_val_df = cleaned_neg_val_df.replace('nan', np.nan)
cleaned_neg_test_df = cleaned_neg_test_df.replace('nan', np.nan)

cleaned_pos_train_df = cleaned_pos_train_df.replace('nan', np.nan)
cleaned_pos_val_df = cleaned_pos_val_df.replace('nan', np.nan)
cleaned_pos_test_df = cleaned_pos_test_df.replace('nan', np.nan)

In [36]:
cleaned_neg_train_df

Unnamed: 0,video_id,title,description,url,comments,label,duration
0,NIAnOmqK2k0,yikes joe biden on why he was so irresponsible...,,https://www.youtube.com/watch?v=NIAnOmqK2k0,idk should i feel sad or mad at him quottheyqu...,0,15.05
3,PNKWjqW8kvI,biden gets confused again middle class didnt b...,comments made on 41724,https://www.youtube.com/watch?v=PNKWjqW8kvI,what this is funny but dueable all these illeg...,0,8.71
4,iKKRU3ALjb0,republicans want a national ban on abortion,follow joejoe’s twitter facebook instagram bid...,https://www.youtube.com/watch?v=iKKRU3ALjb0,fjb he39s to old and his brain doesn39t work k...,0,15.23
5,N-fELwFvM64,haley ends republican presidential campaign,former un ambassador nikki haley ended her bid...,https://www.youtube.com/watch?v=N-fELwFvM64,your voice is horrible that congratulations to...,0,44.77
6,r94vuvwUSkY,what meme,sub and like,https://www.youtube.com/watch?v=r94vuvwUSkY,lol is that russel westbrook 6 sec ad to watch...,0,5.57
...,...,...,...,...,...,...,...
314,TLw64MCra3U,rishi sunak frees up millions of gp appointments,9 in 10 pharmacies across england are now offe...,https://www.youtube.com/watch?v=TLw64MCra3U,if i need to see the gp it39s because it39s so...,0,37.96
315,liII_ikKCIo,qui menace,,https://www.youtube.com/watch?v=liII_ikKCIo,p a f p a f monsieur macron pour être un bon p...,0,59.58
316,yyaEuBNhpqY,courteney cox wants zac efron as a love intere...,subscribe the place for laughs joy stars surpr...,https://www.youtube.com/watch?v=yyaEuBNhpqY,i saw a short video of ellen that was with nev...,0,34.85
318,rs3vQ49mfrY,im ign keanureeves johnwick redcarpet movies s...,,https://www.youtube.com/watch?v=rs3vQ49mfrY,tan bello y humilde they can shoot their way i...,0,16.90


In [37]:
cleaned_neg_val_df

Unnamed: 0,video_id,title,description,url,comments,label,duration
2,xzm_IlrpK_Y,ראש הממשלה נתניהו במערך 9900 אנחנו אומרים למחב...,ראש הממשלה בנימין נתניהו ביקר היום ג׳ במערך 99...,https://www.youtube.com/watch?v=xzm_IlrpK_Y,أبو عبيدة الرايق باقي ماشفنا منك حتى شيكل باش ...,0,25.38
3,R1DaTvQXwpY,impeached dhs sec alejandro mayorkas the south...,,https://www.youtube.com/watch?v=R1DaTvQXwpY,more hot air genital head is lying sure seems ...,0,44.14
5,_I9vGD4j1_U,breaking joe biden says one of the classified ...,,https://www.youtube.com/watch?v=_I9vGD4j1_U,why isn39t he arrested for treason put him in ...,0,26.45
6,s_zva6pf3oA,donald trump predicts his victory in 1988,welcome to wealthteachers – your hub for long ...,https://www.youtube.com/watch?v=s_zva6pf3oA,and he is one of those making fools fool trump...,0,24.09
8,YIWdxO2J-Tw,the top 5 moments of taylor swift’s life shorts,subscribe the place for laughs joy stars surpr...,https://www.youtube.com/watch?v=YIWdxO2J-Tw,i taylor swift they weren39t talking about fel...,0,59.16
10,527-1MrJYUE,laxmi raai spotted at kromakay salon in juhu l...,,https://www.youtube.com/watch?v=527-1MrJYUE,aunty i love you beautiful en logo ko ur koi k...,0,54.61
15,DKB4Mf5PVFY,trump being a sigma… alphamale masculinity fun...,,https://www.youtube.com/watch?v=DKB4Mf5PVFY,men had enough exit the and last… don’t be a l...,0,16.9
16,wswxrDiSiHI,obamas quick response to sarcastic republican ...,watch barack obamas response when republicans ...,https://www.youtube.com/watch?v=wswxrDiSiHI,votor fraudulent wins isis and 65k dollars of ...,0,24.96
17,whnY9-ugr2w,trump sells bibles after years of touting his ...,former president donald trump trumpeted his lo...,https://www.youtube.com/watch?v=whnY9-ugr2w,trump shampoo what a joke why didn’t he just t...,0,59.88
18,t7xDb82TqiI,nvidias ceo on greatness jensen huang,original video this thoughtprovoking video jen...,https://www.youtube.com/watch?v=t7xDb82TqiI,wow his analogy and wording comes off as hurtf...,0,33.95


In [38]:
cleaned_neg_test_df

Unnamed: 0,video_id,title,description,url,comments,label,duration
0,xAPIjhkXF-0,tom cruise gives the most epic answer ever,,https://www.youtube.com/watch?v=xAPIjhkXF-0,over 150000 views thank you watch the full int...,0,12.89
3,zOh1-WHonds,shehnaaz kaur gill at lakme fashion week shehn...,,https://www.youtube.com/watch?v=zOh1-WHonds,shehnaz real pure soul bachcha shehnaazgill th...,0,40.61
4,zotkJwZ9PVY,these are young adults,,https://www.youtube.com/watch?v=zotkJwZ9PVY,i use to umpire baseball and this one guy was ...,0,55.4
5,JZV3aZfextU,come with me,via youtube capture,https://www.youtube.com/watch?v=JZV3aZfextU,yeah i wanna lift hell yeah idolo even if i di...,0,4.25
7,xdbvjb9-fpg,i wish i was exaggerating,,https://www.youtube.com/watch?v=xdbvjb9-fpg,big fan even the roaches said u gotta check in...,0,50.02
9,bm_YV-WorZ8,anant ambani conversation with meta ceo mark z...,anant ambani with meta ceo zuckambani ambanis ...,https://www.youtube.com/watch?v=bm_YV-WorZ8,npc lag rahay hein 300 crore watch meri 250 ki...,0,43.68
10,myS3A6p5xuA,vladimir putin kim jong un toast to peace at f...,kim jong un sought vladimir putin’s help in re...,https://www.youtube.com/watch?v=myS3A6p5xuA,putin dan kim sbetulnya mereka tdk suka mafia ...,0,19.09
11,z05Bk3hNVWE,the trump cabinet members refusing to endorse him,over the past two years half a dozen former tr...,https://www.youtube.com/watch?v=z05Bk3hNVWE,they will still vote for him strong words but ...,0,56.22
13,pJpJLiMAV-8,who will keep you safer as president joe biden...,join our campaign joejoe’s twitter facebook in...,https://www.youtube.com/watch?v=pJpJLiMAV-8,,0,43.31
15,CovaEMP2pBY,hear what biden said about putin during surpri...,president joe biden made a surprise visit to u...,https://www.youtube.com/watch?v=CovaEMP2pBY,πιστευω οτι το πολιτικο επιπεδο του πουτιν ειν...,0,45.93


### Unsupervised Approaches to Generate Comments

#### Use Topic Modeling to Generate Comments for NaN comments

In [181]:
def topic_modeling_synthetic_comments(cleaned_df = None):
    # assuming 'comments' column is already preprocessed
    vectorizer = CountVectorizer(stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(cleaned_df['comments'].dropna())

    # Train the LDA topic model
    lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
    lda_model.fit(doc_term_matrix)

    # Generate synthetic comments for videos without comments
    for index, row in cleaned_df[cleaned_df['comments'].isna()].iterrows():
        video_label = row['label']
        video_title = row['title']
        video_description = row['description']  # Assuming 'description' column contains video descriptions
        
        # Combine video title and description
        video_metadata = f"{video_title} {video_description}"
        
        # Predict the topic distribution for the video
        topic_prob = lda_model.transform(vectorizer.transform([video_metadata]))[0]
        top_topics = topic_prob.argsort()[-3:][::-1]
        
        # Generate synthetic comments
        synthetic_comment = []
        for topic_id in top_topics:
            top_words = [vectorizer.get_feature_names_out()[i] for i in lda_model.components_[topic_id].argsort()[-5:][::-1]]
            synthetic_comment.extend(top_words)
        
        synthetic_comment = ' '.join(synthetic_comment)
        cleaned_df.at[index, 'comments'] = synthetic_comment

#### Use Sentiment Analysis

In [182]:
def sentiment_analysis_synthetic_comments(cleaned_df=None):

    # assuming 'comments' column is already preprocessed
    vectorizer = TfidfVectorizer()
    comment_matrix = vectorizer.fit_transform(cleaned_df['comments'].dropna())

    # Perform sentiment analysis on the available comments
    sentiment_scores = []
    for comment in cleaned_df['comments'].dropna():
        sentiment = TextBlob(comment).sentiment.polarity
        sentiment_scores.append(sentiment)

    cleaned_df['sentiment'] = pd.Series(sentiment_scores)

    # Generate synthetic comments for videos without comments
    for index, row in cleaned_df[cleaned_df['comments'].isna()].iterrows():
        video_label = row['label']
        video_title = row['title']
        video_description = row['description']
        
        # Combine video title and description
        video_metadata = f"{video_title} {video_description}"
        
        # Find similar videos based on video metadata
        metadata_vector = vectorizer.transform([video_metadata])
        similarity_scores = cosine_similarity(metadata_vector, comment_matrix)
        similar_indices = similarity_scores.argsort()[0][-5:][::-1]
        
        # Get the sentiment scores of similar videos
        similar_sentiments = cleaned_df.iloc[similar_indices]['sentiment']
        
        # Determine the sentiment for the video based on similar videos and its label
        if video_label == 1:  # Deepfake video
            sentiment = similar_sentiments.min()
        else:  # Non-deepfake video
            sentiment = similar_sentiments.max()
        
        # Generate synthetic comment based on sentiment and video characteristics
        if sentiment > 0:
            synthetic_comment = f"This video about {video_title} seems interesting. {video_description}"
        elif sentiment < 0:
            synthetic_comment = f"I'm not sure about the content of this video on {video_title}. {video_description}"
        else:
            synthetic_comment = f"The video '{video_title}' discusses {video_description}."
        
        cleaned_df.at[index, 'comments'] = synthetic_comment

#### Collaborative Filtering

In [39]:
def collab_filter_synth_comments(cleaned_df=None):

    # assuming 'comments' column is already preprocessed
    vectorizer = TfidfVectorizer()
    comment_matrix = vectorizer.fit_transform(cleaned_df['comments'].dropna())

    # Perform dimensionality reduction using Truncated SVD
    svd = TruncatedSVD(n_components=100)
    reduced_matrix = svd.fit_transform(comment_matrix)

    # Generate synthetic comments for videos without comments
    for index, row in cleaned_df[cleaned_df['comments'].isna()].iterrows():
        video_label = row['label']
        video_title = row['title']
        video_description = row['description']
        
        # Combine video title and description
        video_metadata = f"{video_title} {video_description}"
        
        # Find similar videos based on video metadata
        metadata_vector = vectorizer.transform([video_metadata])
        reduced_metadata_vector = svd.transform(metadata_vector)
        similarity_scores = cosine_similarity(reduced_metadata_vector, reduced_matrix)
        similar_indices = similarity_scores.argsort()[0][-5:][::-1]
        
        # Get the comments from similar videos
        similar_comments = cleaned_df.iloc[similar_indices]['comments']
        
        # Filter out missing comments
        valid_comments = similar_comments.dropna()
        
        # Generate synthetic comment by combining valid comments from similar videos
        if not valid_comments.empty:
            synthetic_comment = ' '.join(valid_comments)
        else:
            synthetic_comment = "No relevant comments found."
        
        cleaned_df.at[index, 'comments'] = synthetic_comment

In [40]:
collab_filter_synth_comments(cleaned_neg_train_df)
collab_filter_synth_comments(cleaned_neg_val_df)
collab_filter_synth_comments(cleaned_neg_test_df)

collab_filter_synth_comments(cleaned_pos_train_df)
collab_filter_synth_comments(cleaned_pos_val_df)
collab_filter_synth_comments(cleaned_pos_test_df)

In [41]:
print('Negatives:')
nan_count = cleaned_neg_train_df['comments'].isna().sum()
print(f'Number of NaN in cleaned_neg_train_df: {nan_count}')

nan_count = cleaned_neg_val_df['comments'].isna().sum()
print(f'Number of NaN in cleaned_neg_val_df: {nan_count}')

nan_count = cleaned_neg_test_df['comments'].isna().sum()
print(f'Number of NaN in cleaned_neg_test_df: {nan_count}')

print('\nPositives:')
nan_count = cleaned_pos_train_df['comments'].isna().sum()
print(f'Number of NaN in cleaned_pos_train_df: {nan_count}')

nan_count = cleaned_pos_val_df['comments'].isna().sum()
print(f'Number of NaN in cleaned_pos_val_df: {nan_count}')

nan_count = cleaned_pos_test_df['comments'].isna().sum()
print(f'Number of NaN in cleaned_pos_test_df: {nan_count}')

Negatives:
Number of NaN in cleaned_neg_train_df: 0
Number of NaN in cleaned_neg_val_df: 0
Number of NaN in cleaned_neg_test_df: 0

Positives:
Number of NaN in cleaned_pos_train_df: 0
Number of NaN in cleaned_pos_val_df: 0
Number of NaN in cleaned_pos_test_df: 0


### Consolidate the Data

In [42]:
# create training set
train_df = pd.concat([cleaned_pos_train_df, cleaned_neg_train_df])
# create validation set
val_df = pd.concat([cleaned_pos_val_df, cleaned_neg_val_df])
# create test set
test_df = pd.concat([cleaned_pos_test_df, cleaned_neg_test_df])

In [43]:
# training set
num_pos_train = len(cleaned_pos_train_df)
num_neg_train = len(cleaned_neg_train_df)
num_train = len(train_df)
print(f'Expected Size of Training set: {num_neg_train + num_pos_train}')
print(f'Actual Size of Training Set: {num_train}')

Expected Size of Training set: 371
Actual Size of Training Set: 371


In [44]:
# validation set
num_pos_val = len(cleaned_pos_val_df)
num_neg_val = len(cleaned_neg_val_df)
num_val = len(val_df)
print(f'Expected Size of Validation set: {num_pos_val + num_neg_val}')
print(f'Actual Size of Validation Set: {num_val}')

Expected Size of Validation set: 52
Actual Size of Validation Set: 52


In [45]:
# test set
num_pos_test = len(cleaned_pos_test_df)
num_neg_test = len(cleaned_neg_test_df)
num_test = len(test_df)
print(f'Expected Size of Test set: {num_pos_test + num_neg_test}')
print(f'Actual Size of Test Set: {num_test}')

Expected Size of Test set: 97
Actual Size of Test Set: 97


### Move Training data to file

In [46]:
def delete_folder_contents(path):
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)  # Remove file or link
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)  # Remove directory and all its contents
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')

In [47]:
delete_folder_contents(train_dir)
delete_folder_contents(val_dir)
delete_folder_contents(test_dir)

In [None]:
def move_videos(src_dir, dest_dir, df):

    failed = list()

    # Create the "train" directory if it doesn't exist
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        video_id = row['video_id']
        
        # Construct the source and destination file paths
        source_path = os.path.join(src_dir, f"{video_id}.mp4")
        destination_path = os.path.join(dest_dir, f"{video_id}.mp4")
        
        # Check if the source video file exists
        if os.path.exists(source_path):
            # Copy the video file from the source to the destination
            shutil.copy2(source_path, destination_path)
            # print(f"Copied video {video_id}.mp4 to train directory.")
        else:
            print(f"Video {video_id}.mp4 not found in {src_dir} directory.")
            failed.append(video_id)

    if(len(failed) > 0):
        print(f'The following videos were not copied from {src_dir} to {dest_dir}')
        for vid in failed:
            print(vid)

In [None]:
# move pos training videos in train
# move_videos(src_dir=pos_dir, dest_dir=train_dir, df=cleaned_pos_df)

In [None]:
# move neg training videos into train
# move_videos(src_dir=neg_dir, dest_dir=train_dir, df=cleaned_neg_df)

In [48]:
def copy_videos(df, dest_dir):

    failed = list()

    # Create the "train" directory if it doesn't exist
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        video_id = row['video_id']
        label = row['label']

        src_dir = None

        if label == 1:
            src_dir = pos_vids_dir
        else:
            src_dir = neg_vids_dir
        
        # Construct the source and destination file paths
        source_path = os.path.join(src_dir, f"{video_id}.mp4")
        destination_path = os.path.join(dest_dir, f"{video_id}.mp4")

        # Check if the source video file exists
        if os.path.exists(source_path):
            # Copy the video file from the source to the destination
            shutil.copy2(source_path, destination_path)
            # print(f"Copied video {video_id}.mp4 to train directory.")
        else:
            print(f"Video {video_id}.mp4 not found in {src_dir} directory.")
            failed.append(video_id)

    if(len(failed) > 0):
        print(f'The following videos were not copied from {src_dir} to {dest_dir}')
        for vid in failed:
            print(vid)

In [51]:
copy_videos(df=train_df, dest_dir=train_dir)
print(f'Number of files in train_df: {num_train}')
print(f'NUmber of files copies to Train Dir: {len(os.listdir(train_dir))}')

Number of files in train_df: 371
NUmber of files copies to Train Dir: 371


In [None]:
# Finding duplicates in column 'A'
# duplicates = train_df.duplicated('video_id', keep=False)  # 'keep=False' marks all duplicates as True
# duplicated_train = train_df[duplicates]
# duplicated_train

In [None]:
# # Identify duplicated video_id entries
# duplicates = train_df.duplicated('video_id', keep=False)

# # Create a DataFrame with only the duplicated entries
# duplicated_train = train_df[duplicates]

# # Count the unique video_ids that have duplicates
# unique_duplicates_count = duplicated_train['video_id'].nunique()

# # Output the count of unique duplicate video_ids
# print(f'Number of unique video_ids that are duplicated: {unique_duplicates_count}')

# # Now print the number of files in train_df and the number of files copied to Train Dir
# print(f'Number of files in train_df: {len(train_df)}')
# print(f'Number of files copied to Train Dir: {len(os.listdir(train_dir))}')

# # Calculate the difference
# num_files_discrepancy = len(train_df) - len(os.listdir(train_dir))

# # Check if the number of unique duplicates matches the discrepancy
# if num_files_discrepancy == unique_duplicates_count:
#     print("The discrepancy matches the number of unique duplicate video_ids.")
# else:
#     print("There is an inconsistency in the number of duplicates.")

In [52]:
copy_videos(df=val_df, dest_dir=val_dir)
print(f'Number of files in val_df: {num_val}')
print(f'NUmber of files copies to Val Dir: {len(os.listdir(val_dir))}')

Number of files in val_df: 52
NUmber of files copies to Val Dir: 52


In [53]:
copy_videos(df=test_df, dest_dir=test_dir)
print(f'Number of files in test_df: {num_test}')
print(f'NUmber of files copies to Test Dir: {len(os.listdir(test_dir))}')

Number of files in test_df: 97
NUmber of files copies to Test Dir: 97


In [54]:
# Get the list of video files in the "train/val/test" directory
def validateCopy(df=None, dir_type=None, copyDir=None):

    videos = [f for f in os.listdir(copyDir) if f.endswith(".mp4")]

    # Extract the video IDs from the filenames in the "train" directory
    video_ids = [os.path.splitext(f)[0] for f in videos]

    # Get the video IDs from the DataFrame
    all_video_ids = df['video_id']

    # Find the missing video IDs
    missing_video_ids = all_video_ids.loc[~all_video_ids.isin(video_ids)]

    # Print the missing video IDs
    if len(missing_video_ids) > 0:
        print("Missing videos:")
        for video_id in missing_video_ids:
            print(video_id)
    else:
        print(f'All {dir_type} videos were successfully copied.')

In [55]:
# check train videos
validateCopy(df=train_df, dir_type='training', copyDir=train_dir)
# check val videos
validateCopy(df=val_df, dir_type='validation', copyDir=val_dir)
# check test videos
validateCopy(df=test_df, dir_type='test', copyDir=test_dir)

All training videos were successfully copied.
All validation videos were successfully copied.
All test videos were successfully copied.


In [None]:
# # Find common video IDs using intersection()
# common_video_ids = set(cleaned_pos_df['video_id']).intersection(set(cleaned_neg_df['video_id']))

# # Print the common video IDs
# if len(common_video_ids) > 0:
#     print("Common video IDs:")
#     for video_id in common_video_ids:
#         print(video_id)
# else:
#     print("No common video IDs found.")

### Create Annotation Files

In [70]:
annotations_dir = '../annotations'

video_train = 'video_train.csv'
video_val = 'video_val.csv'
video_test = 'video_test.csv'

text_train = 'text_train.csv'
text_val = 'text_val.csv'
text_test = 'text_test.csv'


#### Remove existing annotation files

In [71]:
delete_folder_contents(annotations_dir)

#### Video files path annotations

In [72]:
def create_video_path_df(prefix, suffix, df, root_dir, output_file):

    # concatenating prefix, video_id from train_df, and suffix
    new_df = pd.DataFrame()
    new_df['video_path'] = prefix + df['video_id'].astype(str) + suffix
    new_df['label'] = df['label']
    
    # Write the DataFrame to a CSV file
    output_file_path = os.path.join(root_dir, output_file)
    new_df.to_csv(output_file_path, index=False)
    print(f"CSV file has been created at: {output_file_path}")

In [73]:
train_prefix = './train/'
val_prefix = './val/'
test_prefix = './test/'
suffix = ".mp4"

In [74]:
create_video_path_df(prefix=train_prefix, suffix=suffix, df=train_df, root_dir=annotations_dir, output_file='video_train_path.csv')
create_video_path_df(prefix=val_prefix, suffix=suffix, df=val_df, root_dir=annotations_dir, output_file='video_val_path.csv')
create_video_path_df(prefix=test_prefix, suffix=suffix, df=test_df, root_dir=annotations_dir, output_file='video_test_path.csv')


CSV file has been created at: ../annotations/video_train_path.csv
CSV file has been created at: ../annotations/video_val_path.csv
CSV file has been created at: ../annotations/video_test_path.csv


In [75]:
train_annotation = pd.read_csv('../annotations/video_train_path.csv')
type(train_annotation.iloc[0]['video_path'])

str

#### Video and Text csv file with labels

In [76]:
def create_video_csv(df, root_dir, output_file):

    new_df = pd.DataFrame()
    new_df['video_id'] = df['video_id']
    new_df['label'] = df['label']

    # Write the DataFrame to a CSV file
    output_file_path = os.path.join(root_dir, output_file)
    new_df.to_csv(output_file_path, index=False)
    print(f"CSV file has been created at: {output_file_path}")


In [77]:
def create_text_csv(df, root_dir, output_file):

    new_df = pd.DataFrame()
    new_df['text'] = df['comments']
    new_df['label'] = df['label']

    # Write the DataFrame to a CSV file
    output_file_path = os.path.join(root_dir, output_file)
    new_df.to_csv(output_file_path, index=False)
    print(f"CSV file has been created at: {output_file_path}")

In [78]:
create_video_csv(train_df, annotations_dir, video_train)
create_video_csv(val_df, annotations_dir, video_val)
create_video_csv(test_df, annotations_dir, video_test)

create_text_csv(train_df, annotations_dir, text_train)
create_text_csv(val_df, annotations_dir, text_val)
create_text_csv(test_df, annotations_dir, text_test)

CSV file has been created at: ../annotations/video_train.csv
CSV file has been created at: ../annotations/video_val.csv
CSV file has been created at: ../annotations/video_test.csv
CSV file has been created at: ../annotations/text_train.csv
CSV file has been created at: ../annotations/text_val.csv
CSV file has been created at: ../annotations/text_test.csv


In [79]:
video_train_annotation = pd.read_csv(os.path.join(annotations_dir, video_train))
text_train_annotation = pd.read_csv(os.path.join(annotations_dir, text_train))

In [80]:
video_train_annotation

Unnamed: 0,video_id,label
0,KabnUV5luJ8,1
1,6aSoP0mDM0g,1
2,H_RFyF-tgZ4,1
3,E_hjqpXgUO0,1
4,pQuS5Pmrtnk,1
...,...,...
366,TLw64MCra3U,0
367,liII_ikKCIo,0
368,yyaEuBNhpqY,0
369,rs3vQ49mfrY,0


In [81]:

text_train_annotation

Unnamed: 0,text,label
0,please name of the software deep fake try putt...,1
1,now we need trumps card to block the attacks a...,1
2,how do you guys not have millions of subscribe...,1
3,same lol the way the head moves,1
4,unlock the power of stock footage today nathan...,1
...,...,...
366,if i need to see the gp it39s because it39s so...,0
367,p a f p a f monsieur macron pour être un bon p...,0
368,i saw a short video of ellen that was with nev...,0
369,tan bello y humilde they can shoot their way i...,0
