#STEP 1 : LOAD THE DATA

In [1]:
!pip install --upgrade gspread oauth2client


Collecting gspread
  Downloading gspread-6.1.4-py3-none-any.whl.metadata (11 kB)
Downloading gspread-6.1.4-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gspread
  Attempting uninstall: gspread
    Found existing installation: gspread 6.0.2
    Uninstalling gspread-6.0.2:
      Successfully uninstalled gspread-6.0.2
Successfully installed gspread-6.1.4


In [2]:
from google.colab import auth
from google.auth import default
import gspread
import pandas as pd

# Authenticate and create the gspread client
auth.authenticate_user()
# Get credentials and project ID from default()
creds, _ = default()
# Use credentials to authorize gspread
gc = gspread.authorize(creds)

# Open the spreadsheet by URL
spreadsheet = gc.open_by_url('https://docs.google.com/spreadsheets/d/1Bj4-sd6362GWrFZOPcND3fFo0oroO1pfhkpMJh8iIE4/edit?gid=0#gid=0')

# Access the first sheet (if you have multiple sheets, you can select them by index or name)
worksheet = spreadsheet.sheet1

# Get all records from the sheet
data = pd.DataFrame(worksheet.get_all_records())

# Show the first few rows to verify data
data.head()

Unnamed: 0,Performance,Video URL
0,1.106,https://fgimagestorage.blob.core.windows.net/f...
1,2.2447,https://fgimagestorage.blob.core.windows.net/f...
2,2.0126,https://fgimagestorage.blob.core.windows.net/f...
3,1.7708,https://fgimagestorage.blob.core.windows.net/f...
4,0.6293,https://fgimagestorage.blob.core.windows.net/f...


In [3]:
# Check the column names and types
data.columns


Index(['Performance', 'Video URL'], dtype='object')

In [4]:
import requests
import os

# Create a directory to save the videos
if not os.path.exists('videos'):
    os.makedirs('videos')

# Function to download a video
def download_video(url, filename):
    response = requests.get(url)
    with open(filename, "wb") as file:
        file.write(response.content)

# Loop through all video URLs and download
for i, url in enumerate(data['Video URL']):
    filename = f"videos/video_{i+1}.mp4"  # Naming files as video_1, video_2, etc.
    try:
        print(f"Downloading video {i+1} from {url}...")
        download_video(url, filename)
        print(f"Video {i+1} downloaded successfully!")
    except Exception as e:
        print(f"Failed to download video {i+1} from {url}. Error: {e}")


Downloading video 1 from https://fgimagestorage.blob.core.windows.net/facebook-assets/hd-999607261342550...
Video 1 downloaded successfully!
Downloading video 2 from https://fgimagestorage.blob.core.windows.net/facebook-assets/hd-997580728807604...
Video 2 downloaded successfully!
Downloading video 3 from https://fgimagestorage.blob.core.windows.net/facebook-assets/hd-992418235673669...
Video 3 downloaded successfully!
Downloading video 4 from https://fgimagestorage.blob.core.windows.net/facebook-assets/hd-992064161877405...
Video 4 downloaded successfully!
Downloading video 5 from https://fgimagestorage.blob.core.windows.net/facebook-assets/hd-991636695150147...
Video 5 downloaded successfully!
Downloading video 6 from https://fgimagestorage.blob.core.windows.net/facebook-assets/hd-989969399547901...
Video 6 downloaded successfully!
Downloading video 7 from https://fgimagestorage.blob.core.windows.net/facebook-assets/hd-989930303148492...
Video 7 downloaded successfully!
Downloading v

In [5]:
import os

# List all files in the 'videos' folder
video_folder = '/content/videos'
video_files = os.listdir(video_folder)

# Display the first few files to check
print(video_files[:10])

# Example: Assuming video names are in the format 'video_1.mp4', 'video_2.mp4', etc.
data['video_file'] = data['Video URL'].apply(lambda x: x.split('/')[-1] + '.mp4')  # Adjust based on your naming convention

# Display the first few rows of the DataFrame to confirm the match
print(data[['Video URL', 'video_file']].head())


['video_179.mp4', 'video_67.mp4', 'video_147.mp4', 'video_264.mp4', 'video_220.mp4', 'video_91.mp4', 'video_72.mp4', 'video_15.mp4', 'video_148.mp4', 'video_57.mp4']
                                           Video URL              video_file
0  https://fgimagestorage.blob.core.windows.net/f...  hd-999607261342550.mp4
1  https://fgimagestorage.blob.core.windows.net/f...  hd-997580728807604.mp4
2  https://fgimagestorage.blob.core.windows.net/f...  hd-992418235673669.mp4
3  https://fgimagestorage.blob.core.windows.net/f...  hd-992064161877405.mp4
4  https://fgimagestorage.blob.core.windows.net/f...  hd-991636695150147.mp4


In [6]:
import os

# List all video files in the 'videos' folder
video_folder = '/content/videos'
video_files = os.listdir(video_folder)

# Check if the files in the DataFrame exist in the folder
data['video_exists'] = data['video_file'].apply(lambda x: x in video_files)

# Display rows where the video file doesn't exist
missing_videos = data[data['video_exists'] == False]
print(missing_videos[['Video URL', 'video_file']])



                                             Video URL  \
0    https://fgimagestorage.blob.core.windows.net/f...   
1    https://fgimagestorage.blob.core.windows.net/f...   
2    https://fgimagestorage.blob.core.windows.net/f...   
3    https://fgimagestorage.blob.core.windows.net/f...   
4    https://fgimagestorage.blob.core.windows.net/f...   
..                                                 ...   
263  https://fgimagestorage.blob.core.windows.net/f...   
264  https://fgimagestorage.blob.core.windows.net/f...   
265  https://fgimagestorage.blob.core.windows.net/f...   
266  https://fgimagestorage.blob.core.windows.net/f...   
267  https://fgimagestorage.blob.core.windows.net/f...   

                  video_file  
0     hd-999607261342550.mp4  
1     hd-997580728807604.mp4  
2     hd-992418235673669.mp4  
3     hd-992064161877405.mp4  
4     hd-991636695150147.mp4  
..                       ...  
263  hd-1689212771862832.mp4  
264  hd-1685108828975354.mp4  
265  hd-1119706949586170

# STEP 2 : CLEAN THE DATA

In [7]:
# Remove duplicates based on the 'video_file' column
data_unique_videos = data.drop_duplicates(subset='video_file', keep='first')

# Display the DataFrame to check the results
print(data_unique_videos[['Video URL', 'video_file', 'Performance']].head())


                                           Video URL              video_file  \
0  https://fgimagestorage.blob.core.windows.net/f...  hd-999607261342550.mp4   
1  https://fgimagestorage.blob.core.windows.net/f...  hd-997580728807604.mp4   
2  https://fgimagestorage.blob.core.windows.net/f...  hd-992418235673669.mp4   
3  https://fgimagestorage.blob.core.windows.net/f...  hd-992064161877405.mp4   
4  https://fgimagestorage.blob.core.windows.net/f...  hd-991636695150147.mp4   

   Performance  
0       1.1060  
1       2.2447  
2       2.0126  
3       1.7708  
4       0.6293  


In [8]:
import os
import hashlib

# Path to the videos folder
video_folder = '/content/videos'

# Function to calculate the MD5 hash of a file
def get_file_hash(file_path):
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # Read the file in chunks to avoid memory issues with large files
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

# List all video files in the folder
video_files = os.listdir(video_folder)

# Track file hashes and duplicates
file_hashes = {}
duplicates = []

# Identify duplicates based on file content hash
for video in video_files:
    video_path = os.path.join(video_folder, video)
    video_hash = get_file_hash(video_path)

    # If the hash is already in the dictionary, it's a duplicate
    if video_hash in file_hashes:
        # Only add the video to duplicates if it is not already marked as a duplicate
        if video not in duplicates:
            duplicates.append(video)
    else:
        file_hashes[video_hash] = video

# Count the number of duplicate videos
duplicate_count = len(duplicates)

# Display the number of duplicate videos
print(f"Number of duplicate videos based on content: {duplicate_count}")


Number of duplicate videos based on content: 193


In [9]:
import os
import hashlib

# Path to the videos folder
video_folder = '/content/videos'

# Function to calculate the MD5 hash of a file
def get_file_hash(file_path):
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # Read the file in chunks to avoid memory issues with large files
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

# List all video files in the folder
video_files = os.listdir(video_folder)

# Track file hashes and duplicates
file_hashes = {}
duplicates = []

# Identify duplicates based on file content hash
for video in video_files:
    video_path = os.path.join(video_folder, video)
    video_hash = get_file_hash(video_path)

    # If the hash is already in the dictionary, it's a duplicate
    if video_hash in file_hashes:
        # Only add the video to duplicates if it is not already marked as a duplicate
        if video not in duplicates:
            duplicates.append(video)
    else:
        file_hashes[video_hash] = video

# Display the names of duplicate videos
if duplicates:
    print(f"Duplicate video files: {duplicates}")
else:
    print("No duplicate videos found.")


Duplicate video files: ['video_91.mp4', 'video_223.mp4', 'video_21.mp4', 'video_253.mp4', 'video_233.mp4', 'video_132.mp4', 'video_86.mp4', 'video_227.mp4', 'video_62.mp4', 'video_175.mp4', 'video_231.mp4', 'video_159.mp4', 'video_55.mp4', 'video_64.mp4', 'video_19.mp4', 'video_68.mp4', 'video_243.mp4', 'video_77.mp4', 'video_230.mp4', 'video_144.mp4', 'video_238.mp4', 'video_164.mp4', 'video_151.mp4', 'video_185.mp4', 'video_50.mp4', 'video_142.mp4', 'video_187.mp4', 'video_180.mp4', 'video_206.mp4', 'video_1.mp4', 'video_217.mp4', 'video_60.mp4', 'video_78.mp4', 'video_120.mp4', 'video_150.mp4', 'video_97.mp4', 'video_81.mp4', 'video_79.mp4', 'video_224.mp4', 'video_201.mp4', 'video_106.mp4', 'video_239.mp4', 'video_59.mp4', 'video_37.mp4', 'video_31.mp4', 'video_35.mp4', 'video_153.mp4', 'video_226.mp4', 'video_170.mp4', 'video_141.mp4', 'video_111.mp4', 'video_163.mp4', 'video_103.mp4', 'video_135.mp4', 'video_128.mp4', 'video_265.mp4', 'video_184.mp4', 'video_197.mp4', 'video_4.mp

In [10]:
import hashlib
import os

# Path to the videos folder
video_folder = '/content/videos'

# Function to calculate the MD5 hash of a file
def get_file_hash(file_path):
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # Read the file in chunks to avoid memory issues with large files
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

# List all video files in the folder
video_files = os.listdir(video_folder)

# Track file hashes and duplicates
file_hashes = {}
duplicates = []

# Identify duplicates based on file content hash
for video in video_files:
    video_path = os.path.join(video_folder, video)
    video_hash = get_file_hash(video_path)

    # If the hash is already in the dictionary, it's a duplicate
    if video_hash in file_hashes:
        # Only add the video to duplicates if it is not already marked as a duplicate
        if video not in duplicates:
            duplicates.append(video)
    else:
        file_hashes[video_hash] = video

# Now, let's check how many duplicates were found
print(f"Total number of duplicates found: {len(duplicates)}")
print("List of duplicates:", duplicates[:20])  # Print the first 20 duplicates to check


Total number of duplicates found: 193
List of duplicates: ['video_91.mp4', 'video_223.mp4', 'video_21.mp4', 'video_253.mp4', 'video_233.mp4', 'video_132.mp4', 'video_86.mp4', 'video_227.mp4', 'video_62.mp4', 'video_175.mp4', 'video_231.mp4', 'video_159.mp4', 'video_55.mp4', 'video_64.mp4', 'video_19.mp4', 'video_68.mp4', 'video_243.mp4', 'video_77.mp4', 'video_230.mp4', 'video_144.mp4']


In [11]:
import random
import shutil
from google.colab import files
import os

# Ensure we have 193 duplicates identified
print(f"Total duplicates identified: {len(duplicates)}")

# If there are fewer than 30 duplicates, print a warning
if len(duplicates) < 30:
    print(f"Warning: Only {len(duplicates)} duplicates found. Downloading all.")
    videos_to_download = duplicates  # All duplicates will be downloaded if fewer than 30
else:
    # Randomly select 30 duplicates
    videos_to_download = random.sample(duplicates, 30)

# Check the number of videos to be downloaded
print(f"Downloading {len(videos_to_download)} duplicate videos.")

# Create a temporary folder to store the selected videos for download
temp_download_folder = '/content/temp_download_folder'
os.makedirs(temp_download_folder, exist_ok=True)

# Copy the randomly selected videos to the temporary folder
for video in videos_to_download:
    video_path = os.path.join(video_folder, video)
    shutil.copy(video_path, temp_download_folder)

# Compress the videos folder into a zip file for easier download
shutil.make_archive('/content/random_duplicates_videos', 'zip', temp_download_folder)

# Now, download the zip file containing the selected videos
files.download('/content/random_duplicates_videos.zip')

# Clean up the temporary folder
shutil.rmtree(temp_download_folder)


Total duplicates identified: 193
Downloading 30 duplicate videos.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
import os

# List of duplicate video files (from your output)
duplicate_video_files = ['video_91.mp4', 'video_223.mp4', 'video_21.mp4', 'video_253.mp4', 'video_233.mp4', 'video_132.mp4', 'video_86.mp4', 'video_227.mp4', 'video_62.mp4', 'video_175.mp4', 'video_231.mp4', 'video_159.mp4', 'video_55.mp4', 'video_64.mp4', 'video_19.mp4', 'video_68.mp4', 'video_243.mp4', 'video_77.mp4', 'video_230.mp4', 'video_144.mp4', 'video_238.mp4', 'video_164.mp4', 'video_151.mp4', 'video_185.mp4', 'video_50.mp4', 'video_142.mp4', 'video_187.mp4', 'video_180.mp4', 'video_206.mp4', 'video_1.mp4', 'video_217.mp4', 'video_60.mp4', 'video_78.mp4', 'video_120.mp4', 'video_150.mp4', 'video_97.mp4', 'video_81.mp4', 'video_79.mp4', 'video_224.mp4', 'video_201.mp4', 'video_106.mp4', 'video_239.mp4', 'video_59.mp4', 'video_37.mp4', 'video_31.mp4', 'video_35.mp4', 'video_153.mp4', 'video_226.mp4', 'video_170.mp4', 'video_141.mp4', 'video_111.mp4', 'video_163.mp4', 'video_103.mp4', 'video_135.mp4', 'video_128.mp4', 'video_265.mp4', 'video_184.mp4', 'video_197.mp4', 'video_4.mp4', 'video_121.mp4', 'video_241.mp4', 'video_178.mp4', 'video_257.mp4', 'video_157.mp4', 'video_10.mp4', 'video_192.mp4', 'video_66.mp4', 'video_205.mp4', 'video_229.mp4', 'video_160.mp4', 'video_9.mp4', 'video_149.mp4', 'video_23.mp4', 'video_247.mp4', 'video_251.mp4', 'video_46.mp4', 'video_45.mp4', 'video_22.mp4', 'video_228.mp4', 'video_28.mp4', 'video_143.mp4', 'video_186.mp4', 'video_162.mp4', 'video_232.mp4', 'video_14.mp4', 'video_190.mp4', 'video_116.mp4', 'video_110.mp4', 'video_130.mp4', 'video_234.mp4', 'video_63.mp4', 'video_209.mp4', 'video_215.mp4', 'video_51.mp4', 'video_2.mp4', 'video_114.mp4', 'video_18.mp4', 'video_181.mp4', 'video_70.mp4', 'video_258.mp4', 'video_53.mp4', 'video_74.mp4', 'video_69.mp4', 'video_96.mp4', 'video_207.mp4', 'video_11.mp4', 'video_20.mp4', 'video_133.mp4', 'video_129.mp4', 'video_30.mp4', 'video_80.mp4', 'video_88.mp4', 'video_75.mp4', 'video_5.mp4', 'video_219.mp4', 'video_244.mp4', 'video_89.mp4', 'video_76.mp4', 'video_213.mp4', 'video_38.mp4', 'video_29.mp4', 'video_99.mp4', 'video_118.mp4', 'video_248.mp4', 'video_112.mp4', 'video_236.mp4', 'video_101.mp4', 'video_73.mp4', 'video_212.mp4', 'video_7.mp4', 'video_146.mp4', 'video_124.mp4', 'video_119.mp4', 'video_48.mp4', 'video_188.mp4', 'video_34.mp4', 'video_134.mp4', 'video_174.mp4', 'video_210.mp4', 'video_195.mp4', 'video_138.mp4', 'video_237.mp4', 'video_26.mp4', 'video_250.mp4', 'video_182.mp4', 'video_104.mp4', 'video_177.mp4', 'video_235.mp4', 'video_92.mp4', 'video_87.mp4', 'video_136.mp4', 'video_167.mp4', 'video_155.mp4', 'video_85.mp4', 'video_40.mp4', 'video_165.mp4', 'video_115.mp4', 'video_17.mp4', 'video_200.mp4', 'video_242.mp4', 'video_41.mp4', 'video_176.mp4', 'video_246.mp4', 'video_113.mp4', 'video_245.mp4', 'video_107.mp4', 'video_13.mp4', 'video_225.mp4', 'video_125.mp4', 'video_117.mp4', 'video_268.mp4', 'video_24.mp4', 'video_56.mp4', 'video_52.mp4', 'video_54.mp4', 'video_127.mp4', 'video_240.mp4', 'video_49.mp4', 'video_222.mp4', 'video_262.mp4', 'video_198.mp4', 'video_71.mp4', 'video_140.mp4', 'video_189.mp4', 'video_47.mp4', 'video_267.mp4', 'video_172.mp4', 'video_44.mp4', 'video_158.mp4', 'video_95.mp4', 'video_249.mp4', 'video_154.mp4', 'video_93.mp4']

# Path to the videos folder
video_folder = '/content/videos'

# Deleting the duplicate video files
deleted_files = []

for video in duplicate_video_files:
    video_path = os.path.join(video_folder, video)
    if os.path.exists(video_path):
        os.remove(video_path)
        deleted_files.append(video)

# Output the number of deleted files
print(f"Total number of videos deleted: {len(deleted_files)}")
print("List of deleted videos:", deleted_files)


Total number of videos deleted: 193
List of deleted videos: ['video_91.mp4', 'video_223.mp4', 'video_21.mp4', 'video_253.mp4', 'video_233.mp4', 'video_132.mp4', 'video_86.mp4', 'video_227.mp4', 'video_62.mp4', 'video_175.mp4', 'video_231.mp4', 'video_159.mp4', 'video_55.mp4', 'video_64.mp4', 'video_19.mp4', 'video_68.mp4', 'video_243.mp4', 'video_77.mp4', 'video_230.mp4', 'video_144.mp4', 'video_238.mp4', 'video_164.mp4', 'video_151.mp4', 'video_185.mp4', 'video_50.mp4', 'video_142.mp4', 'video_187.mp4', 'video_180.mp4', 'video_206.mp4', 'video_1.mp4', 'video_217.mp4', 'video_60.mp4', 'video_78.mp4', 'video_120.mp4', 'video_150.mp4', 'video_97.mp4', 'video_81.mp4', 'video_79.mp4', 'video_224.mp4', 'video_201.mp4', 'video_106.mp4', 'video_239.mp4', 'video_59.mp4', 'video_37.mp4', 'video_31.mp4', 'video_35.mp4', 'video_153.mp4', 'video_226.mp4', 'video_170.mp4', 'video_141.mp4', 'video_111.mp4', 'video_163.mp4', 'video_103.mp4', 'video_135.mp4', 'video_128.mp4', 'video_265.mp4', 'video_1

In [13]:
import os

# Path to the videos folder
video_folder = '/content/videos'

# List all video files in the folder
video_files = os.listdir(video_folder)

# The total number of videos remaining after duplicates have been removed
remaining_videos = len(video_files)

print(f"Total number of videos remaining after duplicates are removed: {remaining_videos}")


Total number of videos remaining after duplicates are removed: 75


In [14]:
import cv2
import os

# Load pre-trained face detector from OpenCV
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def detect_face_in_frame(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # Convert frame to grayscale
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
    return len(faces) > 0  # Return True if a face is detected

def process_video_for_faces(video_path):
    video = cv2.VideoCapture(video_path)
    frame_rate = video.get(cv2.CAP_PROP_FPS)  # Get frames per second
    success, frame = video.read()
    count = 0

    while success:
        if count % int(frame_rate) == 0:  # Extract frame every second
            if detect_face_in_frame(frame):  # If a face is detected, stop processing this video
                video.release()  # Release the video object and skip to next video
                return True
        success, frame = video.read()
        count += 1

    video.release()
    return False  # No face detected in this video

# Process all videos
video_folder = '/content/videos'  # Folder where videos are stored
videos_with_faces = []  # Store videos with human faces

for video_file in os.listdir(video_folder):
    video_path = os.path.join(video_folder, video_file)
    if process_video_for_faces(video_path):
        videos_with_faces.append(video_file)  # If faces are detected, add the video to the list

# Output the videos with human faces detected
print("Videos with human faces detected:", videos_with_faces)

# Output the total number of videos with human faces detected
print("Total number of videos with human faces detected:", len(videos_with_faces))


Videos with human faces detected: ['video_67.mp4', 'video_147.mp4', 'video_264.mp4', 'video_220.mp4', 'video_148.mp4', 'video_57.mp4', 'video_218.mp4', 'video_199.mp4', 'video_82.mp4', 'video_8.mp4', 'video_156.mp4', 'video_109.mp4', 'video_122.mp4', 'video_58.mp4', 'video_131.mp4', 'video_123.mp4', 'video_98.mp4', 'video_3.mp4', 'video_216.mp4', 'video_108.mp4', 'video_100.mp4', 'video_43.mp4', 'video_166.mp4', 'video_255.mp4', 'video_194.mp4', 'video_202.mp4', 'video_193.mp4', 'video_90.mp4', 'video_12.mp4', 'video_6.mp4', 'video_204.mp4', 'video_169.mp4', 'video_254.mp4', 'video_171.mp4', 'video_263.mp4', 'video_83.mp4', 'video_173.mp4', 'video_211.mp4', 'video_105.mp4', 'video_221.mp4', 'video_168.mp4', 'video_152.mp4', 'video_191.mp4', 'video_203.mp4', 'video_39.mp4', 'video_196.mp4', 'video_102.mp4', 'video_256.mp4', 'video_259.mp4', 'video_126.mp4', 'video_183.mp4', 'video_260.mp4', 'video_145.mp4', 'video_32.mp4', 'video_42.mp4', 'video_94.mp4', 'video_139.mp4', 'video_266.mp4'

In [15]:
import shutil

# Create a directory to store videos with faces
output_folder = '/content/videos_with_faces'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Move the videos with faces to the new folder
for video_file in videos_with_faces:
    video_path = os.path.join(video_folder, video_file)
    new_video_path = os.path.join(output_folder, video_file)

    # Move the video to the new folder
    shutil.move(video_path, new_video_path)

# Now the videos are moved to the 'videos_with_faces' folder.
print(f"Moved {len(videos_with_faces)} videos to {output_folder}")


Moved 64 videos to /content/videos_with_faces


In [16]:
# Path to the folder where videos with faces are stored
output_folder = '/content/videos_with_faces'

# List all video files in the output folder
videos_in_faces_folder = os.listdir(output_folder)

# Filter only video files (optional, based on file extensions)
video_extensions = ['.mp4', '.mov', '.avi', '.mkv']  # Adjust extensions as needed
videos_in_faces_folder = [video for video in videos_in_faces_folder if any(video.endswith(ext) for ext in video_extensions)]

# Get the number of videos
num_videos_with_faces = len(videos_in_faces_folder)

# Print the number of videos
print(f"Total number of videos with faces: {num_videos_with_faces}")


Total number of videos with faces: 64


# STEP 3 : FINDING INFLUENCERS AND THEIR PERFOMANCE AVERAGE

In [17]:
pip install face_recognition opencv-python scikit-learn


Collecting face_recognition
  Downloading face_recognition-1.3.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting face-recognition-models>=0.3.0 (from face_recognition)
  Downloading face_recognition_models-0.3.0.tar.gz (100.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.1/100.1 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading face_recognition-1.3.0-py2.py3-none-any.whl (15 kB)
Building wheels for collected packages: face-recognition-models
  Building wheel for face-recognition-models (setup.py) ... [?25l[?25hdone
  Created wheel for face-recognition-models: filename=face_recognition_models-0.3.0-py2.py3-none-any.whl size=100566162 sha256=e0ce5a5274cc4e35ed1d6d9c887f405d0f896a1794595327f3694eb7eef49ccc
  Stored in directory: /root/.cache/pip/wheels/7a/eb/cf/e9eced74122b679557f597bb7c8e4c739cfcac526db1fd523d
Successfully built face-recognition-models
Installing collected packages: face-recog

In [29]:
!apt-get update
!apt-get install -y cmake libboost-python-dev libboost-thread-dev


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,224 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,454 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packa

In [38]:
!pip install mediapipe

Collecting mediapipe
  Downloading mediapipe-0.10.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.1/36.1 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.1-py3-none-any.whl (32 kB)
Installing collected packages: sounddevice, mediapipe
Successfully installed mediapipe-0.10.18 sounddevice-0.5.1


In [40]:
import os
import cv2
import mediapipe as mp
from tqdm import tqdm

# Mediapipe setup
mp_face_detection = mp.solutions.face_detection
face_detection = mp_face_detection.FaceDetection(model_selection=1, min_detection_confidence=0.5)

# Paths
videos_with_faces_folder = "/content/videos_with_faces"
output_folder = "/content/influencers"
os.makedirs(output_folder, exist_ok=True)

# Dictionary to store influencer groups
influencer_groups = {}
influencer_count = 0  # Initialize influencer_count here

def process_video(video_path):
    """
    Processes a video to detect faces and return bounding box coordinates.
    """
    video_capture = cv2.VideoCapture(video_path)
    frame_count = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_skip = max(frame_count // 10, 1)  # Process 10 evenly spaced frames

    face_data = []
    frame_idx = 0
    while video_capture.isOpened():
        ret, frame = video_capture.read()
        if not ret:
            break
        frame_idx += 1
        if frame_idx % frame_skip != 0:
            continue

        # Convert the frame to RGB
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Detect faces
        results = face_detection.process(frame_rgb)
        if results.detections:
            face_data.append(results.detections)

    video_capture.release()
    return face_data

def main():
    video_files = [f for f in os.listdir(videos_with_faces_folder) if f.endswith(".mp4")]

    print(f"[Info] Found {len(video_files)} videos to process.")

    for video_file in tqdm(video_files, desc="Processing Videos"):
        video_path = os.path.join(videos_with_faces_folder, video_file)
        print(f"[Info] Processing: {video_file}")

        # Extract face data from the current video
        face_data = process_video(video_path)

        # Assign video to an influencer group
        assigned_to_group = False
        for group_name, group_data in influencer_groups.items():
            if len(face_data) == len(group_data):  # Rough heuristic for similarity
                influencer_groups[group_name].append(video_file)
                assigned_to_group = True
                break

        if not assigned_to_group:
            global influencer_count  # Use global variable to modify influencer_count
            influencer_count += 1
            new_group_name = f"Influencer_{influencer_count}"
            influencer_groups[new_group_name] = [video_file]

    print("[Info] Grouping completed!")
    for group, videos in influencer_groups.items():
        print(f"{group}: {', '.join(videos)}")

if __name__ == "__main__":
    main()


[Info] Found 64 videos to process.


Processing Videos:   0%|          | 0/64 [00:00<?, ?it/s]

[Info] Processing: video_67.mp4


Processing Videos:   2%|▏         | 1/64 [00:03<03:51,  3.68s/it]

[Info] Processing: video_147.mp4


Processing Videos:   3%|▎         | 2/64 [00:04<02:13,  2.15s/it]

[Info] Processing: video_264.mp4


Processing Videos:   5%|▍         | 3/64 [00:06<02:08,  2.11s/it]

[Info] Processing: video_220.mp4


Processing Videos:   6%|▋         | 4/64 [00:08<01:45,  1.76s/it]

[Info] Processing: video_148.mp4


Processing Videos:   8%|▊         | 5/64 [00:09<01:37,  1.65s/it]

[Info] Processing: video_57.mp4


Processing Videos:   9%|▉         | 6/64 [00:11<01:44,  1.81s/it]

[Info] Processing: video_218.mp4


Processing Videos:  11%|█         | 7/64 [00:14<02:00,  2.12s/it]

[Info] Processing: video_199.mp4


Processing Videos:  12%|█▎        | 8/64 [00:16<02:06,  2.26s/it]

[Info] Processing: video_82.mp4


Processing Videos:  14%|█▍        | 9/64 [00:17<01:38,  1.78s/it]

[Info] Processing: video_8.mp4


Processing Videos:  16%|█▌        | 10/64 [00:19<01:29,  1.65s/it]

[Info] Processing: video_156.mp4


Processing Videos:  17%|█▋        | 11/64 [00:20<01:19,  1.49s/it]

[Info] Processing: video_109.mp4


Processing Videos:  19%|█▉        | 12/64 [00:21<01:20,  1.56s/it]

[Info] Processing: video_122.mp4


Processing Videos:  20%|██        | 13/64 [00:23<01:15,  1.48s/it]

[Info] Processing: video_58.mp4


Processing Videos:  22%|██▏       | 14/64 [00:23<01:03,  1.27s/it]

[Info] Processing: video_131.mp4


Processing Videos:  23%|██▎       | 15/64 [00:25<01:00,  1.24s/it]

[Info] Processing: video_123.mp4


Processing Videos:  25%|██▌       | 16/64 [00:26<00:57,  1.21s/it]

[Info] Processing: video_98.mp4


Processing Videos:  27%|██▋       | 17/64 [00:28<01:09,  1.48s/it]

[Info] Processing: video_3.mp4


Processing Videos:  28%|██▊       | 18/64 [00:31<01:35,  2.08s/it]

[Info] Processing: video_216.mp4


Processing Videos:  30%|██▉       | 19/64 [00:34<01:45,  2.35s/it]

[Info] Processing: video_108.mp4


Processing Videos:  31%|███▏      | 20/64 [00:35<01:27,  1.98s/it]

[Info] Processing: video_100.mp4


Processing Videos:  33%|███▎      | 21/64 [00:38<01:30,  2.11s/it]

[Info] Processing: video_43.mp4


Processing Videos:  34%|███▍      | 22/64 [00:39<01:22,  1.95s/it]

[Info] Processing: video_166.mp4


Processing Videos:  36%|███▌      | 23/64 [00:42<01:24,  2.06s/it]

[Info] Processing: video_255.mp4


Processing Videos:  38%|███▊      | 24/64 [00:43<01:16,  1.91s/it]

[Info] Processing: video_194.mp4


Processing Videos:  39%|███▉      | 25/64 [00:46<01:18,  2.01s/it]

[Info] Processing: video_202.mp4


Processing Videos:  41%|████      | 26/64 [00:48<01:17,  2.05s/it]

[Info] Processing: video_193.mp4


Processing Videos:  42%|████▏     | 27/64 [00:49<01:06,  1.80s/it]

[Info] Processing: video_90.mp4


Processing Videos:  44%|████▍     | 28/64 [00:50<00:59,  1.65s/it]

[Info] Processing: video_12.mp4


Processing Videos:  45%|████▌     | 29/64 [00:52<01:00,  1.73s/it]

[Info] Processing: video_6.mp4


Processing Videos:  47%|████▋     | 30/64 [00:54<01:03,  1.86s/it]

[Info] Processing: video_204.mp4


Processing Videos:  48%|████▊     | 31/64 [00:55<00:51,  1.56s/it]

[Info] Processing: video_169.mp4


Processing Videos:  50%|█████     | 32/64 [00:57<00:54,  1.71s/it]

[Info] Processing: video_254.mp4


Processing Videos:  52%|█████▏    | 33/64 [00:59<00:51,  1.66s/it]

[Info] Processing: video_171.mp4


Processing Videos:  53%|█████▎    | 34/64 [01:02<00:59,  1.99s/it]

[Info] Processing: video_263.mp4


Processing Videos:  55%|█████▍    | 35/64 [01:02<00:48,  1.67s/it]

[Info] Processing: video_83.mp4


Processing Videos:  56%|█████▋    | 36/64 [01:04<00:49,  1.77s/it]

[Info] Processing: video_173.mp4


Processing Videos:  58%|█████▊    | 37/64 [01:06<00:43,  1.62s/it]

[Info] Processing: video_211.mp4


Processing Videos:  59%|█████▉    | 38/64 [01:06<00:32,  1.27s/it]

[Info] Processing: video_105.mp4


Processing Videos:  61%|██████    | 39/64 [01:08<00:38,  1.54s/it]

[Info] Processing: video_221.mp4


Processing Videos:  62%|██████▎   | 40/64 [01:10<00:37,  1.56s/it]

[Info] Processing: video_168.mp4


Processing Videos:  64%|██████▍   | 41/64 [01:15<00:56,  2.47s/it]

[Info] Processing: video_152.mp4


Processing Videos:  66%|██████▌   | 42/64 [01:16<00:50,  2.30s/it]

[Info] Processing: video_191.mp4


Processing Videos:  67%|██████▋   | 43/64 [01:18<00:41,  1.99s/it]

[Info] Processing: video_203.mp4


Processing Videos:  69%|██████▉   | 44/64 [01:19<00:36,  1.82s/it]

[Info] Processing: video_39.mp4


Processing Videos:  70%|███████   | 45/64 [01:22<00:42,  2.24s/it]

[Info] Processing: video_196.mp4


Processing Videos:  72%|███████▏  | 46/64 [01:25<00:44,  2.45s/it]

[Info] Processing: video_102.mp4


Processing Videos:  73%|███████▎  | 47/64 [01:28<00:44,  2.64s/it]

[Info] Processing: video_256.mp4


Processing Videos:  75%|███████▌  | 48/64 [01:30<00:37,  2.37s/it]

[Info] Processing: video_259.mp4


Processing Videos:  77%|███████▋  | 49/64 [01:32<00:34,  2.33s/it]

[Info] Processing: video_126.mp4


Processing Videos:  78%|███████▊  | 50/64 [01:35<00:33,  2.38s/it]

[Info] Processing: video_183.mp4


Processing Videos:  80%|███████▉  | 51/64 [01:36<00:27,  2.13s/it]

[Info] Processing: video_260.mp4


Processing Videos:  81%|████████▏ | 52/64 [01:38<00:23,  1.99s/it]

[Info] Processing: video_145.mp4


Processing Videos:  83%|████████▎ | 53/64 [01:41<00:23,  2.13s/it]

[Info] Processing: video_32.mp4


Processing Videos:  84%|████████▍ | 54/64 [01:44<00:24,  2.46s/it]

[Info] Processing: video_42.mp4


Processing Videos:  86%|████████▌ | 55/64 [01:45<00:18,  2.08s/it]

[Info] Processing: video_94.mp4


Processing Videos:  88%|████████▊ | 56/64 [01:46<00:13,  1.73s/it]

[Info] Processing: video_139.mp4


Processing Videos:  89%|████████▉ | 57/64 [01:48<00:12,  1.84s/it]

[Info] Processing: video_266.mp4


Processing Videos:  91%|█████████ | 58/64 [01:51<00:13,  2.25s/it]

[Info] Processing: video_36.mp4


Processing Videos:  92%|█████████▏| 59/64 [01:52<00:09,  1.84s/it]

[Info] Processing: video_208.mp4


Processing Videos:  94%|█████████▍| 60/64 [01:54<00:07,  1.84s/it]

[Info] Processing: video_214.mp4


Processing Videos:  95%|█████████▌| 61/64 [01:55<00:05,  1.71s/it]

[Info] Processing: video_16.mp4


Processing Videos:  97%|█████████▋| 62/64 [01:58<00:03,  1.89s/it]

[Info] Processing: video_252.mp4


Processing Videos:  98%|█████████▊| 63/64 [02:00<00:02,  2.01s/it]

[Info] Processing: video_27.mp4


Processing Videos: 100%|██████████| 64/64 [02:00<00:00,  1.89s/it]

[Info] Grouping completed!
Influencer_1: video_67.mp4, video_8.mp4, video_98.mp4, video_202.mp4, video_90.mp4, video_12.mp4, video_169.mp4
Influencer_2: video_147.mp4, video_122.mp4, video_3.mp4, video_152.mp4, video_139.mp4, video_266.mp4, video_16.mp4
Influencer_3: video_264.mp4, video_166.mp4, video_6.mp4, video_256.mp4
Influencer_4: video_220.mp4, video_193.mp4, video_203.mp4, video_183.mp4
Influencer_5: video_148.mp4, video_171.mp4, video_39.mp4
Influencer_6: video_57.mp4, video_173.mp4, video_102.mp4
Influencer_7: video_218.mp4, video_191.mp4, video_260.mp4
Influencer_8: video_199.mp4, video_259.mp4, video_32.mp4
Influencer_9: video_82.mp4
Influencer_10: video_156.mp4
Influencer_11: video_109.mp4
Influencer_12: video_58.mp4
Influencer_13: video_131.mp4
Influencer_14: video_123.mp4
Influencer_15: video_216.mp4
Influencer_16: video_108.mp4
Influencer_17: video_100.mp4
Influencer_18: video_43.mp4
Influencer_19: video_255.mp4
Influencer_20: video_194.mp4
Influencer_21: video_204.mp4





In [46]:
performace = data['Performance']
performace.head()
performace.iloc[105]

1.766256973

In [48]:
!pip install ffmpeg-python


Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [52]:
import pandas as pd

# Assuming 'data' is your DataFrame that contains the performance column, and 'df_sorted' contains the influencer-video relationships.

# 1. Get the video indices for each influencer (using the video numbers you've already classified)
influencers_videos = {
    "Influencer_1": ["video_67", "video_8", "video_98", "video_202", "video_90", "video_12", "video_169"],
    "Influencer_2": ["video_147", "video_122", "video_3", "video_152", "video_139", "video_266", "video_16"],
    "Influencer_3": ["video_264", "video_166", "video_6", "video_256"],
    "Influencer_4": ["video_220", "video_193", "video_203", "video_183"],
    "Influencer_5": ["video_148", "video_171", "video_39"],
    "Influencer_6": ["video_57", "video_173", "video_102"],
    "Influencer_7": ["video_218", "video_191", "video_260"],
    "Influencer_8": ["video_199", "video_259", "video_32"],
    "Influencer_9": ["video_82"],
    "Influencer_10": ["video_156"],
    "Influencer_11": ["video_109"],
    "Influencer_12": ["video_58"],
    "Influencer_13": ["video_131"],
    "Influencer_14": ["video_123"],
    "Influencer_15": ["video_216"],
    "Influencer_16": ["video_108"],
    "Influencer_17": ["video_100"],
    "Influencer_18": ["video_43"],
    "Influencer_19": ["video_255"],
    "Influencer_20": ["video_194"],
    "Influencer_21": ["video_204"],
    "Influencer_22": ["video_254"],
    "Influencer_23": ["video_263"],
    "Influencer_24": ["video_83"],
    "Influencer_25": ["video_211"],
    "Influencer_26": ["video_105"],
    "Influencer_27": ["video_221"],
    "Influencer_28": ["video_168"],
    "Influencer_29": ["video_196"],
    "Influencer_30": ["video_126"],
    "Influencer_31": ["video_145"],
    "Influencer_32": ["video_42"],
    "Influencer_33": ["video_94"],
    "Influencer_34": ["video_36"],
    "Influencer_35": ["video_208"],
    "Influencer_36": ["video_214"],
    "Influencer_37": ["video_252"],
    "Influencer_38": ["video_27"]
}

# 2. Extract performance data for the videos of interest
# Assuming 'data' contains the performance data, and the index corresponds to video numbers

# Create a dictionary to store influencer performance and video count
influencer_performance = {}

for influencer, videos in influencers_videos.items():
    # Extract the indices for these videos from the 'data' DataFrame
    video_indices = [int(video.split('_')[1].split('.mp4')[0]) for video in videos]  # Extract video numbers from 'video_xx.mp4'
    video_performance = data.iloc[video_indices]['Performance']  # Get the performance for these videos

    # Calculate the average performance for this influencer
    avg_performance = video_performance.mean()

    # Count the number of videos for this influencer
    num_videos = len(videos)

    # Store the result (No image path here, only performance and video count)
    influencer_performance[influencer] = {
        "Num Videos": num_videos,
        "Average Performance": avg_performance
    }

# 3. Convert the results into a DataFrame for easy visualization
performance_df = pd.DataFrame(influencer_performance).T  # Transpose to have influencer names as rows
performance_df = performance_df.sort_values(by="Average Performance", ascending=False)  # Sort by average performance

# 4. Display the table with index (Influencer names), number of videos, and average performance
performance_df.index.name = 'Influencer'  # Set the name of the index to "Influencer"
print(performance_df)


               Num Videos  Average Performance
Influencer                                    
Influencer_26         1.0             1.766257
Influencer_5          3.0             1.663886
Influencer_14         1.0             1.618918
Influencer_23         1.0             1.575395
Influencer_9          1.0             1.530400
Influencer_35         1.0             1.439201
Influencer_25         1.0             1.423948
Influencer_38         1.0             1.393000
Influencer_1          7.0             1.292985
Influencer_3          4.0             1.291776
Influencer_21         1.0             1.269243
Influencer_11         1.0             1.079063
Influencer_22         1.0             1.047132
Influencer_36         1.0             1.039207
Influencer_31         1.0             0.980524
Influencer_33         1.0             0.966400
Influencer_6          3.0             0.878080
Influencer_8          3.0             0.864933
Influencer_2          7.0             0.861212
Influencer_4 