<a href="https://colab.research.google.com/github/Kathushan12/Youtube-vid-analize/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Set Up the Environment

In [None]:
# Huggingface & video tools
!pip install transformers datasets
!pip install pytube opencv-python pydub librosa --quiet


Get YouTube Videos

In [None]:
from pytube import Search
import pandas as pd

def search_youtube_videos(query, max_results=5):
    """Search YouTube for ``query`` and return the top hits as a DataFrame.

    Args:
        query: Search phrase handed to ``Search``.
        max_results: Maximum number of results to keep (default 5).

    Returns:
        A pandas DataFrame with one row per video and the columns ``title``
        and ``url``. Both columns are present even when nothing was found.
    """
    # Treat a missing result list the same as an empty one.
    hits = (Search(query).results or [])[:max_results]
    records = [{"title": hit.title, "url": hit.watch_url} for hit in hits]

    # Pinning ``columns`` keeps the schema stable for an empty result set;
    # without it an empty list yields a DataFrame that has no columns at all.
    return pd.DataFrame(records, columns=["title", "url"])

# Example: run one search and show the resulting table as the cell output.
query = "ai generated face"
video_df = search_youtube_videos(query=query)
video_df


Download 1 Video

In [None]:
import yt_dlp

def download_video(url, save_path="/content/videos"):
    """Download the video at ``url`` with yt-dlp and return its local path.

    Args:
        url: Address of the video to download.
        save_path: Directory prefix used in the output template; the file is
            named after the video's title and extension.

    Returns:
        The filename yt-dlp derives from the extracted video info.
    """
    options = dict(
        format='best',
        outtmpl=f'{save_path}/%(title)s.%(ext)s',
        quiet=True,
    )

    with yt_dlp.YoutubeDL(options) as downloader:
        # download=True fetches the file as well as its metadata.
        metadata = downloader.extract_info(url, download=True)
        return downloader.prepare_filename(metadata)

# Test: download one fixed video into /content/videos.
# `os` is imported here because no earlier cell imports it and os.makedirs
# would otherwise raise NameError on a fresh kernel.
import os

os.makedirs("/content/videos", exist_ok=True)
video_path = download_video("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
print("Downloaded video path:", video_path)


Analyze Video Quality using a Vision Transformer (ViT)

In [None]:
from transformers import pipeline
from PIL import Image
import cv2

# Extract frame
def extract_frame(video_path):
    """Read the first frame of a video and return it as a PIL image.

    Args:
        video_path: Path of the video file opened with OpenCV.

    Returns:
        A ``PIL.Image`` built from the first frame after BGR-to-RGB
        conversion, or ``None`` when no frame could be read.
    """
    capture = cv2.VideoCapture(video_path)
    ok, bgr_frame = capture.read()
    # The capture is released before the result is inspected.
    capture.release()

    if not ok:
        return None

    rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb_frame)

frame = extract_frame(video_path)
if frame is None:
    # extract_frame returns None when no frame could be read; stop here with
    # a clear message instead of failing inside the classifier call.
    raise RuntimeError(f"Could not read a frame from video: {video_path}")

# Use Hugging Face ViT model to classify the extracted frame
image_classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
image_result = image_classifier(frame)
image_result


Audio Quality + Classification

In [None]:
from pydub import AudioSegment
import librosa
import soundfile as sf

# Extract Audio
def extract_audio(video_path, audio_path="/content/audio.wav"):
    """Export the audio of ``video_path`` as a WAV file.

    Args:
        video_path: Media file loaded through ``AudioSegment.from_file``.
        audio_path: Destination of the exported WAV file.

    Returns:
        ``audio_path``, unchanged, so the caller can chain on it.
    """
    AudioSegment.from_file(video_path).export(audio_path, format="wav")
    return audio_path

# Export the downloaded video's audio to the default WAV path and keep that path.
audio_path = extract_audio(video_path=video_path)

# Check audio quality (RMS)
def audio_rms(audio_path):
    """Return the mean RMS energy of an audio file.

    Args:
        audio_path: Audio file loaded with ``librosa.load`` using its
            default settings.

    Returns:
        The mean of the values produced by ``librosa.feature.rms``.
    """
    # The sample rate returned by the loader is not needed for the RMS.
    samples, _sample_rate = librosa.load(audio_path)
    rms_values = librosa.feature.rms(y=samples)
    return rms_values.mean()

# Score the extracted audio and report the value.
rms_score = audio_rms(audio_path=audio_path)
print("Audio RMS Score:", rms_score)


Audio Deepfake Detection (placeholder: currently runs a speaker-identification model)

In [None]:
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
import torchaudio
import torch

# Load the feature extractor and the classifier from the same checkpoint
# (Wav2Vec2FeatureExtractor is used here, NOT AutoProcessor).
# NOTE(review): the checkpoint name ends in "-sid" and the prediction is
# printed as a speaker label below, so this looks like speaker identification
# rather than a real/fake classifier — confirm the intended model.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-sid")
model = AutoModelForAudioClassification.from_pretrained("superb/wav2vec2-base-superb-sid")

# Read the extracted WAV file together with its sample rate.
speech_array, sampling_rate = torchaudio.load(audio_path)

# Bring the signal to 16 kHz whenever the file uses any other rate.
if sampling_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
    speech_array = resampler(speech_array)
    sampling_rate = 16000

# Average over dim 0 (treated as the channel axis) when it has more than one
# entry, keeping that dimension so the shape stays two-dimensional.
if speech_array.size(0) > 1:
    speech_array = speech_array.mean(dim=0, keepdim=True)

# Drop singleton dimensions and hand the samples over as a NumPy array.
waveform = speech_array.squeeze().numpy()
inputs = feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()

# Translate the winning class index into the model's label.
label = model.config.id2label[predicted_class_id]
print("🎙️ Predicted Speaker Label:", label)


Final Ranking & Display

In [None]:
# Collect the outputs of the earlier cells into a single summary row.
# NOTE(review): the title and URL are taken from the first search hit, while
# the frame and audio were computed from `video_path` — confirm that both
# refer to the same video.
top_hit = video_df.iloc[0]

result = {
    "Title": top_hit["title"],
    "Video URL": top_hit["url"],
    # Only the first two classifier entries are kept.
    "Visual Tags": image_result[:2],
    "Audio RMS": rms_score,
    # This value is the argmax class index of the audio classifier, whose
    # label is printed as a speaker label; it is not a real/fake probability,
    # so the column is named for what it actually holds.
    "Predicted Speaker ID": predicted_class_id,
}

pd.DataFrame([result])
