In [1]:
import os

import pandas as pd
import requests
from sklearn.preprocessing import LabelEncoder

In [5]:
# Read the YouTube Data API key from the environment instead of embedding it in
# the notebook, where anyone with access to the file (or its history) can read it.
# NOTE(review): the key that used to be hard-coded here should be treated as
# compromised and revoked.
# If YOUTUBE_API_KEY is unset the key is empty; requests made with it are
# expected to be rejected by the API.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
# Define API URL to fetch trending videos
URL = f"https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&chart=mostPopular&regionCode=US&maxResults=50&key={API_KEY}"

def fetch_trending_videos(url=None, timeout=10):
    """Fetch trending videos from YouTube API.

    Args:
        url: Endpoint to request. Defaults to the module-level ``URL``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON payload when the server answers with HTTP 200 and a
        valid JSON body; otherwise ``None`` after printing a short diagnostic.
    """
    target = URL if url is None else url

    try:
        response = requests.get(target, timeout=timeout)
    except requests.RequestException as exc:
        # Only the exception type is printed: the exception text may include
        # the request URL, which carries the API key as a query parameter.
        print(f"Error fetching data: {type(exc).__name__}")
        return None

    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated like any other failure.
        print("Error fetching data: response body is not valid JSON")
        return None

# Fetch data: `data` is the decoded JSON response, or None if the request failed
data = fetch_trending_videos()

In [7]:
def extract_video_data(data):
    """Extract relevant fields from the JSON response and store them in a DataFrame.

    Args:
        data: Decoded API response (a dict whose ``"items"`` entry is a list
            of video records), or a falsy value when nothing was fetched.

    Returns:
        A DataFrame with one row per item and the columns ``video_id``,
        ``title``, ``channel``, ``published_date``, ``category_id``,
        ``views``, ``likes`` and ``comments``. The columns are present even
        when there are no rows, so callers can select them without first
        checking for an empty result. Snippet fields missing from an item are
        stored as missing values; missing counters default to 0.
    """
    columns = [
        "video_id",
        "title",
        "channel",
        "published_date",
        "category_id",
        "views",
        "likes",
        "comments",
    ]

    if not data:
        return pd.DataFrame(columns=columns)

    video_list = []
    for video in data.get("items", []):
        # Tolerate items that lack a section or a field instead of raising
        # KeyError and discarding the whole response.
        snippet = video.get("snippet", {})
        statistics = video.get("statistics", {})
        video_list.append(
            {
                "video_id": video.get("id"),
                "title": snippet.get("title"),
                "channel": snippet.get("channelTitle"),
                "published_date": snippet.get("publishedAt"),
                "category_id": snippet.get("categoryId"),
                "views": statistics.get("viewCount", 0),
                "likes": statistics.get("likeCount", 0),
                "comments": statistics.get("commentCount", 0),
            }
        )

    # Passing `columns` fixes the schema and column order even for an empty list.
    return pd.DataFrame(video_list, columns=columns)

# Convert API response into a DataFrame (one row per item in the response;
# an empty DataFrame when `data` is falsy)
df = extract_video_data(data)

In [9]:
# Convert numeric columns to integers.
# Unparseable values become NaN and are then counted as 0. The dtype is spelled
# out as int64 because plain `int` is platform-dependent and may be 32-bit,
# which is too small for very large view counts.
NUMERIC_COLUMNS = ["views", "likes", "comments"]
for column in NUMERIC_COLUMNS:
    df[column] = pd.to_numeric(df[column], errors="coerce").fillna(0).astype("int64")

# Convert published_date to datetime format
df["published_date"] = pd.to_datetime(df["published_date"])

# Handle missing values (replace empty fields with "Unknown").
# Only the text columns are filled, so the placeholder string cannot end up in
# the datetime column; the numeric columns were already filled above.
TEXT_COLUMNS = ["video_id", "title", "channel", "category_id"]
df[TEXT_COLUMNS] = df[TEXT_COLUMNS].fillna("Unknown")


In [11]:
# Encode 'channel' and 'category_id' using Label Encoding.
# Each column gets its own encoder: refitting a single shared encoder would
# overwrite the first column's fitted classes, so channel codes could no longer
# be mapped back to channel names.
channel_encoder = LabelEncoder()
df["channel_encoded"] = channel_encoder.fit_transform(df["channel"])

category_encoder = LabelEncoder()
df["category_encoded"] = category_encoder.fit_transform(df["category_id"])

# Keep the original name bound to the last-fitted encoder for any code that
# still refers to it.
encoder = category_encoder


In [13]:

# Save the DataFrame to a CSV file.
# The file name is defined once so the path written and the path reported in
# the completion message cannot drift apart.
OUTPUT_CSV = "youtube_trending_videos.csv"
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"ETL process completed! Data saved to {OUTPUT_CSV}")


ETL process completed! Data saved to youtube_trending_videos.csv
