In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Path to the raw data file, relative to the notebook's location.
file_path = 'data/youtube_data.csv'

# Load the dataset. Only the read itself is guarded, so that an unrelated
# failure in the inspection code below is never mistaken for a missing file.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: '{file_path}' not found. Please ensure the file is in the 'data' subfolder.")
    # Re-raise: every later cell needs `df`, so continuing would only replace
    # this clear error with a confusing NameError further down the notebook.
    raise

# --- Print Data Structure Information ---
print("--- Dataset Info ---")
df.info()

# --- Print First 5 Rows ---
print("\n--- First 5 Rows of the Dataset ---")
print(df.head())

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   video_id        600 non-null    object 
 1   title           600 non-null    object 
 2   description     600 non-null    object 
 3   published_date  600 non-null    object 
 4   channel_id      600 non-null    object 
 5   channel_title   600 non-null    object 
 6   tags            600 non-null    object 
 7   category_id     600 non-null    int64  
 8   view_count      600 non-null    float64
 9   like_count      600 non-null    float64
 10  comment_count   600 non-null    float64
 11  duration        600 non-null    object 
 12  thumbnail       600 non-null    object 
dtypes: float64(3), int64(1), object(9)
memory usage: 61.1+ KB

--- First 5 Rows of the Dataset ---
      video_id                                              title  \
0  gsJAlLOFBv0                

In [3]:
# Render the first five rows as a Markdown table, then print that text.
head_markdown = df.head().to_markdown()
print(head_markdown)

|    | video_id    | title                                                                        | description                                                                                                                                                                                                          | published_date       | channel_id               | channel_title   | tags                                                                                                                                                                                                                                                                                                                                                                                                                                 |   category_id |       view_count |       like_count |   comment_count | duration   | thumbnail                                      |
|---:|:------------|:---------------------------------------

In [4]:
import pandas as pd
import re # We'll need the regular expressions module to parse the duration

# ISO 8601 duration with optional week/day parts and an optional time part,
# e.g. 'PT4M13S', 'PT1H2M3S' or 'P1DT2H'. Groups: weeks, days, hours, minutes, seconds.
_ISO8601_DURATION = re.compile(
    r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?'
)


def _parse_iso8601_duration(duration_str) -> int:
    """Convert an ISO 8601 duration string to a whole number of seconds.

    Returns 0 for non-string values (e.g. NaN) and for strings that do not
    start with a recognisable duration.
    """
    if not isinstance(duration_str, str):
        return 0
    match = _ISO8601_DURATION.match(duration_str)
    if not match:
        return 0
    weeks, days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    return (weeks * 7 + days) * 86400 + hours * 3600 + minutes * 60 + seconds


def _count_tags(tags_str) -> int:
    """Count the tags in a string that holds a Python list literal.

    Returns 0 when the value is not a string, cannot be parsed as a literal,
    or parses to something other than a list/tuple/set (for example a bare
    number or a quoted string, whose length would not be a tag count).
    """
    import ast  # local import keeps this cell runnable on its own

    if not isinstance(tags_str, str):
        return 0
    try:
        # ast.literal_eval only evaluates literals, unlike eval().
        parsed = ast.literal_eval(tags_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return 0
    return len(parsed) if isinstance(parsed, (list, tuple, set)) else 0


def clean_and_feature_engineer(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the raw YouTube dataset and engineers new features for modeling.

    Steps performed on a copy of ``df`` (the input frame is not modified):

    1. Drop ``video_id``, ``channel_id``, ``description`` and ``thumbnail``.
    2. Cast ``view_count``, ``like_count`` and ``comment_count`` to ``int64``.
    3. Parse ``published_date`` and derive ``publish_day_of_week``
       (Monday=0, Sunday=6) and ``publish_hour``.
    4. Parse the ISO 8601 ``duration`` into ``duration_seconds``.
    5. Count the entries of the list literal in ``tags`` into ``tag_count``.
    6. Drop the source columns ``published_date``, ``duration`` and ``tags``.

    Args:
        df: The raw DataFrame. Must contain every column named above.

    Returns:
        A cleaned DataFrame with new features.

    Raises:
        KeyError: If an expected column is missing.
        ValueError: If a count column contains NaN/inf, which cannot be cast
            to an integer dtype.
    """
    # --- 1. Work on a copy so the caller's frame is left untouched ---
    df_clean = df.copy()

    # --- 2. Drop identifiers, free text and URLs that are not used as features ---
    columns_to_drop = ['video_id', 'channel_id', 'description', 'thumbnail']
    df_clean = df_clean.drop(columns=columns_to_drop)

    # --- 3. Correct data types for numeric columns ---
    # Explicit int64: plain 'int' is 32-bit on some platforms (e.g. Windows
    # with NumPy < 2), which cannot hold view counts above ~2.1 billion.
    for col in ['view_count', 'like_count', 'comment_count']:
        df_clean[col] = df_clean[col].astype('int64')

    # --- 4. Feature engineering from 'published_date' ---
    # errors='coerce' turns unparseable dates into NaT instead of raising.
    df_clean['published_date'] = pd.to_datetime(df_clean['published_date'], errors='coerce')
    df_clean['publish_day_of_week'] = df_clean['published_date'].dt.dayofweek  # Monday=0, Sunday=6
    df_clean['publish_hour'] = df_clean['published_date'].dt.hour

    # --- 5. Feature engineering from 'duration' (ISO 8601 format) ---
    df_clean['duration_seconds'] = df_clean['duration'].apply(_parse_iso8601_duration)

    # --- 6. Feature engineering from 'tags' ---
    # 'tags' holds a string representation of a list; only its length is kept.
    df_clean['tag_count'] = df_clean['tags'].apply(_count_tags)

    # --- 7. Final cleanup ---
    # Drop the original columns that have been replaced by engineered features.
    df_clean = df_clean.drop(columns=['published_date', 'duration', 'tags'])

    return df_clean

# --- Run the cleaning step on the raw frame and look at what came out ---
df_processed = df.pipe(clean_and_feature_engineer)

# Column names, dtypes and non-null counts of the processed frame.
print("--- Processed DataFrame Info ---")
df_processed.info()

# First five processed rows, rendered as a Markdown table.
print("\n--- First 5 Rows of Processed DataFrame (Markdown) ---")
processed_preview = df_processed.head().to_markdown()
print(processed_preview)

--- Processed DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                600 non-null    object
 1   channel_title        600 non-null    object
 2   category_id          600 non-null    int64 
 3   view_count           600 non-null    int64 
 4   like_count           600 non-null    int64 
 5   comment_count        600 non-null    int64 
 6   publish_day_of_week  600 non-null    int32 
 7   publish_hour         600 non-null    int32 
 8   duration_seconds     600 non-null    int64 
 9   tag_count            600 non-null    int64 
dtypes: int32(2), int64(6), object(2)
memory usage: 42.3+ KB

--- First 5 Rows of Processed DataFrame (Markdown) ---
|    | title                                                                        | channel_title   |   category_id |   view_count |   like_count |   comme

In [5]:
# --- 8. Define Features (X) and Target (y) ---
# Target: 'view_count'.
y = df_processed['view_count']

# Features: everything except the target and the raw 'title' text
# (the title would need NLP-style processing to be usable).
X = df_processed.drop(columns=['view_count', 'title'])

# Columns to be one-hot encoded. 'category_id' is an integer code, but it is
# deliberately treated as a category rather than as a quantity.
categorical_features = ['channel_title', 'category_id']

# Columns to be treated as numeric.
# NOTE(review): 'like_count' and 'comment_count' are presumably only known
# after a video has accumulated views; confirm they are available at
# prediction time, otherwise they leak the target.
numerical_features = [
    'like_count',
    'comment_count',
    'publish_day_of_week',
    'publish_hour',
    'duration_seconds',
    'tag_count',
]

In [6]:
# --- 9. Create a Preprocessing and Modeling Pipeline ---

# One (name, transformer, columns) entry per column group:
# numeric columns are standardized, categorical columns are one-hot encoded.
# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were not present when it was fitted.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

# Route each column group through its transformer; any column not listed in
# either group is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

# Baseline model: a random forest with a fixed seed, using all available cores.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Chain preprocessing and the regressor so that fit and predict both apply
# the same preprocessing steps.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', model),
    ]
)

In [7]:
# --- 10. Split Data into Training and Testing Sets ---
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition.
print(f"Training on {len(X_train)} samples, testing on {len(X_test)} samples.")

Training on 480 samples, testing on 120 samples.


In [8]:
# --- 11. Train the Model ---
# Fits the whole pipeline (preprocessing steps and regressor) on the
# training split only; the test split is not touched here.
print("\n--- Training the model... ---")
pipeline.fit(X_train, y_train)
print("Model training complete.")


--- Training the model... ---
Model training complete.


In [9]:
# --- 12. Make Predictions on the Test Set ---
# Runs the fitted pipeline on the held-out features; the result is kept in
# `predictions` for the metrics computed in the next cell.
print("\n--- Evaluating the model on the test set... ---")
predictions = pipeline.predict(X_test)


--- Evaluating the model on the test set... ---


In [10]:
# --- 13. Calculate and Print Quantifiable Metrics ---
# Compare the held-out targets with the model's predictions.
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

# RMSE is the square root of the mean squared error, so it is expressed in
# the same unit as the target (views).
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

report_lines = (
    "\n--- Model Performance Metrics ---",
    f"R-squared (R²): {r2:.2f}",
    f"Mean Absolute Error (MAE): {mae:,.0f} views",
    f"Root Mean Squared Error (RMSE): {rmse:,.0f} views",
)
for line in report_lines:
    print(line)


--- Model Performance Metrics ---
R-squared (R²): 0.63
Mean Absolute Error (MAE): 5,192,262 views
Root Mean Squared Error (RMSE): 18,971,531 views
