#### Pandas – Feature Crafting, Aggregation, and Merge

<i>We will enhance our dataset by creating new features, aggregating data, and merging DataFrames.</i>

In [1]:
import pandas as pd

In [3]:
# Toy stand-in for a YouTube-style dataset: each tuple below is one video,
# with fields in the order given by `columns`.

df = pd.DataFrame(
    [
        (1, 'TechTalks', 'AI|ML|Data', 100, 10),
        (2, 'Foodies', 'Recipe|Dessert', 250, 50),
        (3, 'TechTalks', 'Python|Code|Pandas', 300, 20),
        (4, 'TravelNow', 'Vlog|Europe', 150, 10),
    ],
    columns=['video_id', 'channel_title', 'tags', 'likes', 'dislikes'],
)
df

Unnamed: 0,video_id,channel_title,tags,likes,dislikes
0,1,TechTalks,AI|ML|Data,100,10
1,2,Foodies,Recipe|Dessert,250,50
2,3,TechTalks,Python|Code|Pandas,300,20
3,4,TravelNow,Vlog|Europe,150,10


##### Feature Crafting – Tag Count - Create a new column showing the number of tags used in each video.

In [4]:
def calculate_tag_count(tags: str) -> int:
    """Count the tags in a pipe-delimited tag string.

    Args:
        tags: Tags joined by ``'|'``, e.g. ``'AI|ML|Data'``.

    Returns:
        The number of non-blank tags. An empty string, a string made only of
        separators/whitespace, or a non-string value (such as ``None`` or
        ``NaN`` for a missing cell) yields 0.
    """
    # Missing cells arrive as None/NaN rather than str; treat them as "no tags"
    # instead of failing on ``.split``.
    if not isinstance(tags, str):
        return 0
    # ``''.split('|')`` is ``['']``, so blank segments are skipped to avoid
    # reporting one tag for an empty string.
    return sum(1 for tag in tags.split('|') if tag.strip())

# Run the tag-count helper over every tags string and store the result as a new column.
df['tag_count'] = df['tags'].map(calculate_tag_count)
# Show only the columns relevant to this feature.
df.loc[:, ['video_id', 'tags', 'tag_count']]

Unnamed: 0,video_id,tags,tag_count
0,1,AI|ML|Data,3
1,2,Recipe|Dessert,2
2,3,Python|Code|Pandas,3
3,4,Vlog|Europe,2


##### Feature Crafting – Like Ratio (likes / (likes + dislikes))

In [8]:
def like_dislike_avg(data_set: pd.DataFrame) -> list:
    """Compute the like ratio, ``likes / (likes + dislikes)``, for every row.

    Despite the "avg" in the name, the value is a ratio, not an average.

    Args:
        data_set: Frame with numeric ``'likes'`` and ``'dislikes'`` columns.

    Returns:
        A list of floats, one per row in row order. Rows where
        ``likes + dislikes`` equals 0 get ``0.0`` instead of a division result.
    """
    likes = data_set['likes']
    total_votes = likes + data_set['dislikes']
    # Vectorized division yields NaN/inf where the total is zero; those
    # positions are overwritten with 0.0 so every element is a float.
    ratios = likes.div(total_votes).where(total_votes != 0, 0.0)
    return ratios.tolist()

# Attach the per-row like ratio returned by the helper as a new column.
df['like_avg'] = like_dislike_avg(data_set=df)
# Display the inputs next to the derived ratio.
df.loc[:, ['likes', 'dislikes', 'like_avg']]


Unnamed: 0,likes,dislikes,like_avg
0,100,10,0.909091
1,250,50,0.833333
2,300,20,0.9375
3,150,10,0.9375


##### Aggregation – Total Tags Used by Each Channel

In [9]:
# Total tag_count per channel, largest total first.
(
    df.groupby('channel_title')[['tag_count']]
    .sum()
    .sort_values('tag_count', ascending=False)
)

Unnamed: 0_level_0,tag_count
channel_title,Unnamed: 1_level_1
TechTalks,6
Foodies,2
TravelNow,2


##### Merge Example – Customers and Orders

In [10]:
# Sample frames for the merge demo, written as one record per row.
df_customers = pd.DataFrame([
    {'customer_id': 1, 'customer_name': 'Alice'},
    {'customer_id': 2, 'customer_name': 'Bob'},
    {'customer_id': 3, 'customer_name': 'Charlie'},
])

# Customer 1 has two orders and customer 3 has none.
df_orders = pd.DataFrame([
    {'order_id': 101, 'customer_id': 1, 'product': 'Keyboard'},
    {'order_id': 102, 'customer_id': 2, 'product': 'Mouse'},
    {'order_id': 103, 'customer_id': 1, 'product': 'Monitor'},
])

In [11]:
# Inner join on customer_id: only customers with at least one order appear in the result.
merged_df = df_customers.merge(df_orders, how='inner', on='customer_id')

merged_df

Unnamed: 0,customer_id,customer_name,order_id,product
0,1,Alice,101,Keyboard
1,1,Alice,103,Monitor
2,2,Bob,102,Mouse


#### Video Performance Analysis

In [16]:
# Sample datasets
# Video metadata: one tuple per video, fields ordered as in `columns`.
df_videos = pd.DataFrame(
    [
        (101, 'AI Revolution', 'AI|Future|Tech', 1),
        (102, 'Cooking Pasta', 'Food|Recipe|Dinner', 2),
        (103, 'Python Tips', 'Python|Code|Tips', 1),
        (104, 'Travel Vlog', 'Travel|Europe|Vlog', 3),
    ],
    columns=['video_id', 'title', 'tags', 'channel_id'],
)

# Engagement metrics, keyed by the same video_id values as the video metadata.
df_engagement = pd.DataFrame(
    data=dict(
        video_id=[101, 102, 103, 104],
        likes=[250, 180, 300, 220],
        dislikes=[30, 10, 40, 5],
        views=[5000, 4000, 6000, 3000],
        duration_min=[10, 12, 8, 15],
    )
)

# Channel lookup table: (channel_id, channel_name) pairs.
df_channels = pd.DataFrame(
    [(1, 'TechWorld'), (2, 'TastyChannel'), (3, 'Wanderlust')],
    columns=['channel_id', 'channel_name'],
)

##### Feature Engineering - Create a column: like_ratio = likes / (likes + dislikes)

In [19]:
def compute_like_ratio(row):
    """Return likes / (likes + dislikes) for a single row.

    `row` is indexed by the keys 'likes' and 'dislikes'. When the combined
    vote count is not positive, 0 is returned instead of dividing.
    """
    likes = row['likes']
    votes = likes + row['dislikes']
    if votes > 0:
        return likes / votes
    return 0

# Apply the helper once per row (axis='columns' hands each row to the function).
df_engagement['like_ratio'] = df_engagement.apply(func=compute_like_ratio, axis='columns')

##### Create a tag count column from df_videos

In [21]:
# Count tags with the vectorized string accessor instead of a per-row Python
# lambda: split each tags string on '|' and take the length of the resulting list.
# A missing tags value produces a missing count rather than raising.
df_videos['tag_count'] = df_videos['tags'].str.split('|').str.len()
df_videos

Unnamed: 0,video_id,title,tags,channel_id,tag_count
0,101,AI Revolution,AI|Future|Tech,1,3
1,102,Cooking Pasta,Food|Recipe|Dinner,2,3
2,103,Python Tips,Python|Code|Tips,1,3
3,104,Travel Vlog,Travel|Europe|Vlog,3,3


##### Merge DataFrames

In [24]:
# Join video metadata to its engagement metrics, then attach the channel name.
# Both merges use the default inner join.
df_merged = (
    df_videos
    .merge(df_engagement, on='video_id')
    .merge(df_channels, on='channel_id')
)
df_merged

Unnamed: 0,video_id,title,tags,channel_id,tag_count,likes,dislikes,views,duration_min,like_ratio,channel_name
0,101,AI Revolution,AI|Future|Tech,1,3,250,30,5000,10,0.892857,TechWorld
1,102,Cooking Pasta,Food|Recipe|Dinner,2,3,180,10,4000,12,0.947368,TastyChannel
2,103,Python Tips,Python|Code|Tips,1,3,300,40,6000,8,0.882353,TechWorld
3,104,Travel Vlog,Travel|Europe|Vlog,3,3,220,5,3000,15,0.977778,Wanderlust


##### Aggregation – Channel-Level Stats

In [26]:
# Per-channel summary: mean like ratio, total tag count, and total duration,
# ordered from the highest average like ratio to the lowest.
channel_stats = (
    df_merged.groupby('channel_name')
    .agg(
        avg_like_ratio=('like_ratio', 'mean'),
        total_tags=('tag_count', 'sum'),
        total_duration=('duration_min', 'sum'),
    )
    .sort_values('avg_like_ratio', ascending=False)
)
channel_stats

Unnamed: 0_level_0,avg_like_ratio,total_tags,total_duration
channel_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Wanderlust,0.977778,3,15
TastyChannel,0.947368,3,12
TechWorld,0.887605,6,18


##### Advanced Filtering – High-Quality, Short Videos - Select videos with: like_ratio > 0.85, duration < 12 minutes and more than 2 tags

In [28]:
# Keep rows that pass all three thresholds, restrict to the reporting columns,
# and order by like_ratio from best to worst.
high_quality = (
    df_merged
    .loc[
        lambda frame: (frame['like_ratio'] > 0.85)
        & (frame['duration_min'] < 12)
        & (frame['tag_count'] > 2),
        ['title', 'channel_name', 'like_ratio', 'duration_min', 'tag_count'],
    ]
    .sort_values('like_ratio', ascending=False)
)
high_quality

Unnamed: 0,title,channel_name,like_ratio,duration_min,tag_count
0,AI Revolution,TechWorld,0.892857,10,3
2,Python Tips,TechWorld,0.882353,8,3
