In [1]:
import pandas as pd
import numpy as np

# Read the TikTok video performance dataset from disk into a DataFrame.
file_path = 'test_features.csv'
tiktok_data = pd.read_csv(filepath_or_buffer=file_path)

# ========================= Basic Operations ========================= #

# 1. Summary statistics: describe() reports, per numeric column, the count,
# mean, standard deviation, min, quartiles (the 50% row is the median) and max.
summary_statistics = tiktok_data.describe()
print("Summary Statistics:\n", summary_statistics)

# 2. Adding new column: Engagement Rate based on (Comments + Shares) / Views
# A row with zero views would divide by zero and produce inf (or NaN for 0/0);
# a single inf makes any mean taken over this column infinite. Mask zero views
# to NaN first so those rows get a missing rate, which mean() skips by default.
tiktok_data['Engagement_Rate'] = (
    (tiktok_data['Comments'] + tiktok_data['Shares'])
    / tiktok_data['Views'].where(tiktok_data['Views'] != 0)
)
print("\nData with Engagement Rate:\n", tiktok_data[['Comments', 'Shares', 'Views', 'Engagement_Rate']].head())

# ========================= Intermediate Operations ========================= #

# 3. Normalization: Normalizing 'Views' and 'User_Followers' columns using Min-Max scaling
def _min_max_scale(column: pd.Series) -> pd.Series:
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        column: Numeric Series to rescale. Missing values stay missing.

    Returns:
        A new Series with the same index as ``column``. When every value in
        ``column`` is identical the range is zero; the result is then all 0.0
        instead of the NaN that a 0/0 division would produce.
    """
    low = column.min()
    span = column.max() - low
    if span == 0:
        # Constant column: divide by 1 so (column - low) stays 0.0 rather than 0/0 = NaN.
        span = 1
    return (column - low) / span


tiktok_data['Views_Norm'] = _min_max_scale(tiktok_data['Views'])
tiktok_data['User_Followers_Norm'] = _min_max_scale(tiktok_data['User_Followers'])
print("\nNormalized Views and User Followers:\n", tiktok_data[['Views', 'Views_Norm', 'User_Followers', 'User_Followers_Norm']].head())

# 4. Correlation Matrix: Analyze correlation among numerical columns
# Select the numeric columns explicitly: since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns on its own and raises if any are present.
correlation_matrix = tiktok_data.select_dtypes(include='number').corr()
print("\nCorrelation Matrix:\n", correlation_matrix)

# ========================= Advanced Operations ========================= #

# 5. Aggregating Data: Group by 'Video_Length' and summarize (e.g., mean of views, comments, etc.)
# If 'Video_Length' holds a single distinct value there is nothing to group by,
# so fall back to the overall column means.
# nunique(dropna=False) counts NaN as a value, matching len(unique()).
# numeric_only=True keeps mean() from raising on non-numeric columns
# (pandas >= 2.0 no longer drops them silently).
if tiktok_data['Video_Length'].nunique(dropna=False) > 1:
    aggregated_data = tiktok_data.groupby('Video_Length').mean(numeric_only=True)
else:
    aggregated_data = tiktok_data.mean(numeric_only=True)
print("\nAggregated Data by Video Length:\n", aggregated_data)

# 6. Log Transformation: store log(1 + Views) in a new 'Log_Views' column.
# log1p is used instead of log so that a view count of 0 maps to 0, not -inf.
tiktok_data['Log_Views'] = tiktok_data['Views'].pipe(np.log1p)
print("\nData with Log Transformed Views:\n", tiktok_data[['Views', 'Log_Views']].head())

# 7. Advanced filtering and sorting: keep the videos whose engagement rate is
# strictly above the dataset-wide mean, then order them from most to fewest views.
high_engagement_videos = tiktok_data.loc[
    tiktok_data['Engagement_Rate'].gt(tiktok_data['Engagement_Rate'].mean())
]
sorted_videos = high_engagement_videos.sort_values('Views', ascending=False)
print("\nHigh Engagement Videos sorted by Views:\n", sorted_videos[['Engagement_Rate', 'Views']].head())


Summary Statistics:
          Comments      Shares         Views  Video_Length  User_Followers  \
count    2.000000    2.000000      2.000000           2.0        2.000000   
mean   190.000000  305.000000  60000.000000          45.0     1750.000000   
std     14.142136  134.350288  14142.135624           0.0      353.553391   
min    180.000000  210.000000  50000.000000          45.0     1500.000000   
25%    185.000000  257.500000  55000.000000          45.0     1625.000000   
50%    190.000000  305.000000  60000.000000          45.0     1750.000000   
75%    195.000000  352.500000  65000.000000          45.0     1875.000000   
max    200.000000  400.000000  70000.000000          45.0     2000.000000   

       User_Following   User_Likes  
count        2.000000     2.000000  
mean       425.000000  5000.000000  
std        106.066017  1414.213562  
min        350.000000  4000.000000  
25%        387.500000  4500.000000  
50%        425.000000  5000.000000  
75%        462.500000  550