In [None]:
import pandas as pd     # provides high-performance, easy to use structures and data analysis tools
import numpy as np      # provides fast mathematical computation on arrays and matrices
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Colab only: mount Google Drive so its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the stc TV data set (CSV) into a DataFrame.
# NOTE(review): the path is relative to the kernel's working directory, not to
# the /content/drive mount point — confirm the file is present there on a fresh runtime.
dataframe = pd.read_csv("stc TV Data Set_T1.csv")

In [None]:
# (rows, columns) of the loaded frame.
# NOTE(review): the recorded row count of 1,048,575 equals Excel's worksheet
# row limit minus one header row, so the CSV may have been truncated by a
# spreadsheet export — confirm against the original source.
dataframe.shape

(1048575, 13)

In [None]:
# Preview the top five records to get a feel for the columns and their values.
dataframe.head(n=5)

Unnamed: 0,Column1,date_,user_id_maped,program_name,duration_seconds,program_class,season,episode,program_desc,program_genre,series_title,hd,original_name
0,1,5/27/2017,26138,100 treets,40,MOVIE,0,0,Drama Movie100 Streets,Drama,0,0,100 treets
1,3,5/21/2017,7946,Moana,17,MOVIE,0,0,Animation MovieMoana (HD),Animation,0,1,Moana
2,4,8/10/2017,7418,The Mermaid Princess,8,MOVIE,0,0,Animation MovieThe Mermaid Princess (HD),Animation,0,1,The Mermaid Princess
3,5,7/26/2017,19307,The Mermaid Princess,76,MOVIE,0,0,Animation MovieThe Mermaid Princess (HD),Animation,0,1,The Mermaid Princess
4,7,7/7/2017,15860,Churchill,87,MOVIE,0,0,Biography MovieChurchill (HD),Biography,0,1,Churchill


In [None]:
# Print each column's name, non-null count and dtype, plus overall memory usage.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 13 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   Column1           1048575 non-null  int64 
 1   date_             1048575 non-null  object
 2   user_id_maped     1048575 non-null  int64 
 3   program_name      1048575 non-null  object
 4   duration_seconds  1048575 non-null  int64 
 5   program_class     1048575 non-null  object
 6   season            1048575 non-null  int64 
 7   episode           1048575 non-null  int64 
 8   program_desc      1034537 non-null  object
 9   program_genre     1048575 non-null  object
 10  series_title      1048575 non-null  int64 
 11  hd                1048575 non-null  int64 
 12  original_name     1048575 non-null  object
dtypes: int64(7), object(6)
memory usage: 104.0+ MB


In [None]:
# Summary statistics for the numeric columns.
# Column1 (a row index) and user_id_maped (an identifier) are included only
# because they are stored as integers; their mean/std carry no meaning.
dataframe.describe()

Unnamed: 0,Column1,user_id_maped,duration_seconds,season,episode,series_title,hd
count,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0,1048575.0
mean,628173.0,17092.66,1230.957,1.342139,6.157952,0.01205922,0.3862728
std,359704.1,10035.13,6821.058,2.104095,12.22015,0.1091504,0.4868946
min,1.0,1.0,2.0,0.0,0.0,0.0,0.0
25%,318066.0,8253.0,52.0,0.0,0.0,0.0,0.0
50%,630355.0,17149.0,119.0,1.0,1.0,0.0,0.0
75%,939822.5,25665.0,1328.0,1.0,9.0,0.0,1.0
max,1247852.0,34280.0,1461329.0,23.0,282.0,1.0,1.0


In [None]:
# Per column: True when at least one value is missing.
dataframe.isna().any()

Unnamed: 0,0
Column1,False
date_,False
user_id_maped,False
program_name,False
duration_seconds,False
program_class,False
season,False
episode,False
program_desc,True
program_genre,False


In [None]:
# Data preprocessing on the input data.
#
# This cell replaces two cells: the second was a verbatim copy of the type
# conversions below and has been folded in here.
#
# Steps:
#   1. Drop the exported row-index column.
#   2. Trim surrounding whitespace from program names so the same title is not
#      split across several groups by stray spaces.
#   3. Make sure the numeric columns hold numeric dtypes.
#   4. Store the identifier/text columns as Python strings while keeping
#      missing values missing.
NUMERIC_COLUMNS = ['duration_seconds', 'season', 'episode', 'series_title', 'hd']
TEXT_COLUMNS = ['user_id_maped', 'program_name', 'program_class',
                'program_desc', 'program_genre', 'original_name']

# errors='ignore' keeps the cell re-runnable: on a second run 'Column1' is
# already gone and a plain drop would raise KeyError.
dataframe = dataframe.drop(columns=['Column1'], errors='ignore')

dataframe['program_name'] = dataframe['program_name'].str.strip()

dataframe[NUMERIC_COLUMNS] = dataframe[NUMERIC_COLUMNS].apply(pd.to_numeric)

# A bare astype(str) would turn every NaN in program_desc into the literal
# text "nan", hiding the missing descriptions from isnull()/dropna().
# where(...notna()) restores NaN at the positions that were originally missing.
text_values = dataframe[TEXT_COLUMNS]
dataframe[TEXT_COLUMNS] = text_values.astype(str).where(text_values.notna())

In [None]:
# Restrict the log to rows classified as movies.
movies = dataframe.loc[dataframe["program_class"] == "MOVIE"]

# Build one summary row per movie title.
movies_summary = (
    movies
    .groupby("program_name")
    .agg(
        # number of views (every row is one viewing record)
        total_views=("user_id_maped", "count"),
        # number of distinct users
        unique_users=("user_id_maped", "nunique"),
        # total watch time, summed from duration_seconds
        total_watch_time=("duration_seconds", "sum"),
    )
    .reset_index()
)

# Ten titles with the most viewing records, highest first.
top_movies = (
    movies_summary
    .sort_values(by="total_views", ascending=False)
    .head(10)
)

print("أكثر 10 أفلام مشاهدة:")
top_movies

أكثر 10 أفلام مشاهدة:


Unnamed: 0,program_name,total_views,unique_users,total_watch_time
1107,The Boss Baby,24047,3389,10660863
1394,Trolls,13793,2613,5763683
1090,The Adventures of Petey and Friends,9612,1686,1068645
731,Moana,8081,2173,6142237
1250,The Mermaid Princess,7525,2182,1811200
150,Baywatch,7436,2062,1976384
375,Ferdinand,6817,1278,2571203
85,An Inconvenient equel: Truth to Power,6342,961,629668
256,Collateral Beauty,6212,1726,1438360
551,Inside,6134,1608,1059969


In [None]:
# Restrict the log to rows classified as series episodes.
series = dataframe.loc[dataframe["program_class"] == "SERIES/EPISODES"]

# Build one summary row per program name: view count, distinct users and
# summed watch time.
series_summary = (
    series
    .groupby("program_name")
    .agg(
        total_views=("user_id_maped", "count"),
        unique_users=("user_id_maped", "nunique"),
        total_watch_time=("duration_seconds", "sum"),
    )
    .reset_index()
)

# Rank by number of viewing records and keep the ten most watched.
top_series = (
    series_summary
    .sort_values(by="total_views", ascending=False)
    .head(10)
)
top_series


Unnamed: 0,program_name,total_views,unique_users,total_watch_time
4536,Pingu,17063,306,5698560
4434,Oscar's Oasis,7174,180,3308269
4330,Nan and Lili Ep.,5038,124,1013298
4084,Me and My iblings Ep.,4581,67,3256001
5327,Taymour Ep.,3390,100,2982733
3431,In the Night Garden,3304,160,3782465
5094,Sonic Underground,3264,347,3385820
1243,Caramel (T) Chapter,3206,155,6590849
5466,Teletubbies,3072,147,4022367
59,24 :00 AM - :00 AM,2858,155,8422188


In [None]:
# Number of viewing records per program class.
dataframe["program_class"].value_counts()

Unnamed: 0_level_0,count
program_class,Unnamed: 1_level_1
SERIES/EPISODES,560174
MOVIE,488401


In [None]:
# Replace the numeric hd flag with readable labels (0 -> "SD", 1 -> "HD").
# The label-to-label entries make the cell safe to re-run: with only the
# {0, 1} keys, a second execution would find no match for "SD"/"HD" and
# silently turn the whole column into NaN.
HD_LABELS = {0: "SD", 1: "HD", "SD": "SD", "HD": "HD"}
dataframe["hd"] = dataframe["hd"].map(HD_LABELS)

# Number of viewing records per quality label.
dataframe["hd"].value_counts()

Unnamed: 0_level_0,count
hd,Unnamed: 1_level_1
SD,643539
HD,405036


In [None]:
# Per-program-class totals that feed both pie charts.
# `grouped` was not defined anywhere in the notebook, so this cell failed with
# NameError on a fresh kernel.
# NOTE(review): this aggregation is reconstructed from the column names the
# charts expect — it assumes hours = sum(duration_seconds) / 3600 and
# users = distinct user_id_maped per class. Confirm against the intended definition.
SECONDS_PER_HOUR = 3600

grouped = (
    dataframe
    .groupby("program_class")
    .agg(
        total_watch_time_seconds=("duration_seconds", "sum"),
        no_of_users=("user_id_maped", "nunique"),
    )
    .reset_index()
)
grouped["total_watch_time_hours"] = (
    grouped["total_watch_time_seconds"] / SECONDS_PER_HOUR
)

# Pie chart: share of total watch time (hours) per program class.
total_watch_time = px.pie(
    grouped,
    values='total_watch_time_hours',
    names='program_class',
    hover_data=['program_class'],
    title='Total Duration Spent by Program Class (Hours)'
)

# Pie chart: share of distinct users per program class.
number_of_users = px.pie(
    grouped,
    values='no_of_users',
    names='program_class',
    hover_data=['program_class'],
    title='Total Users Watching by Program Class'
)

# Keep the slices in the data's row order rather than sorting them by size,
# so both charts list the classes in the same order.
total_watch_time.update_traces(sort=False)
number_of_users.update_traces(sort=False)

total_watch_time.show()
number_of_users.show()