In [None]:
# Netflix User Behavior Analysis - EDA & Processing

from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [None]:
# 1. Load Data
# ----------------------------
# Read the raw viewing log; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='netflix_viewing_data.csv')

# Quick look at what was loaded: (rows, columns), then the first five records.
print("Shape:", df.shape)
print(df.head(n=5))


Shape: (2131, 16)
  UserID   SessionID TitleID                Title        Genres  IsSeries  \
0  U0001  U0001_S001   T0143     Horror Title 143        Horror      True   
1  U0001  U0001_S002   T0277    Romance Title 277       Romance     False   
2  U0001  U0001_S003   T0053      Sci-Fi Title 53        Sci-Fi      True   
3  U0001  U0001_S003   T0282  Animation Title 282     Animation      True   
4  U0001  U0001_S003   T0299     Horror Title 299  Horror|Drama      True   

   Season  EpisodeNumber  FullContentDurationSeconds           WatchStart  \
0     2.0            3.0                        2460  2025-06-16 06:48:32   
1     NaN            NaN                        6720  2025-08-02 22:34:19   
2     4.0            1.0                        2880  2025-04-01 11:09:43   
3     3.0            7.0                        1080  2025-04-01 11:24:16   
4     1.0            4.0                        1800  2025-04-01 11:39:29   

              WatchEnd  WatchDurationSeconds EventType  

In [None]:
# 2. Basic Data Audit
# ----------------------------
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() emits a stray "None" line.
df.info()

# Summary statistics for every column (numeric and non-numeric alike).
print(df.describe(include='all'))

# Missing values
print("Missing values per column:\n", df.isna().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2131 entries, 0 to 2130
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   UserID                      2131 non-null   object 
 1   SessionID                   2131 non-null   object 
 2   TitleID                     2131 non-null   object 
 3   Title                       2131 non-null   object 
 4   Genres                      2131 non-null   object 
 5   IsSeries                    2131 non-null   bool   
 6   Season                      1268 non-null   float64
 7   EpisodeNumber               1268 non-null   float64
 8   FullContentDurationSeconds  2131 non-null   int64  
 9   WatchStart                  2131 non-null   object 
 10  WatchEnd                    2131 non-null   object 
 11  WatchDurationSeconds        2131 non-null   int64  
 12  EventType                   2131 non-null   object 
 13  Device                      2131 

In [None]:
# 3. Cleaning
# ----------------------------
# Remove rows that are exact duplicates of an earlier row (mutates df in place).
df.drop_duplicates(inplace=True)

# Genre labels: trim surrounding whitespace, then title-case the text.
df['Genres'] = df['Genres'].str.strip().str.title()

# Parse both timestamp columns into pandas datetimes.
for ts_col in ('WatchStart', 'WatchEnd'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Elapsed time between WatchStart and WatchEnd, expressed in minutes.
df['WatchDuration'] = df['WatchEnd'].sub(df['WatchStart']).dt.total_seconds().div(60)



In [None]:
# 4. Feature Engineering
# ----------------------------
# Watch hours (WatchDurationSeconds / 3600)
df['WatchHours'] = df['WatchDurationSeconds'] / 3600

# Binge detection: flag every row that belongs to a (user, calendar day) pair
# with more than 2 viewing records. The day is derived from WatchStart and
# used only as a grouping key, so no extra column is added to df.
# (The previous key was the per-row WatchDuration in minutes, which only
# matched rows that happened to share an identical duration.)
# pd.to_datetime leaves an already-parsed datetime column unchanged.
# 'count' tallies rows with a non-null Title, not distinct titles.
watch_day = pd.to_datetime(df['WatchStart']).dt.normalize().rename('WatchDay')
df['BingeFlag'] = df.groupby(['UserID', watch_day])['Title'].transform('count') > 2

# Average rating per genre
# NOTE(review): assumes df has a 'Rating' column -- confirm against the CSV schema.
# Grouping is on the full Genres string, so a combined label such as
# 'Horror|Drama' forms its own group rather than counting toward each genre.
genre_rating = df.groupby('Genres')['Rating'].mean().reset_index()

# Total watch hours per genre
genre_hours = df.groupby('Genres')['WatchHours'].sum().reset_index()


In [None]:
# 5. Visualization Examples
# ----------------------------
# Bar chart: genres ordered from most to least total watch hours.
fig1 = px.bar(
    data_frame=genre_hours.sort_values(by='WatchHours', ascending=False),
    x='Genres',
    y='WatchHours',
    title='Total Watch Hours by Genre',
)
fig1.show()

# Scatter plot: one point per row of df, rating against hours watched,
# coloured by genre label.
fig2 = px.scatter(
    data_frame=df,
    x='WatchHours',
    y='Rating',
    color='Genres',
    title='Ratings vs Watch Hours',
)
fig2.show()


In [None]:

# 6. User Segmentation Example (KMeans)
# ----------------------------
# One row per user: total hours watched, mean rating, and the share of the
# user's rows flagged as binge (mean of the boolean BingeFlag).
user_stats = df.groupby('UserID').agg({
    'WatchHours': 'sum',
    'Rating': 'mean',
    'BingeFlag': 'mean'
}).reset_index()

# Fill NaN ratings with 0
user_stats['Rating'] = user_stats['Rating'].fillna(0)

# KMeans uses Euclidean distance, so features on larger numeric scales
# dominate the clustering. Standardize each feature (zero mean, unit variance)
# before fitting so all three contribute comparably. Only the array passed to
# KMeans is scaled; user_stats keeps the original units for plotting and export.
segment_features = ['WatchHours', 'Rating', 'BingeFlag']
scaled_features = StandardScaler().fit_transform(user_stats[segment_features])

# random_state pins the initialisation so cluster labels are reproducible.
# Note: kmeans.cluster_centers_ are expressed in standardized units.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
user_stats['Cluster'] = kmeans.fit_predict(scaled_features)

# Cluster labels are cast to str so plotly treats them as discrete categories.
fig3 = px.scatter_3d(user_stats, x='WatchHours', y='Rating', z='BingeFlag',
                     color=user_stats['Cluster'].astype(str),
                     title='User Segments')
fig3.show()


In [None]:
# ----------------------------
# 7. Save Processed Data
# ----------------------------
from pathlib import Path

# Create the output directory (and any missing parents); no error if it exists.
Path('data/processed').mkdir(parents=True, exist_ok=True)

# Write each frame to its CSV target without the index column.
for frame, target in (
    (df, 'data/processed/netflix_viewing_data_clean.csv'),
    (user_stats, 'data/processed/user_segments.csv'),
):
    frame.to_csv(target, index=False)

print("Processing complete. Cleaned data and user segments saved.")


Processing complete. Cleaned data and user segments saved.
