In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

# Use the ggplot style sheet for any matplotlib figures drawn in this notebook.
plt.style.use('ggplot')

In [3]:
# Load the raw watch-history CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/watch_history.csv')

In [4]:
# Display the loaded frame (the notebook renders a truncated head/tail view).
df

Unnamed: 0,watch_id,user_id,movie_id,watch_date,watch_duration,device_type
0,W20000,15013,2027,2024-11-01,115.0,Laptop
1,W20001,15008,2393,2024-11-01,135.0,Mobile
2,W20002,15015,437,2024-11-01,106.0,Laptop
3,W20003,15006,279,2024-11-01,9.0,Smart TV
4,W20004,15004,1137,2024-11-01,99.0,Mobile
...,...,...,...,...,...,...
98918,W118918,16649,2247,2025-03-14,113.0,Laptop
98919,W118919,16036,238,2025-03-14,106.0,Mobile
98920,W118920,16508,2835,2025-03-14,6.0,Laptop
98921,W118921,16847,2418,2025-03-14,9.0,Laptop


In [5]:
# (rows, columns) of the loaded data.
df.shape

(98923, 6)

In [8]:
# Column dtypes. Note that watch_date is read as a generic object column, not as a datetime.
df.dtypes

watch_id           object
user_id             int64
movie_id            int64
watch_date         object
watch_duration    float64
device_type        object
dtype: object

In [9]:
# Count the missing values in each column.
df.isna().sum()

watch_id            0
user_id             0
movie_id            0
watch_date          0
watch_duration    136
device_type       197
dtype: int64

In [10]:
# A missing watch duration is plausible: the user may have opened a title and
# not actually watched it. Treat those records as 0 minutes watched.
df = df.fillna({'watch_duration': 0})

In [11]:
# Re-check the missing counts after filling watch_duration.
df.isna().sum()

watch_id            0
user_id             0
movie_id            0
watch_date          0
watch_duration      0
device_type       197
dtype: int64

In [13]:
# Impute missing device types with the most common device.
# mode() can return several values when there is a tie; [0] takes the first one.
most_common_device = df['device_type'].mode()[0]
df['device_type'] = df['device_type'].fillna(most_common_device)

In [14]:
# Re-check the missing counts after filling device_type; every column should now be 0.
df.isna().sum()

watch_id          0
user_id           0
movie_id          0
watch_date        0
watch_duration    0
device_type       0
dtype: int64

In [16]:
# Payments had negative amounts; check whether watch_duration has negative values too.
df[df['watch_duration'] < 0]

Unnamed: 0,watch_id,user_id,movie_id,watch_date,watch_duration,device_type
121,W20121,15027,260,2024-11-03,-1.0,Mobile
552,W20552,15023,2229,2024-11-09,-1.0,Mobile
974,W20974,15006,921,2024-11-11,-1.0,Smart TV
1184,W21184,15100,1771,2024-11-13,-1.0,Smart TV
1321,W21321,15211,1003,2024-11-14,-1800.0,Mobile
...,...,...,...,...,...,...
98159,W118159,15577,3275,2025-03-11,-15.0,Laptop
98187,W118187,16604,965,2025-03-12,-1.0,Laptop
98212,W118212,15833,1784,2025-03-12,-1800.0,Laptop
98601,W118601,15603,2352,2025-03-13,-15.0,Mobile


In [17]:
# Negative durations do exist, so replace them with their magnitude.
# Series.abs() is the vectorized equivalent of the per-row lambda: non-negative
# values are unchanged and missing values stay missing.
# NOTE(review): values such as -1800.0 become 1800.0, which may not be a realistic
# duration -- confirm that taking the absolute value is the intended correction.
df['watch_duration'] = df['watch_duration'].abs()

In [20]:
# Verify that no negative durations remain (an empty frame is expected).
df[df['watch_duration'] < 0]

Unnamed: 0,watch_id,user_id,movie_id,watch_date,watch_duration,device_type


In [22]:
# Inspect the watch records of user 15027, one of the users that had a negative duration.
df[df['user_id'] == 15027]

Unnamed: 0,watch_id,user_id,movie_id,watch_date,watch_duration,device_type
45,W20045,15027,118,2024-11-02,138.0,Smart TV
88,W20088,15027,3160,2024-11-03,101.0,Laptop
121,W20121,15027,260,2024-11-03,1.0,Mobile
407,W20407,15027,3101,2024-11-08,98.0,Laptop
469,W20469,15027,2418,2024-11-08,119.0,Laptop
...,...,...,...,...,...,...
95576,W115576,15027,2040,2025-03-08,191.0,Smart TV
95983,W115983,15027,194,2025-03-08,104.0,Laptop
96380,W116380,15027,955,2025-03-09,83.0,Laptop
96708,W116708,15027,3248,2025-03-09,83.0,Mobile


In [24]:
# As indicated by the PDF, some user ids in the watch history are abnormal.
# Load the cleaned users table; its user_id column is used to validate the watch records.
df2 = pd.read_csv('../cleaned/users.csv')

In [25]:
# Unique user ids present in the cleaned users table.
valid_set = set(df2['user_id'].tolist())


In [26]:
# Keep only the watch records whose user_id appears in the cleaned users table.
filtered = df.loc[df['user_id'].isin(valid_set)]

In [28]:
# Row counts before (o) and after (f) the user_id filter, and the number of
# watch records removed because their user_id is not in valid_set.
o = len(df)
f = len(filtered)
drop = o - f
# Show the number of removed rows so the count is visible in the notebook output.
drop

In [29]:
# Display the filtered frame (rows with a user_id found in valid_set).
filtered

Unnamed: 0,watch_id,user_id,movie_id,watch_date,watch_duration,device_type
0,W20000,15013,2027,2024-11-01,115.0,Laptop
1,W20001,15008,2393,2024-11-01,135.0,Mobile
2,W20002,15015,437,2024-11-01,106.0,Laptop
3,W20003,15006,279,2024-11-01,9.0,Smart TV
4,W20004,15004,1137,2024-11-01,99.0,Mobile
...,...,...,...,...,...,...
98918,W118918,16649,2247,2025-03-14,113.0,Laptop
98919,W118919,16036,238,2025-03-14,106.0,Mobile
98920,W118920,16508,2835,2025-03-14,6.0,Laptop
98921,W118921,16847,2418,2025-03-14,9.0,Laptop


In [30]:
# So around 100 watch records had invalid user_ids (not found in the cleaned users data) and were filtered out.

In [31]:
# Save the filtered watch history as CSV without the index column.
# NOTE(review): this writes to the notebook's working directory, whereas the cleaned
# users file is read from '../cleaned/' -- confirm the intended output location.
filtered.to_csv('watch_history.csv', index=False)