# Import packages

In [1]:
import sys
import sklearn
import numpy as np
import pandas as pd
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime

# Get the data

## Preparation of ratings data

In [45]:
# Read the ratings CSV and immediately discard every row that has a missing
# value in any column, then print the frame summary (dtypes, non-null counts).
ratings_df = pd.read_csv('data/ratings.csv').dropna()
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63249 entries, 0 to 63249
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Programma  63249 non-null  object
 1   Zender     63249 non-null  object
 2   Datum      63249 non-null  object
 3   Start      63249 non-null  object
 4   Duur       63249 non-null  object
 5   Kijkers    63249 non-null  object
dtypes: object(6)
memory usage: 3.4+ MB


Converting the "Kijkers" column to an integer type does require some extra cleanup:

In [46]:
# Change notation of numbers: the raw strings use '.' as thousands separator and
# ',' as decimal separator. Strip the dots, then turn the comma into a decimal point.
# regex=False keeps both replacements literal regardless of the pandas version's
# default (interpreted as a regex, '.' would match any character).
ratings_df['Kijkers'] = (
    ratings_df['Kijkers']
    .str.replace('.', '', regex=False)
    .str.replace(',', '.', regex=False)
)

# Convert to float first (to handle decimal numbers). Non-numeric values become NaN
ratings_df['Kijkers'] = pd.to_numeric(ratings_df['Kijkers'], errors='coerce')

# Report how many rows could not be parsed, then remove them
print(f"Number of rows with non-numeric values: {ratings_df['Kijkers'].isna().sum()}")
ratings_df = ratings_df.dropna(subset=['Kijkers'])

# Round to the nearest whole number before casting to integer; astype(int) on its
# own truncates toward zero rather than rounding
ratings_df['Kijkers'] = ratings_df['Kijkers'].round().astype(int)

Number of rows with non-numeric values: 1


Splitting the "Datum" column into separate parts: year, month and day of the week

In [47]:
# Parse "Datum" once and derive the calendar features from the parsed values.
# NOTE(review): no explicit format is passed, so pandas infers it from the data;
# confirm the day/month order matches the CSV.
datum = pd.to_datetime(ratings_df["Datum"])

# Replace the "Datum" column with the derived features, appended at the end.
# day_of_week is numeric: Monday=0 ... Sunday=6.
ratings_df = ratings_df.drop(columns="Datum").assign(
    Jaar=datum.dt.year,
    Maand=datum.dt.month,
    Dag=datum.dt.day_of_week,
)

ratings_df.head()

Unnamed: 0,Programma,Zender,Start,Duur,Kijkers,Jaar,Maand,Dag
0,HET 7 UUR-JOURNAAL,EEN,19:00:05,00:31:38,721850,2016,10,5
1,FC DE KAMPIOENEN,EEN,20:41:00,00:38:39,709606,2016,10,5
2,WEG ZIJN WIJ,EEN,20:13:36,00:24:44,548239,2016,10,5
3,IEDEREEN BEROEMD,EEN,19:38:10,00:29:01,523610,2016,10,5
4,COMEDY TOPPERS,VTM,19:52:06,00:24:40,496216,2016,10,5


In [48]:
# Print the frame summary (column dtypes and non-null counts) to check the result
# of the cleanup and feature-splitting steps above
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 63248 entries, 0 to 63249
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Programma  63248 non-null  object
 1   Zender     63248 non-null  object
 2   Start      63248 non-null  object
 3   Duur       63248 non-null  object
 4   Kijkers    63248 non-null  int64 
 5   Jaar       63248 non-null  int32 
 6   Maand      63248 non-null  int32 
 7   Dag        63248 non-null  int32 
dtypes: int32(3), int64(1), object(4)
memory usage: 3.6+ MB
