# Use-case 2: Linking content data to ratings data

---
---

## Preparing environment

In [1]:
import pandas as pd

# Both processed datasets are stored as zip archives, even though the files end in .csv.
ratings_path = 'data/banijay_op1data_ratings_processed.csv'
content_path = 'data/banijay_op1data_content_processed.csv'

df_rating = pd.read_csv(ratings_path, compression='zip')    # ratings data
df_content = pd.read_csv(content_path, compression='zip')   # content (fragment) data

In [2]:
# Remove the 'Unnamed: 0' column, because it was created unintentionally.
# drop(..., errors='ignore') keeps this cell safe to re-run: a plain `del` raises
# KeyError on the second run because the column is already gone.
df_rating = df_rating.drop(columns=['Unnamed: 0'], errors='ignore')
df_rating.head(10)

Unnamed: 0,Datum,Time,Program,Station,Target Group,Broadcast Type,Ratings Type,Kdh%,Kdh000,Zadl%,date_time
0,2020-01-06,22:18:00,op1,npo1,tot6plus,live/prerecorded uitzendingen,uitzenddag,9.676969,1546.863437,30.881672,2020-01-06 22:18:00
1,2020-01-06,22:18:00,op1,npo1,tot6plus,live/prerecorded uitzendingen,uitgesteld,1.484044,237.224411,24.692727,2020-01-06 22:18:00
2,2020-01-06,22:18:00,op1,npo1,tot6plus,live/prerecorded uitzendingen,totaal,10.299407,1646.36021,29.962268,2020-01-06 22:18:00
3,2020-01-06,22:18:00,op1,npo1,boodschapper_20_49,live/prerecorded uitzendingen,uitzenddag,3.217735,117.833443,14.746563,2020-01-06 22:18:00
4,2020-01-06,22:18:00,op1,npo1,boodschapper_20_49,live/prerecorded uitzendingen,uitgesteld,1.146627,41.989484,17.021271,2020-01-06 22:18:00
5,2020-01-06,22:18:00,op1,npo1,boodschapper_20_49,live/prerecorded uitzendingen,totaal,3.975244,145.57342,15.812862,2020-01-06 22:18:00
6,2020-01-06,22:18:00,op1,npo1,boodschapper_25_54,live/prerecorded uitzendingen,uitzenddag,5.294855,212.111904,19.696293,2020-01-06 22:18:00
7,2020-01-06,22:18:00,op1,npo1,boodschapper_25_54,live/prerecorded uitzendingen,uitgesteld,1.662682,66.60703,22.868945,2020-01-06 22:18:00
8,2020-01-06,22:18:00,op1,npo1,boodschapper_25_54,live/prerecorded uitzendingen,totaal,6.271663,251.242825,20.356892,2020-01-06 22:18:00
9,2020-01-06,22:18:00,op1,npo1,6_12_jr,live/prerecorded uitzendingen,uitzenddag,1.025859,13.510569,43.995912,2020-01-06 22:18:00


In [3]:
# Preview the first rows of the content data to check which columns are available for linking.
df_content.head()

Unnamed: 0.1,Unnamed: 0,date,hosts,id,length,start,end,title,summary,keywords,date_time_start,date_time_end,show_id,fragment
0,0,01-02-2021,"['Hilbrand, Sophie', 'Logtenberg, Hugo']",OP1__________-WON02197428_01_segment,00:21:21,22:20:11,22:41:32,"Gerard Smetsers, Annelie Jager en Károly Illy ...","De basisscholen mogen weer open, maar dat bete...","['directeur', 'gesloten', 'basisscholen']",2021-01-02 22:20:11,2021-01-02 22:41:32,OP1__________-WON02197428,1_segment
1,1,01-02-2021,"['Hilbrand, Sophie', 'Logtenberg, Hugo']",OP1__________-WON02197428_02_segment,00:11:04,22:41:36,22:52:41,IC-arts Hugo Touw wil versoepeling van de coro...,Intensivisten pleiten voor versoepeling van de...,"['accepteren', 'waarom', 'coronamaatregelen']",2021-01-02 22:41:36,2021-01-02 22:52:41,OP1__________-WON02197428,2_segment
2,2,01-02-2021,"['Hilbrand, Sophie', 'Logtenberg, Hugo']",OP1__________-WON02197428_03_segment,00:11:15,22:53:04,23:04:20,Feike Sijbesma over de gevolgen van de klimaat...,Volgens ex-coronagezant Feike Sijbesma is de c...,"['snel', 'wereldleiders', 'klimaatcrisis']",2021-01-02 22:53:04,2021-01-02 23:04:20,OP1__________-WON02197428,3_segment
3,3,01-02-2021,"['Hilbrand, Sophie', 'Logtenberg, Hugo']",OP1__________-WON02197428_04_segment,00:11:20,23:04:22,23:15:39,Danny Ghosen en ex-designerdrugsverslaafde Kev...,Steeds vaker komen er meldingen binnen van men...,"['nieuwste', 'combinatie', 'designerdrug']",2021-01-02 23:04:22,2021-01-02 23:15:39,OP1__________-WON02197428,4_segment
4,4,01-02-2022,"['Brink, Tijs van den', 'Ostiana, Giovanca', '...",OP1__________-WON02290378_01_segment,00:06:22,22:38:18,22:44:40,Op1 - De opening van dinsdag 1 februari,"Marc van der Linden, Jeroen Snel en Jesse Klav...","['snel', 'linden', 'nieuws']",2022-01-02 22:38:18,2022-01-02 22:44:40,OP1__________-WON02290378,1_segment


In [4]:
# Remove the stray 'Unnamed: 0' column from the content data.
# drop(..., errors='ignore') keeps this cell safe to re-run: a plain `del` raises
# KeyError on the second run because the column is already gone.
df_content = df_content.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
#Credits to Kian for the code below!

def find_fragment_id(timestamp, content):
    """Return the id of the first fragment in `content` that was on air at `timestamp`.

    A fragment matches when date_time_start <= timestamp <= date_time_end (both
    bounds inclusive). When several fragments match, the first one in row order
    wins. Returns None when no fragment matches.
    """
    on_air = (content['date_time_start'] <= timestamp) & (content['date_time_end'] >= timestamp)
    matching_ids = content.loc[on_air, 'id']
    # The id includes both show and fragment, which is why it is used as the merge key.
    return matching_ids.iloc[0] if len(matching_ids) else None


# NOTE(review): the date columns are read without parse_dates, so this is presumably a
# string comparison; it only orders correctly while both frames use the exact same
# 'YYYY-MM-DD HH:MM:SS' format -- confirm.
# NOTE(review): in the df_content preview, a fragment titled "... dinsdag 1 februari" with
# date 01-02-2022 has date_time_start 2022-01-02, which looks like day and month were
# swapped upstream -- verify before trusting the matches.

# Many rating rows share the same timestamp, so look up each distinct timestamp once
# instead of rescanning df_content for every single rating row.
fragment_id_by_timestamp = {
    timestamp: find_fragment_id(timestamp, df_content)
    for timestamp in df_rating['date_time'].unique()
}

# Rating rows whose timestamp falls outside every fragment end up with a missing id.
df_rating['id'] = df_rating['date_time'].map(fragment_id_by_timestamp)

In [6]:
# Left-join the content columns onto the ratings through the shared 'id' column.
# Rating rows without a matching fragment are kept; their content columns stay empty.
df_merged = df_rating.merge(df_content, how="left", on='id')

In [7]:
# Inspect the merged frame: the non-null counts show how many rating rows received content data.
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1251982 entries, 0 to 1251981
Data columns (total 24 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   Datum            1251982 non-null  object 
 1   Time             1251982 non-null  object 
 2   Program          1251982 non-null  object 
 3   Station          1251982 non-null  object 
 4   Target Group     1251982 non-null  object 
 5   Broadcast Type   1251982 non-null  object 
 6   Ratings Type     1251982 non-null  object 
 7   Kdh%             1251982 non-null  float64
 8   Kdh000           1251982 non-null  float64
 9   Zadl%            1251982 non-null  float64
 10  date_time        1251982 non-null  object 
 11  id               858449 non-null   object 
 12  date             858449 non-null   object 
 13  hosts            858449 non-null   object 
 14  length           858449 non-null   object 
 15  start            858449 non-null   object 
 16  end              8

In [8]:
# Keep only fully populated rows: a row is discarded as soon as any one of its columns is missing.
# This removes the rating rows that have no linked fragment, but also any row with another empty field.
df_merged = df_merged.dropna(axis=0, how='any')

In [9]:
df_merged.info() # Verify the dropna: every column's non-null count should now equal the number of entries.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 858449 entries, 0 to 1235248
Data columns (total 24 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Datum            858449 non-null  object 
 1   Time             858449 non-null  object 
 2   Program          858449 non-null  object 
 3   Station          858449 non-null  object 
 4   Target Group     858449 non-null  object 
 5   Broadcast Type   858449 non-null  object 
 6   Ratings Type     858449 non-null  object 
 7   Kdh%             858449 non-null  float64
 8   Kdh000           858449 non-null  float64
 9   Zadl%            858449 non-null  float64
 10  date_time        858449 non-null  object 
 11  id               858449 non-null  object 
 12  date             858449 non-null  object 
 13  hosts            858449 non-null  object 
 14  length           858449 non-null  object 
 15  start            858449 non-null  object 
 16  end              858449 non-null  obj

In [10]:
# Save the merged dataset (written as a zip archive, even though the file name ends in .csv).
# NOTE(review): the index is written as well (to_csv default), so reloading this file will
# presumably yield an extra unnamed first column -- consider index=False once downstream readers are checked.
merged_output_path = 'data/banijay_op1data_content_ratings.csv'
df_merged.to_csv(merged_output_path, compression='zip')