# Preprocess Data

In [8]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import ast

## Load datasets

In [3]:
# Read the three raw CSV tables. The user and video files store their primary
# key in a column called 'id'; it is renamed on load to 'user_id' / 'video_id'
# so each table carries an unambiguous key name.
inter_df = pd.read_csv('../data/interacciones2.csv')
user_df = pd.read_csv('../data/usuarios.csv').rename(columns={'id': 'user_id'})
video_df = pd.read_csv('../data/videos.csv').rename(columns={'id': 'video_id'})


In [4]:

# Inspect the user table's column names (the key column should read 'user_id' after the rename).
user_df.columns

Index(['user_id', 'name', 'edad', 'danceStyles', 'categorias_preferidas'], dtype='object')

In [5]:
# Assemble a single wide table starting from the interactions: each left join
# preserves every interaction row and attaches the matching user columns
# (keyed on 'user_id'), then the matching video columns (keyed on 'video_id').
df = (
    inter_df
    .merge(user_df, how='left', on='user_id')
    .merge(video_df, how='left', on='video_id')
)

In [6]:
# Preview the last 10 rows of the merged frame.
df.tail(10)

Unnamed: 0,id,user_id,video_id,like,comentario,watchtime,dont_suggest,name,edad,danceStyles,categorias_preferidas,videoTitle,etiquetas,categoria,estilo
100004,100004,1001,189,1,Genial,120.0,0,Manolo,35,"['flamenco', 'salsa']",['duet'],Someone great much student,"['pareja', 'coreografia', 'nivel_avanzado']",duet,"['salsa', 'flamenco']"
100005,100005,1001,248,0,,5.0,0,Manolo,35,"['flamenco', 'salsa']",['duet'],Movement,"['energia', 'creatividad']",freestyle,"['jazz', 'contemporary']"
100006,100006,1001,268,1,,35.0,0,Manolo,35,"['flamenco', 'salsa']",['duet'],Order,"['pareja', 'coreografia', 'nivel_avanzado']",freestyle,['contemporary']
100007,100007,1001,440,1,Me encanta mucho,130.0,0,Manolo,35,"['flamenco', 'salsa']",['duet'],Pay yes as,"['creatividad', 'solo', 'pareja', 'nivel_facil...",duet,['flamenco']
100008,100008,1002,27,1,Good Content,60.0,0,Emo,27,"['hiphop', 'breakdance']","['freestyle', 'battle', 'challenge']",Center owner,"['nivel_avanzado', 'nivel_facil', 'creatividad...",duet,"['breakdance', 'hiphop']"
100009,100009,1002,43,1,Keep up,80.0,0,Emo,27,"['hiphop', 'breakdance']","['freestyle', 'battle', 'challenge']",Thus between whether onto effect,"['nivel_avanzado', 'ritmo', 'grupo', 'nivel_fa...",freestyle,"['hiphop', 'contemporary']"
100010,100010,1002,68,0,,3.0,0,Emo,27,"['hiphop', 'breakdance']","['freestyle', 'battle', 'challenge']",Religious air,"['pareja', 'grupo']",performance,['flamenco']
100011,100011,1002,58,0,Cringe,10.0,1,Emo,27,"['hiphop', 'breakdance']","['freestyle', 'battle', 'challenge']",Some strategy act,"['coreografia', 'grupo', 'energia', 'nivel_fac...",performance,['flamenco']
100012,100012,1002,421,1,Good stuff,50.0,0,Emo,27,"['hiphop', 'breakdance']","['freestyle', 'battle', 'challenge']",Defense free,"['nivel_medio', 'creatividad', 'solo', 'grupo'...",freestyle,['breakdance']
100013,100013,1000,105,1,Me gusta!,80.0,0,Walter White,50,['jazz'],"['tutorial', 'challenge']",Home agree,"['creatividad', 'grupo', 'nivel_facil', 'nivel...",challenge,['jazz']


In [9]:
# Parse the stringified 'danceStyles' column into real Python lists.
# This is only the parsing step that multi-label (one-hot style) encoding needs
# as input; the encoding itself is not performed in this cell.


def _parse_list_literal(value):
    """Turn one 'danceStyles' cell into a list without failing on re-runs.

    Parameters
    ----------
    value : object
        A cell value: a string holding a Python literal such as
        "['salsa', 'jazz']", an already-parsed value, or a missing value
        (None / NaN).

    Returns
    -------
    object
        The evaluated literal for strings, an empty list for missing values,
        and the value itself for anything else.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid
        Python literal.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    # The left merge yields NaN for interactions with no matching user row;
    # ast.literal_eval would raise on it, so treat it as "no styles declared".
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return []
    # Already parsed (e.g. this cell was executed before): leave untouched so
    # that running the cell a second time is a no-op instead of an error.
    return value


df['danceStyles'] = df['danceStyles'].apply(_parse_list_literal)


In [12]:
# Preview the first 10 rows; 'danceStyles' should now hold Python lists rather than strings.
df.head(10)


Unnamed: 0,id,user_id,video_id,like,comentario,watchtime,dont_suggest,name,edad,danceStyles,categorias_preferidas,videoTitle,etiquetas,categoria,estilo
0,0,372,66,1,Inside health spend cold few glass whose forei...,297.46,0,Shannon Madden,16,"[jazz, contemporary]","['challenge', 'tutorial']",Former election hair manager,"['ritmo', 'solo', 'nivel_medio', 'nivel_avanza...",duet,"['jazz', 'breakdance']"
1,1,34,425,0,Might stage practice station thousand hundred ...,59.97,0,Jody Johnson,47,"[flamenco, salsa, jazz]","['challenge', 'duet']",Particularly strategy local democratic,"['nivel_facil', 'ritmo', 'pareja', 'creativida...",performance,"['breakdance', 'contemporary']"
2,2,386,323,0,Various myself factor difference better four s...,33.91,0,Edward Baker,19,"[salsa, ballet, contemporary]","['performance', 'freestyle', 'battle']",Shoulder,"['nivel_medio', 'grupo', 'nivel_facil', 'pareja']",tutorial,['hiphop']
3,3,962,223,1,,510.88,0,Timothy Ewing,19,[hiphop],"['battle', 'freestyle', 'performance', 'class']",Example,"['grupo', 'creatividad', 'coreografia', 'solo'...",performance,['salsa']
4,4,318,467,0,,36.45,0,Margaret Lowe,36,"[breakdance, jazz, ballet]","['freestyle', 'class', 'challenge', 'duet']",Great statement buy enter,"['pareja', 'creatividad', 'nivel_medio', 'solo']",tutorial,"['flamenco', 'breakdance']"
5,5,801,383,0,,49.33,0,Anthony Schultz,35,[flamenco],"['battle', 'performance', 'challenge', 'duet']",Include listen,"['nivel_avanzado', 'energia', 'ritmo', 'coreog...",challenge,['jazz']
6,6,933,190,0,Move property animal nearly poor trial include...,28.5,0,Marissa Henderson,39,[breakdance],"['tutorial', 'challenge', 'performance']",Our,"['solo', 'nivel_medio', 'grupo', 'ritmo', 'cor...",class,['contemporary']
7,7,899,336,0,Enough could even whether next send hit accoun...,44.77,0,Brittany Case,15,"[salsa, flamenco, hiphop]","['tutorial', 'performance']",Successful trouble your treat,"['energia', 'ritmo', 'creatividad']",tutorial,"['breakdance', 'flamenco']"
8,8,129,76,1,Hot expect drop amount mission quickly deal ne...,261.34,0,Megan Washington,59,"[flamenco, contemporary]","['class', 'battle', 'duet']",Rest size,"['nivel_medio', 'ritmo', 'creatividad']",battle,['hiphop']
9,9,480,448,0,Today federal husband six bank second everyone...,55.31,0,Tami Hall,39,[hiphop],"['freestyle', 'performance', 'class', 'battle']",Team part most no,"['nivel_medio', 'pareja', 'grupo', 'creatividad']",challenge,"['salsa', 'contemporary']"
