# Refining SVD recommender

In [4]:
import pymysql.cursors
import pandas as pd


# Function return a connection.
def getConnection():
    """
    parameters already fixed inside the function
    :return: the connection to the server
    """
    host_ivoox = ''
    user_ivoox = ''
    password_ivoox = ''
    db_ivoox = ''

    connection = pymysql.connect(host=host_ivoox,
                                 user=user_ivoox,
                                 password=password_ivoox,
                                 db=db_ivoox,
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    return connection



def __get_programs_connection_error__():
    programs = pd.read_csv('data/Programs_from_2017.csv', header=0, error_bad_lines=False, warn_bad_lines=False)
    programs.columns = ['id', 'name', 'recomendations', 'category', 'subcat', 'audios', 'last_update']
    programs = programs[['id', 'name']]
    programs['id'] = programs['id'].astype(str)
    programs['name'] = programs['name'].astype(str)
    return programs


def get_programs():
    """
    function to get the table with the programs from the MySQL server.
    :return: DataFrame containing the programs and their information.
    """
    connection = getConnection()
    query = "SELECT programs_id, programs_name FROM ivoox.programs"
    df = pd.read_sql(query, con=connection)

    df.columns = ['id', 'name']

    df['id'] = df['id'].astype(str)
    df['name'] = df['name'].astype(str)
    return df

class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

print(color.BOLD + 'Hello World !' + color.END)

[1mHello World ![0m


In [12]:
import pandas as pd
import numpy as np
import random
import sys

df = pd.read_csv('data/user_program_count.csv', header=0)
df.columns = ['user_id', 'program_id', 'download_count', 'download_updated']

print(df.head())
print('\nMemory taken by the dataframe: ' + str(sys.getsizeof(df)//(2**20)) + 'MB')

   user_id  program_id  download_count     download_updated
0       43          96              27  2016-04-11 00:15:03
1       43         100              16                  NaN
2       43         112              24                  NaN
3       43         164               5                  NaN
4       43         213              24  2017-02-27 00:28:18

Memory taken by the dataframe: 2414MB


In [13]:
df["user_id"] = df["user_id"].astype(str)
df["program_id"] = df["program_id"].astype(str)
df['download_count'] = df['download_count'].astype(int)
df['download_updated'] = pd.to_datetime(df['download_updated'], errors = 'coerce')
users = df['user_id'].unique()
programs = df['program_id'].unique()

print('Número de entradas de las base de datos: ' + str(len(df)))
print('Número de programas en total: '+ str(len(programs)))
print('Número de usuarios: ' + str(len(users)))

print('\nMemory taken by the dataframe: ' + str(sys.getsizeof(df)//(2**20)) + 'MB')

Número de entradas de las base de datos: 25353339
Número de programas en total: 179587
Número de usuarios: 3345446

Memory taken by the dataframe: 3330MB


In [3]:
count_prog = df.groupby('program_id')['user_id'].nunique()

In [4]:
from datetime import datetime

entradas_origen = len(df)
programas_fecha = df[df['download_updated']>datetime(2017,1,1,0,0,0)].program_id.unique() #programas que se han escuchado una vez desde 2015
df = df[df.program_id.isin(programas_fecha)]
rango_oyentes = range(1,11)
programas_oyentes = df[~df.program_id.isin(count_prog[count_prog.isin(rango_oyentes)].index)].program_id.unique()#programas que solo son escuchados entre 1 y 10 
df = df[df.program_id.isin(programas_oyentes)]
print('Antes teníamos ' + str(len(programs)) + ', ahora tenemos un total de ' + str(len(df.program_id.unique())) + ' programas.')
print('Ésto implica que de una base de datos de ' + str(entradas_origen) + ' de entradas hemos pasado a una con ' + str(len(df)) + ' entradas.')
new_users = df['user_id'].unique()
new_programs = df['program_id'].unique()

Antes teníamos 179587, ahora tenemos un total de 51102 programas.
Ésto implica que de una base de datos de 25353339 de entradas hemos pasado a una con 24948472 entradas.


In [5]:
from dateutil.relativedelta import relativedelta
two_years_ago = datetime.now() - relativedelta(years=2)
entradas_origen2 = len(df)
usuarios_fecha = df[df['download_updated']>two_years_ago].user_id.unique() #programas que se han escuchado una vez desde 2015
df = df[df.user_id.isin(usuarios_fecha)]

#eliminamos de nuevo programas con poca audiencia en caso de que los usuarios eliminados hallan podido cambiar la distribución
rango_oyentes = range(1,11)
count_prog = df.groupby('program_id')['user_id'].nunique()
programas_oyentes = df[~df.program_id.isin(count_prog[count_prog.isin(rango_oyentes)].index)].program_id.unique()

print('Antes teníamos ' + str(len(users)) + ', ahora tenemos un total de ' + str(len(df.user_id.unique())) + ' usuarios.')
print('Ésto implica que de una base de datos de ' + str(entradas_origen) + ' de entradas hemos pasado a una con ' + str(len(df)) + ' entradas.')
new_users = df['user_id'].unique()
new_programs = df['program_id'].unique()

Antes teníamos 3345446, ahora tenemos un total de 2304367 usuarios.
Ésto implica que de una base de datos de 25353339 de entradas hemos pasado a una con 20265957 entradas.


In [102]:
try:
    programs = get_programs()
except err.InternalError as e:
    print(e)
    warnings.warn('Fail to conect to the sever, using the downloaded programs dataframe, some programs maybe missing.')
    programs = __get_programs_connection_error__()

except err.OperationalError as e2:
    print(e2)
    warnings.warn('Fail to connect to the sever, using the downloaded programs dataframe, some programs maybe missing.')
    programs = __get_programs_connection_error__()

In [8]:
from datetime import datetime
from math import floor

def get_sample(df,p):
    users = df['user_id'].unique()
    if p >1:
        sample = np.random.choice(users, floor(len(users)*p), replace=False)
    elif p==1:
        sample = users
    elif p>0 and p<1:
        sample = np.random.choice(users, floor(len(users)*p), replace=False)
    else:
        return None
    return sample

date = datetime(2017,1,1,0,0,0)

df1 = df[df['download_updated']>date]
df1 = df1[df1['user_id'].isin(df1['user_id'].value_counts(ascending = True).keys()[df1['user_id'].value_counts(ascending = True)>1])]
sample = get_sample(df, 0.05)


print(len(sample))

ivoox_dict = {'Jose':'5209511', 'Fede':'19764', 'Juan':['2982917', '7811817'], 'Miguel':'911419', 'Emilio':'1276315', 'Yo':'6776060', 'Laura':'138931'}
ivoox = ['5209511', '19764', '2982917', '7811817', '911419','1276315', '6776060', '138931']
sample= np.append(sample, ivoox)
sample.sort()
print(sample)

df1 = df1[df1.user_id.isin(sample)]

print('Users: ' + str(df1['user_id'].unique().shape[0]))
print('Programs: ' + str(df1['program_id'].unique().shape[0]))
print(df1.shape)
df1.head()

Users: 1296825
Programs: 51061
(15179618, 4)


Unnamed: 0,user_id,program_id,download_count,download_updated
4,43,213,24,2017-02-27 00:28:18
6,43,236,72,2017-12-04 00:16:04
11,43,261,8,2017-07-03 00:15:38
18,43,301,269,2017-07-17 00:15:47
20,43,321,120,2017-10-09 00:15:43


In [8]:
from scipy.sparse import csr_matrix

__ones__ = True

user_u = list(sorted(df1.user_id.unique()))
item_u = list(sorted(df1.program_id.unique()))

row = df1.user_id.astype('category').cat.codes
col = df1.program_id.astype('category').cat.codes

df1['row'] = row


if not __ones__:
    data = df1['download_count'].tolist()
else:
    data = np.ones(df1['download_count'].shape)

# data = df1['download_count'].tolist()

table = csr_matrix((data, (row, col)), shape=(len(user_u), len(item_u)))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [9]:
from scipy.sparse import linalg

U, s, V = linalg.svds(table, 100)# k=19) #k: numero de variables latentes, el parametro a optimizar
sigma = np.diag(s)


In [None]:
user_recom = np.dot(np.dot(U, sigma), V)
user_recom = pd.DataFrame(user_recom, columns = df1.program_id.astype('category').cat.categories)

user_recom.shape, df1.user_id.unique().shape

In [None]:
def recommend_podcast(predictions, user, programs, original, df1, row, item_u, k=5):
    # Get and sort the user's predictions
    user_row_number = row[df1.user_id == user].iloc[0] # índice en la matriz del usuario
    sorted_user_predictions = predictions.iloc[user_row_number].sort_values(ascending=False)
    sorted_user_predictions = pd.DataFrame(np.array([sorted_user_predictions.index.astype(str), 
                                                     sorted_user_predictions]).T,
                                                     columns=['id', 'Predictions'], )
    # Get the user's data and merge in the movie information.
    user_data = original[user_row_number,:].todense().T
    user_df = pd.DataFrame(item_u, columns = ['id'])
    user_df['values'] = user_data
    user_df['id'] = user_df['id'].astype(str)
#     print(user_df)
    user_df = user_df[user_df.values[:,1]>0.0]
    user_full = (user_df.merge(programs[['id','name']], how = 'left', left_on = 'id', right_on = 'id').
                     sort_values(['values'], ascending=False)
                 )
    print('User {0} has already listened to {1} programs.'.format(user, len(user_df)))
    print('Recommending the highest {0} predicted program not already listented.'.format(k))
    sorted_user_predictions['id'] = sorted_user_predictions['id'].astype(str)

    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    intersect = programs[~programs['id'].isin(user_full['id'])]

    recommendations = intersect.merge(sorted_user_predictions,
                                      how = 'left', left_on = 'id', right_on = 'id')
    recommendations = recommendations.sort_values('Predictions', ascending = False).iloc[:k, :]
    
    return user_full, recommendations

In [None]:
already_rated, predictions = recommend_podcast(user_recom, ivoox_dict['Miguel'], programs[['id','name']], table, df1, row, item_u, k=10)


print('Lo que ha escuchado el usuario: ')
print(already_rated.to_string(index=False))
print('\nLo que el recomendador dice: ')
print(predictions.to_string(index=False))
# print(df1[df1['user_id']==user_u[i]].sort_values(['download_count'], ascending=False))

## Optimizando Recommendador

In [40]:
def __get_dict__(df):
    d = {k: g['program_id'].tolist() for k, g in df.groupby('user_id')}
    return d

def recommend(U, St, row_batch, item_u, user_u, user_prog, k):
    """
    Recomend to batch of users the top k programs
    :param U: U from SVD
    :param St: Sigma*V from SVD
    :param user_batch: users for which recommend programs
    :param item_u: programs in data
    :param user_u: all users in data
    :param user_prog: programs that users have listened
    :param k: number of recommendations
    :return: small dictionary with recomendations for user in batch
    """
    #     predictions, user, programs, original, row, item_u, k=5
    # Get and sort the user's predictions
    predicted = np.dot(U[row_batch[0]:row_batch[-1]+1, :], St)
    prog = dict()
    j = 0
    
    for i in row_batch: #es necesario el for?
        listened_prog = user_prog[user_u[i]]
        u_cols = np.nonzero(np.isin(item_u, listened_prog))
        current = predicted[j, :]
        j += 1
        current[u_cols] = -1
        ind = current.argsort()[-1:-k - 1:-1]
        prog[user_u[i]] = item_u[list(ind)]
    return prog

In [12]:
user_prog = __get_dict__(df1)

In [33]:
user_batch.shape

(1297,)

In [41]:
St = np.dot(sigma, V)
batches =  np.array_split(list(range(len(user_u))), 50)
row_batch = batches[1]
k = 10

item_u = np.array(sorted(df1.program_id.unique()))
user_u = np.array(sorted(df1.user_id.unique()))
print('Tiempo para {} usuarios.'.format(len(row_batch)))
%timeit recom = recommend(U, St, row_batch, item_u, user_u, user_prog, k) #creo que se puede mejorar

Tiempo para 1297 usuarios.
8.74 s ± 210 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
batches =  np.array_split(user_u, 10)
user_batch = batches[1]

In [43]:
St = np.dot(sigma, V)
batches =  np.array_split(list(range(len(user_u))), 50)
row_batch = batches[1]
k = 10
batches =  np.array_split(user_u, 50)
user_batch = batches[1]

%timeit np.nonzero(np.isin(user_u, user_batch))

user_row_number = np.nonzero(np.isin(user_u, user_batch))
%timeit np.dot(U[user_row_number, :], St)

user_row_number = np.nonzero(np.isin(user_u, user_batch))[0]
%timeit np.dot(U[user_row_number, :], St)

%timeit np.dot(U[user_row_number[0]:user_row_number[-1]+1, :], St)

%timeit np.dot(U[row_batch[0]:row_batch[-1]+1, :], St)

11.7 ms ± 91.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
19.2 s ± 400 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
354 ms ± 5.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
368 ms ± 34.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
user_row_number[0].shape

(1297,)

In [20]:
[user_row_number[i+1]-user_row_number[i] for i in range(len(user_row_number))]

(array([1297, 1298, 1299, ..., 2591, 2592, 2593], dtype=int64),)

# Dando sentido a las componentes

In [117]:
def __get_dict__(df):
    d = {g[0]: g[1] for k, g in df[['id', 'name']].iterrows()}
    return d

def get_columns(programs, column_id):
    prog_name = __get_dict__(programs)
    try:
        c = list(map(lambda x: prog_name[x], column_id))
    except KeyError:
        c = []
        for i in column_id:
            try : c.append(prog_name[i])
            except KeyError: c.append(i)
    return c

def cosine(x, y):
    return np.dot(x.T, y)/(np.linalg.norm(x)*np.linalg.norm(y))
            

def get_features_table(VS, df, programs, k = None):
    count_prog = df.groupby('program_id')['user_id'].nunique().sort_values(ascending=False)
    item_u = list(sorted(df1.program_id.unique()))
    if k is not None:
        ind = list(np.nonzero(np.isin(item_u, count_prog[0:k].index.values))[0])
    else: 
        ind = list(np.nonzero(np.isin(item_u, count_prog.index.values))[0])

    c = get_columns(programs, np.array(item_u)[ind])

    return pd.DataFrame(VS[:,ind], columns=c).T
    

In [26]:
count_prog = df.groupby('program_id')['user_id'].nunique().sort_values(ascending=False)

In [47]:
from scipy.sparse import linalg
from scipy.sparse import csr_matrix

__ones__ = True

user_u = list(sorted(df1.user_id.unique()))
item_u = list(sorted(df1.program_id.unique()))

row = df1.user_id.astype('category').cat.codes
col = df1.program_id.astype('category').cat.codes

df1['row'] = row


if not __ones__:
    data = df1['download_count'].tolist()
else:
    data = np.ones(df1['download_count'].shape)

# data = df1['download_count'].tolist()

table = csr_matrix((data, (row, col)), shape=(len(user_u), len(item_u)))


U, s, V = linalg.svds(table, 200)#k: numero de variables latentes, el parametro a optimizar
sigma = np.diag(s)
VS = np.dot(np.sqrt(sigma),V)

AttributeError: module 'numpy' has no attribute 'sqr'

In [103]:
programs[programs.id.isin(count_prog[0:20].index.values)][['id', 'name']]

Unnamed: 0,id,name
20,213,Canal Comunidad
27,251,Luces en la oscuridad
31,261,Espacio en Blanco
300,772,Milenio3
389,936,Meditación y relajación
552,1254,La Rosa de los Vientos
1934,3652,SER Historia
1984,3761,Misterios
2483,4754,Podcast La Rueda del Misterio
3893,7162,Cuarto Milenio (Oficial)


In [147]:
ind = list(np.nonzero(np.isin(item_u, count_prog[0:10].index.values))[0])
VS[:, ind]

array([[  94.32912776,    5.42944877,  -44.24808504,   25.03231887,
        -177.1740517 ,    2.87679557,   61.23465358,   -9.75402341,
         -19.76468147,  114.97398247],
       [ -16.05747166,    5.01145144,  -36.58022439,  111.67756475,
         -86.68780299,   -3.11831486,   80.68614935,   18.65250242,
          14.98824356, -109.39904196],
       [-150.98000923,   -2.69242634, -118.24740931,   70.36381311,
         -51.26077099,   -1.20441007,   81.30611115,   36.44793146,
         -10.22843556,  -32.52318515],
       [ 140.36432881,  -11.59363017,   43.13180336,   72.16971176,
         -14.67684245,   -6.30285979,   65.49286398,   -2.33185957,
          18.06721096,  -62.90853936],
       [-109.30287795,  -23.68076636,  -55.3121644 ,   38.04878292,
          96.71327886,  -10.0364582 ,   83.94400638,    7.68934668,
          32.55409009,  180.39110169],
       [   2.65052932,   -7.38927251,   10.07973698,   87.32826285,
         -29.02055369,  -18.32947447,   63.22725259,   12

In [59]:
VS = np.dot(sigma,V)
progs_features = get_features_table(VS, df1, programs)

In [105]:
count_prog = df.groupby('program_id')['user_id'].nunique().sort_values(ascending=False)
item_u = list(sorted(df1.program_id.unique()))
ind = list(np.nonzero(np.isin(item_u, count_prog[0:50].index.values))[0])
c = get_columns(programs, np.array(item_u)[ind])

In [112]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, Birch
from sklearn import preprocessing

# Convert DataFrame to matrix
progs_mat = progs_features.as_matrix()
max_abs_scaler = preprocessing.MaxAbsScaler()
x_scaled = max_abs_scaler.fit_transform(progs_mat)

# Using sklearn
km = AgglomerativeClustering(n_clusters=13, affinity='l1', linkage='complete')
# km = Birch(threshold=0.5, branching_factor=50, n_clusters=None)
km.fit(progs_mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
results = pd.DataFrame(data=labels, columns=['cluster'], index = progs_features.index)

for i in list(results[results.index.isin(c)].groupby('cluster')):
    print(color.BOLD + 'Cluster: {}. \n'.format(i[0]) + color.END +
          str('\n'.join(i[1].index.values)+'\n'))

[1mCluster: 0. 
[0mLa Órbita De Endor - podcast-
Oh my LOL La Vida Moderna
El Larguero
Nadie sabe nada, con Buenafuente y Romero
El Partidazo de COPE

[1mCluster: 1. 
[0mCarneCruda.es PROGRAMAS

[1mCluster: 2. 
[0mSER Historia
Canal Comunidad
Misterios
Luces en la oscuridad
Documentales

[1mCluster: 3. 
[0mRetazos de Historia
Podcast La Biblioteca Perdida

[1mCluster: 4. 
[0mTe Invito Un Café | TIUC
Podcast Mantra FM
Libros para Emprendedores
Podcast Superacion Personal
Martha Debayle
Podcast AYUDA PARA LA HUMANIDAD
Podcast Reflexiones de Luz y Amor
::Autoayuda y Exito Garantizado::

[1mCluster: 6. 
[0mMundo Desconocido
Voces del Misterio
El Oro de los Dioses
Podcast La Rueda del Misterio

[1mCluster: 7. 
[0mLa Rosa de los Vientos
Espacio en Blanco
Universo Iker (Oficial)
Misterios
Cuarto Milenio (Oficial)
Documentales Sonoros 

[1mCluster: 9. 
[0mEs la Mañana de Federico
Julia en la Onda

[1mCluster: 10. 
[0mCanal Comunidad
Luces en la oscuridad
Mindalia.com-Salud,Es

In [111]:
from sklearn.cluster import KMeans, DBSCAN
# Convert DataFrame to matrix
progs_mat= progs_features.as_matrix()
# Using sklearn
km = KMeans(12)
km.fit(progs_mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
results = pd.DataFrame(data=labels, columns=['cluster'], index = progs_features[0].index)

for i in list(results[results.index.isin(c)].groupby('cluster')):
    print(color.BOLD + 'Cluster: {}. \n'.format(i[0]) + color.END +
          str('\n'.join(i[1].index.values)+'\n'))

[1mCluster: 0. 
[0mSER Historia
Canal Comunidad
Misterios
Luces en la oscuridad
Martha Debayle
Podcast Reflexiones de Luz y Amor
Documentales

[1mCluster: 1. 
[0mLuces en la oscuridad

[1mCluster: 2. 
[0mLa Rosa de los Vientos
Espacio en Blanco
La Escóbula de la Brújula

[1mCluster: 3. 
[0mTe Invito Un Café | TIUC
Libros para Emprendedores
Podcast Superacion Personal
Podcast AYUDA PARA LA HUMANIDAD
::Autoayuda y Exito Garantizado::

[1mCluster: 4. 
[0mMundo Desconocido
Escuchando Documentales
Solo Documental
Es la Mañana de Federico
Podcast Mantra FM
Retazos de Historia
HistoCast
Voces del Misterio
SER Historia
Podcast El Abrazo del Oso
DIAS EXTRAÑOS con Santiago Camacho
 Docu - Singles
El Oro de los Dioses
Mindalia.com-Salud,Espiritualidad,Conocimiento
Podcast La Rueda del Misterio
CarneCruda.es PROGRAMAS
Julia en la Onda
Podcast La Biblioteca Perdida
La Caja de Pandora
Meditación y relajación

[1mCluster: 6. 
[0mNadie sabe nada, con Buenafuente y Romero

[1mCluster: 7. 


In [118]:
top_20 = get_features_table(VS, df1, programs, k = 20)
count_prog = df.groupby('program_id')['user_id'].nunique().sort_values(ascending=False)
item_u = list(sorted(df1.program_id.unique()))
ind = list(np.nonzero(np.isin(item_u, count_prog[0:20].index.values))[0])
c = get_columns(programs, np.array(item_u)[ind])

In [119]:
from sklearn import preprocessing

x = top_20.values #returns a numpy array
max_abs_scaler = preprocessing.MaxAbsScaler()
x_scaled = max_abs_scaler.fit_transform(x)
top_20 = pd.DataFrame(x_scaled, index = c)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
La Rosa de los Vientos,-0.07005,-0.192095,-0.092034,0.037745,0.05584,-0.019096,0.018969,-0.132083,-0.203562,0.246093,...,0.487188,0.323957,1.0,-1.0,-0.410439,0.057117,0.368325,0.120301,0.104171,0.732313
Oh my LOL La Vida Moderna,-0.047609,0.181765,-0.061037,-0.852529,-0.894207,-0.226107,0.315917,-0.142828,0.760926,0.22275,...,0.034196,0.08376,0.595818,0.317356,0.067246,0.810717,0.55912,0.388651,-0.002506,0.181239
Canal Comunidad,-0.315016,-0.09344,0.370753,0.234658,0.042883,-1.0,1.0,1.0,0.280948,-0.28813,...,-0.176774,-0.078303,-0.183204,0.035268,0.051857,0.036569,-0.122392,0.047484,-0.61371,0.146997
Luces en la oscuridad,0.392354,0.286036,-0.095228,0.018288,-0.176634,0.533407,-0.621131,0.035943,-0.038946,0.064242,...,0.045804,0.025083,0.096469,0.025413,-0.152661,-0.083257,-0.660836,0.353397,-0.185129,0.392962
Espacio en Blanco,0.10945,0.373433,-0.105709,-0.178405,-0.108153,-0.186751,0.066891,0.200986,-0.164021,0.078674,...,-0.24565,0.39074,0.815876,-0.490883,-0.168284,0.050889,-0.113114,-0.566505,0.165102,0.769292
Universo Iker (Oficial),0.144819,0.166518,-0.496033,-0.126889,-0.03916,-0.207131,0.190263,0.337033,0.099324,0.101041,...,0.294768,-0.989518,-0.527811,-0.176952,0.241525,0.395489,0.005149,-1.0,0.237853,0.725532
HistoCast,-1.0,-1.0,1.0,-0.686235,-0.829152,-0.692283,0.308913,0.34809,0.444586,1.0,...,-0.288122,-0.492961,-0.000677,-0.049568,-0.000231,-0.239794,0.487541,0.506007,0.020877,0.344228
SER Historia,0.218304,0.060178,-0.511235,-0.64178,0.048993,0.074684,-0.190057,0.328723,-0.196929,0.113712,...,-0.189994,-0.346936,0.385584,-0.37516,0.036045,-0.158094,0.471747,0.383078,0.041846,0.412207
Misterios,-0.051799,0.158886,-0.118767,0.039441,-0.128797,-0.041951,0.023173,0.186461,-0.0269,-0.034958,...,-1.0,0.610864,0.135128,0.012059,0.615658,-0.128554,-0.017347,-0.542544,0.175797,1.0
Docu - Singles,-0.262279,0.18084,0.437912,-0.315807,-0.215509,-0.581277,0.265968,-0.37256,-0.004381,0.484668,...,0.043581,-0.174795,-0.039698,0.08875,-0.222221,0.093837,-0.050705,-0.317402,0.089051,0.328573


In [129]:
current = top_20[3]
list(current.index[np.abs(current).argsort()[-1:-5:-1]].values)

['Meditación y relajación',
 'Oh my LOL La Vida Moderna',
 'HistoCast',
 'La Escóbula de la Brújula']

In [22]:
print(programs[programs.id.isin(df[df.user_id=='10030'].program_id.values)])
programs[programs.id.isin(['9997', '23309', '23568', '23549', '23531', '2350', '234975',
       '2346', '2345', '2344'])]

        id                        name
300    772                    Milenio3
890   1810            Leyendas urbanas
1294  2473    Es la Mañana de Federico
1323  2529  @Enigmas y Misterios@-2008
3893  7162    Cuarto Milenio (Oficial)
4642  8413        El Partidazo de COPE


Unnamed: 0,id,name
1217,2344,Estrategias de Éxito con Mac Kroupensky
1218,2345,Radio Mente Abierta
1219,2346,Pensamiento Creativo
1223,2350,Podcast El Sexto Continente
5589,9997,Leer es un placer
13644,23309,Ilustres Ignorantes 11/12
13784,23531,Podcast Blues y Jazz
13794,23549,"Santa Teresa de Jesús, su vida en audio"
13809,23568,ANDY DISCO
179792,234975,Podcast de Omar Patricio Medina Grullón
