# Ratings

In [None]:
import ast
import pickle

import numpy as np
import pandas as pd
import scipy.sparse

In [None]:
ratings = pd.read_csv('../clean_data/ratings_gender.csv') # Users with fewer than 15 reviews have been removed

In [None]:
ratings.head()

In [None]:
ratings.info()

In [None]:
ratings.duplicated().sum()

In [None]:
#ratings.drop_duplicates(inplace=True)

In [None]:
ratings.isnull().sum()

In [None]:
# From string to tuple.
# ast.literal_eval only parses Python literals, so a malformed or malicious
# CSV cell cannot execute code (unlike eval). Values that are no longer
# strings are left untouched so the cell can be re-run safely.
ratings['tconst_gender'] = ratings['tconst_gender'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [None]:
ratings[ratings.primaryTitle=='Booksmart']

In [None]:
ratings.userID.nunique()

In [None]:
ratings.tconst.nunique()

In [None]:
ratings_unique_primaryName = ratings.groupby(['primaryName','gender'])['userID'].count().reset_index().rename(columns={'userID':'count'})
ratings_unique_primaryName.gender.value_counts()

In [None]:
ratings_unique_tconst = ratings.groupby(['tconst','gender'])['userID'].count().reset_index().rename(columns={'userID':'count'})
ratings_unique_tconst.gender.value_counts()

In [None]:
ratings.gender.value_counts()

In [None]:
ratings['tconst'].nunique()

In [None]:
ratings.head()

# Sparse matrix

In [None]:
from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse movie-by-user rating matrix from the ratings dataframe.

    Args:
        df: pandas dataframe with 'userID', 'tconst_gender' and 'rating' columns

    Returns (in this order):
        X: sparse matrix of shape (n_movies, n_users); rows are movies,
           columns are users, values are ratings
        user_mapper: dict that maps user id's to user indices (columns of X)
        movie_mapper: dict that maps movie id's to movie indices (rows of X)
        user_inv_mapper: dict that maps user indices to user id's
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    # np.unique gives the sorted distinct ids and, via return_inverse, the
    # position of every row's id in that sorted array. Deriving the sizes,
    # the mappers and the indices from the same call keeps them consistent.
    user_ids, user_index = np.unique(df["userID"], return_inverse=True)
    movie_ids, movie_index = np.unique(df["tconst_gender"], return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    # NOTE: csr_matrix sums the values of repeated (row, col) pairs, so
    # duplicated (movie, user) rows in df add their ratings together.
    X = csr_matrix((df["rating"].to_numpy(), (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [None]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)

In [None]:
# save X matrix to disk
#scipy.sparse.save_npz('../clean_data/sparse_matrix.npz', X)

# read X from disk
#X = scipy.sparse.load_npz('../clean_data/sparse_matrix.npz')

In [None]:
X.shape

In [None]:
X.shape[0]*X.shape[1]

In [None]:
X.count_nonzero()

In [None]:
# Density is the fraction of cells in X that hold a rating; sparsity is the
# fraction that are empty, i.e. its complement.
density = X.count_nonzero() / (X.shape[0] * X.shape[1])
sparsity = 1 - density

print(f"Matrix sparsity: {round(sparsity*100,2)}% (density: {round(density*100,2)}%)")

## Finding similar movies using k-Nearest Neighbours

This approach looks for the $k$ nearest neighbours of a given movie by identifying $k$ points in the dataset that are closest to movie $m$. kNN makes use of distance metrics such as:

1. Cosine similarity
2. Euclidean distance
3. Manhattan distance
4. Pearson correlation 

Although difficult to visualize, we are working in an N-dimensional space, where N represents the number of users in our X matrix: each movie is a point whose coordinates are the ratings it received from each user.

In [None]:
from sklearn.neighbors import NearestNeighbors
# Brute-force nearest-neighbour index using cosine distance; queries return
# 500 neighbours unless a different n_neighbors is passed to kneighbors().
kNN = NearestNeighbors(n_neighbors=500, algorithm="brute", metric='cosine')
# Each row of X is one (tconst, gender) item (see create_X), so neighbours
# are items with similar user-rating vectors.
kNN.fit(X)

In [None]:
# save the model to disk
#filename = '../_model_/KNN.sav'
#pickle.dump(kNN, open(filename, 'wb'))

# read the model from disk
#kNN = pickle.load(open('../_model_/KNN.sav', 'rb'))

In [None]:
def find_similar_movies(movie_id, k, show_distance=False, *, gender='F', max_results=5):
    """
    Finds the nearest neighbours of a given movie, restricted to one gender.

    Relies on the notebook-level objects ``X``, ``kNN``, ``movie_mapper`` and
    ``movie_inv_mapper``.

    Args:
        movie_id: key of the movie of interest as used in ``movie_mapper``
            (a ``(tconst, gender)`` tuple in this notebook)
        k: number of nearest neighbours to scan, the movie itself excluded
        show_distance: if True, return ``(movie_id, distance)`` pairs
            instead of bare movie ids
        gender: only neighbours whose key holds this value at position 1
            are returned
        max_results: maximum number of neighbours to return

    Returns:
        list of at most ``max_results`` similar movie ID's, nearest first
        (``(movie_id, distance)`` pairs when ``show_distance`` is True)

    Raises:
        KeyError: if ``movie_id`` is not a key of ``movie_mapper``
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]

    # A dense 1-D row has to be turned into a single-sample 2-D array.
    if isinstance(movie_vec, np.ndarray):
        movie_vec = movie_vec.reshape(1, -1)

    if max_results < 1:
        return []

    # +1 because the query movie is normally returned as its own nearest
    # neighbour; never request more neighbours than there are rows in X.
    n_candidates = min(k + 1, X.shape[0])
    distances, indices = kNN.kneighbors(
        movie_vec, n_neighbors=n_candidates, return_distance=True
    )

    neighbour_ids = []
    for dist, ind in zip(distances[0], indices[0]):
        ind = int(ind)
        # Skip the query movie explicitly instead of assuming it is the
        # first collected result.
        if ind == movie_ind:
            continue
        candidate = movie_inv_mapper[ind]
        if candidate[1] != gender:
            continue
        neighbour_ids.append((candidate, float(dist)) if show_distance else candidate)
        if len(neighbour_ids) == max_results:
            break

    return neighbour_ids

In [None]:
# Title lookup keyed by the (tconst, gender) tuple used as movie id.
movie_titles = dict(zip(ratings['tconst_gender'], ratings['primaryTitle']))

# Movie to get recommendations for.
movie_id = ('tt1205489', 'M')
movie_title = movie_titles[movie_id]

similar_ids = find_similar_movies(movie_id, 500)

print(f"Because you watched {movie_title}")
for similar_id in similar_ids:
    print(movie_titles[similar_id])

# Finding movies by genre

In [None]:
ratings.head()

In [None]:
# Filter to female-tagged rows first, then keep one row per title. Filtering
# before de-duplicating keeps the boolean mask aligned with the frame it
# indexes, and keeps titles whose first row in `ratings` is not tagged 'F'.
ratings_female = ratings[ratings.gender == 'F'].drop_duplicates('tconst')[['tconst','primaryTitle','primaryName','gender','genres']]

In [None]:
ratings_female.info()

In [None]:
ratings_female[ratings_female["genres"].str.lower().str.contains("comedy", regex=False, na=False)]

In [None]:
# Split each comma-separated genres string into its parts. Rows without a
# genres value are skipped because NaN has no .split().
split_female_genres_list = [x.split(',') for x in ratings_female.genres.dropna().tolist()]
# flat list
flate_female_genres_list = [item for sublist in split_female_genres_list for item in sublist]
# to dataframe
df_female_genres = pd.DataFrame({'genres':flate_female_genres_list})

In [None]:
df_female_genres.drop_duplicates(inplace=True)

In [None]:
# Renumber the rows 0..n-1, discarding the old index.
df_female_genres = df_female_genres.reset_index(drop=True)

In [None]:
df_female_genres

In [None]:
def select_by_genre(data, genre):
    """
    Returns the rows of data whose 'genres' text contains genre.

    The match is a case-insensitive plain-substring test (no regex); rows
    with a missing 'genres' value never match.

    Args:
        data: pandas dataframe with a 'genres' column of strings
        genre: genre name to look for, in any letter case

    Returns:
        dataframe with the matching rows of data
    """
    # Lower-case the query too: the column is lower-cased before matching,
    # so a query containing uppercase letters would otherwise never match.
    matches = data["genres"].str.lower().str.contains(genre.lower(), regex=False, na=False)
    return data[matches]

In [None]:
sel_genre = select_by_genre(ratings_female, 'news')