# <font color='blue'>Data Science Academy - Machine Learning</font>

# <font color='blue'>Capítulo 15</font>

In [None]:
# Version of the Python language used to run this notebook
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

# <font color='blue'>Part 2 - Netflix Movie Recommendation System</font>

## Installing and Loading the Packages

In [None]:
# To update a package, run the command below in a terminal or command prompt:
# pip install -U package_name

# To install an exact version of a package, run the command below in a terminal or command prompt:
# pip install package_name==desired_version

# After installing or updating a package, restart the Jupyter notebook.

In [None]:
# Instala o pacote watermark. 
# Esse pacote é usado para gravar as versões de outros pacotes usados neste jupyter notebook.
!pip install -q -U watermark

In [None]:
# Imports
import os
import random
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy
from scipy import sparse
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import xgboost as xgb
from datetime import datetime

In [None]:
# Versions of the packages used in this Jupyter notebook
%reload_ext watermark
%watermark -a "Data Science Academy" --iversions

## Data Preparation

The function below will be used to extract data samples from the sparse matrices created in part 1 of the Mini-Project.

In [None]:
# Function to get sparse matrix sample
def gera_amostra_matriz_esparsa(sparse_matrix, num_users, num_movies, path, verbose = True):
    """Build a random sub-sample of a (users x movies) rating matrix and save it to disk.

    Parameters
    ----------
    sparse_matrix : sparse matrix whose rows are users and whose columns are movies.
    num_users : number of distinct users (rows with at least one rating) to keep.
    num_movies : number of distinct movies (columns with at least one rating) to keep.
    path : destination of the ``.npz`` file written with ``sparse.save_npz``.
    verbose : when True, print a message once the task is finished.

    Returns
    -------
    CSR matrix that keeps the original row/column indices and contains only the
    ratings given by the sampled users to the sampled movies.

    Raises
    ------
    ValueError : if ``sparse_matrix`` has no stored ratings.

    Notes
    -----
    Users and movies are drawn WITHOUT replacement, so the sample really contains the
    requested number of distinct users/movies. Requests larger than what is available
    are capped. The global NumPy seed is fixed to 15 to make the sample reproducible.
    """

    # Row indices, column indices and values of every non-zero entry
    row_ind, col_ind, ratings = sparse.find(sparse_matrix)
    users = np.unique(row_ind)
    movies = np.unique(col_ind)

    if users.size == 0 or movies.size == 0:
        raise ValueError('sparse_matrix has no ratings to sample from.')

    # Random seed to reproduce the random process
    np.random.seed(15)

    # User and movie samples: distinct ids only, never more than what exists
    sample_users = np.random.choice(users, min(num_users, len(users)), replace = False)
    sample_movies = np.random.choice(movies, min(num_movies, len(movies)), replace = False)

    # Keep a rating only when both its user and its movie were sampled
    mask = np.logical_and(np.isin(row_ind, sample_users), np.isin(col_ind, sample_movies))

    # Sparse matrix with the sampled ratings, indexed by the original ids
    amostra_matriz_esparsa = sparse.csr_matrix((ratings[mask], (row_ind[mask], col_ind[mask])),
                                               shape = (sample_users.max() + 1, sample_movies.max() + 1))

    # Save to disk
    print('Saving to disk...')
    sparse.save_npz(path, amostra_matriz_esparsa)

    if verbose:
        print('Task completed.\n')

    return amostra_matriz_esparsa

### Generate Sample Training Data

In [None]:
%%time

# Path to the sparse training matrix generated in Part 1 of the Mini-Project
caminho_matriz_treino_original = "dados/matriz_esparsa_treino.npz"

# Load the full sparse training matrix from disk
matriz_esparsa_treino_loaded = sparse.load_npz(caminho_matriz_treino_original)
print("Original Matrix Loaded.")

# Where to save the sample
path = 'dados/amostra_matriz_esparsa_treino.npz'

# Request a sample of 1000 users and 100 movies from the sparse training matrix
amostra_matriz_esparsa_treino = gera_amostra_matriz_esparsa(matriz_esparsa_treino_loaded, 
                                                            num_users = 1000, 
                                                            num_movies = 100, 
                                                            path = path)

### Generating Sample Test Data

In [None]:
%%time

# Path to the sparse test matrix generated in Part 1 of the Mini-Project
caminho_matriz_teste_original = "dados/matriz_esparsa_teste.npz"

# Load the full sparse test matrix from disk
matriz_esparsa_teste_loaded = sparse.load_npz(caminho_matriz_teste_original)
print("Original Matrix Loaded.")

# Where to save the sample
path = 'dados/amostra_matriz_esparsa_teste.npz'

# Request a sample of 200 users and 20 movies from the sparse test matrix
amostra_matriz_esparsa_teste = gera_amostra_matriz_esparsa(matriz_esparsa_teste_loaded, 
                                                           num_users = 200, 
                                                           num_movies = 20,
                                                           path = path)

In [None]:
# Summary: how many ratings ended up in each sample
print(f'Number of evaluations in the matrix with training samples: {amostra_matriz_esparsa_treino.count_nonzero()}')
print(f'Number of evaluations in the matrix with test samples: {amostra_matriz_esparsa_teste.count_nonzero()}')

Note: Samples created. Change the number of users and number of movies if you want to work with larger samples.

### Metrics Extracted from Data

Checking some metrics from the data. The model will predict the user's rating of the movie.

In [None]:
# Container for the averages computed from the training sample
amostra_medias_treino = {}

The function below will be used to calculate the average of ratings.

In [None]:
def calcula_media_ratings(sparse_matrix, of_users):
    """Average the non-zero ratings of every user or of every movie.

    Parameters
    ----------
    sparse_matrix : (users x movies) rating matrix where 0 means "not rated".
        SciPy sparse matrices, SciPy sparse arrays and dense NumPy arrays are accepted.
    of_users : True to average along each row (per user), False to average along
        each column (per movie).

    Returns
    -------
    dict mapping the row index (user) or column index (movie) to its average rating.
    Users/movies without any rating are left out of the dictionary.
    """

    # Axis 1 collapses the columns (one value per user);
    # axis 0 collapses the rows (one value per movie)
    ax = 1 if of_users else 0

    # Sum of the ratings. np.asarray(...).ravel() flattens the result whether
    # .sum() returned an np.matrix (sparse matrices) or an ndarray.
    sum_of_ratings = np.asarray(sparse_matrix.sum(axis = ax)).ravel()

    # Number of ratings: count the entries that are different from zero
    no_of_ratings = np.asarray((sparse_matrix != 0).sum(axis = ax)).ravel()

    # Only indices with at least one rating get an average (avoids dividing by zero);
    # .tolist() keeps the keys as plain Python ints
    return {i: sum_of_ratings[i] / no_of_ratings[i]
            for i in np.flatnonzero(no_of_ratings).tolist()}

Overall average of movie ratings:

In [None]:
# Overall average rating: sum of all stored ratings divided by how many there are
media_global = (
    amostra_matriz_esparsa_treino.sum()
    / amostra_matriz_esparsa_treino.count_nonzero()
)
amostra_medias_treino.update({'global': media_global})
amostra_medias_treino

Average rating per user:

In [None]:
# Average rating given by each user of the training sample
amostra_medias_treino['user'] = calcula_media_ratings(
    amostra_matriz_esparsa_treino,
    of_users = True,
)

In [None]:
# Pick the first user id of the user-average dictionary (just to automate the example below)
um_usuario = list(amostra_medias_treino['user'])[0]
um_usuario

In [None]:
# Show the average rating of the selected user
print(f'Média de Avaliação do Usuário {um_usuario}:', amostra_medias_treino['user'][um_usuario])

Average rating per film:

In [None]:
# Average rating received by each movie of the training sample
amostra_medias_treino['movie'] = calcula_media_ratings(
    amostra_matriz_esparsa_treino,
    of_users = False,
)

In [None]:
# Pick the first movie id of the movie-average dictionary (just to automate the example below)
um_filme = list(amostra_medias_treino['movie'])[0]
um_filme

In [None]:
# Show the average rating of the selected movie
print(f'Média de Avaliação do Filme {um_filme}:', amostra_medias_treino['movie'][um_filme])

## Formatting the Data

We will build a regression model, since the goal is to predict the ratings, which are numerical values.

Prepare the training and test data in the cells below.

These are the model variables.

Predictor Variables (input):

- **GAvg** : Global average of the evaluations


- **Review from similar users**:
    - sur1, sur2, sur3, sur4, sur5 (top 5 users similar to each user who rated a movie)
    

- **Similar movies rated by a user**:
    - smr1, smr2, smr3, smr4, smr5 (top 5 movies similar to each movie rated)


- **UAvg** : Average user ratings


- **MAvg** : Average rating of the film


Target Variable (output):

- **rating** : Rating of the movie given by a user

### Preparing Training Data for the Regression Model

In [None]:
# Split the training sample into parallel arrays: user index, movie index and rating
(amostra_usuarios_treino,
 amostra_filmes_treino,
 amostra_avaliacoes_treino) = sparse.find(amostra_matriz_esparsa_treino)

The cell below takes a long time to run.

In [None]:
%%time

# Check if the file already exists
if os.path.isfile('dados/dados_treino_reg.csv'):
    print("The file already exists and we don't need to create it move..." )
else:
    print('Preparing {} tuples for the dataset...\n'.format(len(amostra_medias_treino)))
    with open('dados/dados_treino_reg.csv', mode = 'w') as reg_data_file:
        count = 0
        for (user, movie, rating) in zip(amostra_usuarios_treino, amostra_filmes_treino, amostra_avaliacoes_treino):
             
            ###### Rating of a "movie" by users similar to the current user ######
            
            # Calculate user similar to current user       
            user_sim = cosine_similarity(amostra_matriz_esparsa_treino[user], 
                                         amostra_matriz_esparsa_treino).ravel()
            
            # Get top users
            top_sim_users = user_sim.argsort()[::-1][1:]
            
            # Get ratings from similar users
            top_ratings = amostra_matriz_esparsa_treino[top_sim_users, movie].toarray().ravel()
            
            # Top similar users up to 5
            top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_users_ratings.extend([amostra_medias_treino['movie'][movie]]*(5 - len(top_sim_users_ratings))) 

            ##### User ratings for movies similar to the current movie #####
            
            # Calculate movies similar to the current movie       
            movie_sim = cosine_similarity(amostra_matriz_esparsa_treino[:,movie].T, 
                                          amostra_matriz_esparsa_treino.T).ravel()
            
            # Top filmes
            top_sim_movies = movie_sim.argsort()[::-1][1:] 
            
            # Get movie ratings most similar to current user
            top_ratings = amostra_matriz_esparsa_treino[user, top_sim_movies].toarray().ravel()
            
            # Top similar users up to 5
            top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
            top_sim_movies_ratings.extend([amostra_medias_treino['user'][user]] * (5-len(top_sim_movies_ratings))) 

            ##### Prepares the line to be stored in the file #####
            row = list()
            row.append(user)
            row.append(movie)
            
            # Added other attributes
            row.append(amostra_medias_treino['global']) 
            row.extend(top_sim_users_ratings)
            row.extend(top_sim_movies_ratings)
            row.append(amostra_medias_treino['user'][user])
            row.append(amostra_medias_treino['movie'][movie])

            row.append(rating)
            count = count + 1
            
            if count == 10:
                break

            reg_data_file.write(','.join(map(str, row)))
            reg_data_file.write('\n')        
            if (count)%10000 == 0:
                print("Concluído para {} linhas----- {}".format(count, datetime.now() - start))

We load the file and place it in a dataframe.

In [None]:
# The CSV has no header line, so the 16 column names are supplied here:
# ids, global average, 5 similar-user ratings, 5 similar-movie ratings, averages and target
df_dados_treino_reg = pd.read_csv(
    'dados/dados_treino_reg.csv',
    header = None,
    names = ['user', 'movie', 'GAvg',
             *[f'sur{i}' for i in range(1, 6)],
             *[f'smr{i}' for i in range(1, 6)],
             'UAvg', 'MAvg', 'rating'],
)

In [None]:
# First rows of the training dataset
df_dados_treino_reg.head()

### Preparing Test Data for the Regression Model

We repeat the same process used for the training data.

In [None]:
# Split the test sample into parallel arrays: user index, movie index and rating
(amostra_usuarios_teste,
 amostra_filmes_teste,
 amostra_avaliacoes_teste) = sparse.find(amostra_matriz_esparsa_teste)

In [None]:
%%time

if os.path.isfile('dados/dados_teste_reg.csv'):
    print("The file already exists and we don't need to create it move...")
else:

    print('Preparing {} tuples for the dataset...\n'.format(len(amostra_avaliacoes_teste)))
    with open('dados/dados_teste_reg.csv', mode='w') as reg_data_file:
        count = 0 
        for (user, movie, rating)  in zip(amostra_usuarios_teste, amostra_filmes_teste, amostra_avaliacoes_teste):
            st = datetime.now()

            # Similarity of users
            try:
                user_sim = cosine_similarity(amostra_matriz_esparsa_treino[user], 
                                             amostra_matriz_esparsa_treino).ravel()
                
                top_sim_users = user_sim.argsort()[::-1][1:] 
                top_ratings = amostra_matriz_esparsa_treino[top_sim_users, movie].toarray().ravel()
                top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
                top_sim_users_ratings.extend([amostra_medias_treino['movie'][movie]]*(5 - len(top_sim_users_ratings)))

            except (IndexError, KeyError):
                top_sim_users_ratings.extend([amostra_medias_treino['global']]*(5 - len(top_sim_users_ratings)))
            except:
                print(user, movie)
                raise

            # Film similarity
            try:
                movie_sim = cosine_similarity(amostra_matriz_esparsa_treino[:,movie].T, 
                                              amostra_matriz_esparsa_treino.T).ravel()
                
                top_sim_movies = movie_sim.argsort()[::-1][1:] 
                top_ratings = amostra_matriz_esparsa_treino[user, top_sim_movies].toarray().ravel()
                top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
                top_sim_movies_ratings.extend([amostra_medias_treino['user'][user]]*(5-len(top_sim_movies_ratings))) 
            except (IndexError, KeyError):
                top_sim_movies_ratings.extend([amostra_medias_treino['global']]*(5-len(top_sim_movies_ratings)))
            except :
                raise

            # Prepare data to write to file
            row = list()
            row.append(user)
            row.append(movie)
            row.append(amostra_medias_treino['global']) 
            row.extend(top_sim_users_ratings)
            row.extend(top_sim_movies_ratings)

            try:
                row.append(amostra_medias_treino['user'][user])
            except KeyError:
                row.append(amostra_medias_treino['global'])
            except:
                raise

            try:
                row.append(amostra_medias_treino['movie'][movie])
            except KeyError:
                row.append(amostra_medias_treino['global'])
            except:
                raise

            row.append(rating)
            
            count = count + 1
            
            if count == 5:
                break
    
            reg_data_file.write(','.join(map(str, row)))
            reg_data_file.write('\n')        
            if (count)%1000 == 0:
                print("Concluído em {} linhas----- {}".format(count, datetime.now() - start))

We load the file and place it in a dataframe.

In [None]:
# Load the test dataset; the CSV has no header line, so the 16 column names are supplied here
df_dados_teste_reg = pd.read_csv(
    'dados/dados_teste_reg.csv',
    header = None,
    names = ['user', 'movie', 'GAvg',
             *[f'sur{i}' for i in range(1, 6)],
             *[f'smr{i}' for i in range(1, 6)],
             'UAvg', 'MAvg', 'rating'],
)

In [None]:
# First rows of the test dataset
df_dados_teste_reg.head()

## Building the Machine Learning Model

The last stage of the work: building, training and evaluating the model.

In [None]:
# One dictionary per data split, keyed by model name, to collect the evaluation results
models_evaluation_train, models_evaluation_test = {}, {}

Below are some functions used to run the model.

In [None]:
# Function for calculating the model error
def calcula_metricas(y_true, y_pred):
    """Compute the RMSE and the MAPE (in percent) between targets and predictions.

    Parameters
    ----------
    y_true : array-like with the observed values. MAPE divides by these values,
        so a zero in ``y_true`` produces an infinite or NaN result.
    y_pred : array-like with the predicted values, same shape as ``y_true``.

    Returns
    -------
    (rmse, mape) : tuple of two floats.

    Raises
    ------
    ValueError : if ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Accept lists, pandas Series or arrays alike
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError('y_true and y_pred must have the same shape: {} != {}'.format(y_true.shape, y_pred.shape))

    # Both metrics are derived from the same vector of errors
    erro = y_true - y_pred
    rmse = np.sqrt(np.mean(erro ** 2))
    mape = np.mean(np.abs(erro / y_true)) * 100
    return rmse, mape

In [None]:
# Function for training and testing the model
def executa_modelo_xgboost(modelo, x_train, y_train, x_test, y_test, verbose = True):
    """Fit ``modelo`` on the training data and measure RMSE/MAPE on train and test data.

    Parameters
    ----------
    modelo : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, x_test : feature tables handed directly to ``fit`` / ``predict``.
    y_train, y_test : targets; they must expose ``.values`` (e.g. a pandas Series).
    verbose : when True, the computed metrics are also printed.

    Returns
    -------
    (train_results, test_results) : two dictionaries with the keys
        'rmse', 'mape' and 'previsoes' (the predictions for that split).
    """

    # Model training.
    # No eval_metric is passed to fit(): that argument was removed from fit() in
    # recent XGBoost releases and, without an evaluation set, it does not influence
    # the trained model.
    print('Training the model..')
    start = datetime.now()
    modelo.fit(x_train, y_train)
    print('Concluded. Total time: {}\n'.format(datetime.now() - start))

    # Model error on the training data
    print('Calculating Metrics with Training Data.')
    y_train_pred = modelo.predict(x_train)
    rmse_train, mape_train = calcula_metricas(y_train.values, y_train_pred)
    train_results = {'rmse': rmse_train, 'mape' : mape_train, 'previsoes' : y_train_pred}

    if verbose:
        print('\nModel Error in Training Data')
        print('-'*30)
        print('RMSE : ', rmse_train)
        print('MAPE : ', mape_train)

    # Model error on the test data
    print('\nEvaluating the model with test data.')
    y_test_pred = modelo.predict(x_test)
    rmse_test, mape_test = calcula_metricas(y_true = y_test.values, y_pred = y_test_pred)
    test_results = {'rmse': rmse_test, 'mape' : mape_test, 'previsoes':y_test_pred}

    if verbose:
        print('-'*30)
        print('RMSE : ', rmse_test)
        print('MAPE : ', mape_test)

    return train_results, test_results

In [None]:
# Fix the seed of Python's and NumPy's global random generators for reproducibility
my_seed = 15
random.seed(my_seed)
np.random.seed(my_seed)

## Model Training

In [None]:
# Training data: the ids and the target are not predictors; 'rating' is the target
x_treino = df_dados_treino_reg.drop(columns = ['user', 'movie', 'rating'])
y_treino = df_dados_treino_reg.loc[:, 'rating']

In [None]:
# Test data: the ids and the target are not predictors; 'rating' is the target
x_teste = df_dados_teste_reg.drop(columns = ['user', 'movie', 'rating'])
y_teste = df_dados_teste_reg.loc[:, 'rating']

In [None]:
# Create the regression model with 100 estimators.
# verbosity = 1 (warnings only) replaces the removed `silent = False` parameter,
# and the seed defined earlier in the notebook is reused instead of a second literal.
modelo_xgb = xgb.XGBRegressor(verbosity = 1, random_state = my_seed, n_estimators = 100)

In [None]:
# Train the model and collect its metrics on the training and test data
train_results, test_results = executa_modelo_xgboost(
    modelo_xgb,
    x_treino,
    y_treino,
    x_teste,
    y_teste,
)

In [None]:
# Register the results of this model under its name in both evaluation dictionaries
models_evaluation_train.update(modelo_xgb = train_results)
models_evaluation_test.update(modelo_xgb = test_results)

In [None]:
# Plot the feature importances of the trained XGBoost model
xgb.plot_importance(modelo_xgb)
plt.show()

In addition to building the model, we also identified the most relevant variables.

Note: User ratings are critical to recommending rated movies to other users.

## Saving the Result

In [None]:
# Save results to disk and read them back
pd.DataFrame(models_evaluation_test).to_csv('dados/resultado.csv')
models = pd.read_csv('dados/resultado.csv', index_col = 0)

# Each column also holds the 'previsoes' array as text, so the values come back from
# the CSV as strings; convert the RMSE row to float so that it is sorted numerically
models.loc['rmse'].astype(float).sort_values()