In [1]:
import pandas as pd
import numpy as np
import torch

from collections import namedtuple
from itertools import chain
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
#Ratings
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', header=None, encoding='latin-1', engine='python')
ratings.columns = ['userId','movieId','rating','timestamp']


#Movies
movies = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, encoding='latin-1', engine='python')
movies.columns = ['movieId','title','genres']

#Users
users = pd.read_csv('ml-1m/users.dat', sep='::', header=None, encoding='latin-1',  engine='python')
users.columns = ['userId','gender','age','occupation','zipCode']

In [3]:
#Data quality
print('Duplicated rows in ratings file: ' + str(ratings.duplicated().sum()))

n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]

print('Number of users: {}'.format(n_users))
print('Number of movies: {}'.format(n_movies))
print('Sparsity: {:4.3f}%'.format(float(ratings.shape[0]) / float(n_users*n_movies) * 100))

Duplicated rows in ratings file: 0
Number of users: 6040
Number of movies: 3706
Sparsity: 4.468%


#### Preprocessing (to be ready)

In [4]:
# rating preprocessing
#ratings = ratings.drop('timestamp', axis=1)

# movie preprocessing
movies['genres'] = movies.apply(lambda row : row['genres'].split("|")[0],axis=1)
movies['movie_year'] = movies.apply(lambda row : int(row['title'].split("(")[-1][:-1]),axis=1)
movies.drop(['title'],axis=1,inplace=True)

# combine rating and movie
rating_movie = pd.merge(ratings,movies,how='left',on="movieId")

# user preprocessing
users['gender'] = users['gender'].replace({'F':0,'M':1})
users['age'] = users['age'].replace({1:0,18:1, 25:2, 35:3, 45:4, 50:5, 56:6 })
users.drop(['zipCode'],axis=1,inplace=True)

# combine into final dataframe
final_df = pd.merge(rating_movie,users,how='left',on='userId')
final_df.sample(5)

  users['gender'] = users['gender'].replace({'F':0,'M':1})


Unnamed: 0,userId,movieId,rating,timestamp,genres,movie_year,gender,age,occupation
386093,2258,2300,4,974589715,Comedy,1968,0,1,4
201408,1242,999,4,974835092,Crime,1996,0,3,0
956585,5771,1214,2,958694174,Action,1979,1,2,7
83124,549,2130,4,976122372,Crime,1980,1,2,6
988442,5964,2701,2,956996610,Action,1999,1,1,5


In [5]:
def encoder(df, cols=None):
    if cols == None:
        cols = list(df.select_dtypes(include=['object']).columns)

    val_types = dict()
    for c in cols:
        val_types[c] = df[c].unique()

    val_to_idx = dict()
    for k, v in val_types.items():
        val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}

    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])

    return val_to_idx, df

In [6]:
#function used to break the data into test and train sets and make embedding of the data
def data_processing(df, wide_cols, embeddings_cols, continuous_cols, target,
    scale=False, def_dim=8):


    if type(embeddings_cols[0]) is tuple:
        emb_dim = dict(embeddings_cols)
        embeddings_cols = [emb[0] for emb in embeddings_cols]
    else:
        emb_dim = {e:def_dim for e in embeddings_cols}
    deep_cols = embeddings_cols+continuous_cols

    # Extract the target and copy the dataframe so we don't mutate it
    # internally.
    Y = np.array(df[target])
    all_columns = list(set(wide_cols + deep_cols ))
    df_tmp = df.copy()[all_columns]


    # Extract the categorical column names that can be one hot encoded later
    categorical_columns = list(df_tmp.select_dtypes(include=['object']).columns)

    
    encoding_dict,df_tmp = encoder(df_tmp)
    encoding_dict = {k:encoding_dict[k] for k in encoding_dict if k in deep_cols}
    embeddings_input = []
    for k,v in encoding_dict.items():
        embeddings_input.append((k, len(v), emb_dim[k]))

    df_deep = df_tmp[deep_cols]
    deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}


    if scale:
        scaler = StandardScaler()
        for cc in continuous_cols:
            df_deep[cc]  = scaler.fit_transform(df_deep[cc].values.reshape(-1,1))

    df_wide = df_tmp[wide_cols]
    del(df_tmp)
    dummy_cols = [c for c in wide_cols if c in categorical_columns]
    df_wide = pd.get_dummies(df_wide, columns=dummy_cols)

    X_train_deep, X_test_deep = train_test_split(df_deep.values, test_size=0.3, random_state=1463)
    X_train_wide, X_test_wide = train_test_split(df_wide.values, test_size=0.3, random_state=1463)
    y_train, y_test = train_test_split(Y, test_size=0.3, random_state=1981)

    group_dataset = dict()
    train_dataset = namedtuple('train_dataset', 'wide, deep, labels')
    test_dataset  = namedtuple('test_dataset' , 'wide, deep, labels')
    group_dataset['train_dataset'] = train_dataset(X_train_wide, X_train_deep, y_train)
    group_dataset['test_dataset']  = test_dataset(X_test_wide, X_test_deep, y_test)
    group_dataset['embeddings_input']  = embeddings_input
    group_dataset['deep_column_idx'] = deep_column_idx
    group_dataset['encoding_dict'] = encoding_dict

    return group_dataset

In [7]:
#settings for the data
wide_cols = ['movie_year','gender','age', 'occupation','genres','userId','movieId']
embeddings_cols = [('genres',20), ('userId',100), ('movieId',100)]
crossed_cols = ()
continuous_cols = ["movie_year","gender","age","occupation"]
target = 'rating'

#split the data and generate the embeddings
data_processed = data_processing(final_df, wide_cols, embeddings_cols, continuous_cols, target, scale=True)

#### Custom Dataset Loader & Model Class

In [8]:
#Loadthe dataset
class DatasetLoader(Dataset):
    def __init__(self, data):

        # Access the tuple fields directly, not by string indexing
        self.X_wide = np.array(data.wide, dtype=np.float32)
        self.X_deep = np.array(data.deep, dtype=np.float32)
        self.Y = np.array(data.labels, dtype=np.float32)

    def __getitem__(self, idx):

        xw = torch.tensor(self.X_wide[idx], dtype=torch.float64)
        xd = torch.tensor(self.X_deep[idx], dtype=torch.float64)
        y = torch.tensor(self.Y[idx], dtype=torch.float64)

        return xw, xd, y

    def __len__(self):
        return len(self.Y)

In [13]:
#class defining the wide and deep neural network
class NeuralNet(nn.Module):

    def __init__(self,
                 wide_dim,
                 embeddings_input,
                 continuous_cols,
                 deep_column_idx,
                 hidden_layers,
                 dropout,
                 encoding_dict,
                 n_class):

        super(NeuralNet, self).__init__()
        self.wide_dim = wide_dim
        self.deep_column_idx = deep_column_idx
        self.embeddings_input = embeddings_input
        self.continuous_cols = continuous_cols
        self.hidden_layers = hidden_layers
        self.dropout = dropout
        self.encoding_dict = encoding_dict
        self.n_class = n_class
        self.loss_values=[]

        # Build the embedding layers to be passed through the deep-side
        for col,val,dim in self.embeddings_input:
            setattr(self, 'emb_layer_'+col, nn.Embedding(val, dim))

        # Build the deep-side hidden layers with dropout if specified
        input_emb_dim = np.sum([emb[2] for emb in self.embeddings_input])
        self.linear_1 = nn.Linear(input_emb_dim+len(continuous_cols), self.hidden_layers[0])
        if self.dropout:
            self.linear_1_drop = nn.Dropout(self.dropout[0])
        for i,h in enumerate(self.hidden_layers[1:],1):
            setattr(self, 'linear_'+str(i+1), nn.Linear( self.hidden_layers[i-1], self.hidden_layers[i] ))
            if self.dropout:
                setattr(self, 'linear_'+str(i+1)+'_drop', nn.Dropout(self.dropout[i]))

        # Connect the wide- and dee-side of the model to the output neuron(s)
        self.output = nn.Linear(self.hidden_layers[-1]+self.wide_dim, self.n_class)


    def compile(self, optimizer="Adam", learning_rate=0.001, momentum=0.0):
        
        self.activation, self.criterion = None, F.mse_loss

        if optimizer == "Adam":
            self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        if optimizer == "RMSprop":
            self.optimizer = torch.optim.RMSprop(self.parameters(), lr=learning_rate)
        if optimizer == "SGD":
            self.optimizer = torch.optim.SGD(self.parameters(), lr=learning_rate, momentum=momentum)

        self.method = 'regression'


    def forward(self, X_w, X_d):

        # Deep Side
        emb = [getattr(self, 'emb_layer_'+col)(X_d[:,self.deep_column_idx[col]].long())
               for col,_,_ in self.embeddings_input]
        if self.continuous_cols:
            cont_idx = [self.deep_column_idx[col] for col in self.continuous_cols]
            cont = [X_d[:, cont_idx].float()]
            deep_inp = torch.cat(emb+cont, 1)
        else:
            deep_inp = torch.cat(emb, 1)

        x_deep = F.relu(self.linear_1(deep_inp))
        if self.dropout:
            x_deep = self.linear_1_drop(x_deep)
        for i in range(1,len(self.hidden_layers)):
            x_deep = F.relu( getattr(self, 'linear_'+str(i+1))(x_deep) )
            if self.dropout:
                x_deep = getattr(self, 'linear_'+str(i+1)+'_drop')(x_deep)

        # Deep + Wide sides
        wide_deep_input = torch.cat([x_deep, X_w.float()], 1)

        if not self.activation:
            out = self.output(wide_deep_input)
        else:
            out = self.activation(self.output(wide_deep_input))

        return out


    def fit(self, dataset, n_epochs, batch_size):

        widedeep_dataset = DatasetLoader(dataset)
        train_loader = torch.utils.data.DataLoader(dataset=widedeep_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)

        # set the model in training mode
        net = self.train()
        for epoch in range(n_epochs):
            total=0
            correct=0
            for i, (X_wide, X_deep, target) in enumerate(train_loader):
                X_w = Variable(X_wide)
                X_d = Variable(X_deep)
                y = (Variable(target).float() if self.method != 'multiclass' else Variable(target))

                X_w, X_d, y = X_w.to(device), X_d.to(device), y.to(device)

                self.optimizer.zero_grad()
                y_pred =  net(X_w, X_d)
                y_pred = torch.squeeze(y_pred)
                loss = self.criterion(y_pred, y)
                loss.backward()
                self.optimizer.step()

                if self.method != "regression":
                    total+= y.size(0)
                    if self.method == 'logistic':
                        y_pred_cat = (y_pred > 0.5).squeeze(1).float()
                    if self.method == "multiclass":
                        _, y_pred_cat = torch.max(y_pred, 1)
                    correct+= float((y_pred_cat == y).sum().data[0])
            self.loss_values.append(loss.item())
            print ('Epoch {} of {}, Loss: {}'.format(epoch+1, n_epochs,
                    round(loss.item(),3)))


    def predict(self, dataset):


        X_w = Variable(torch.from_numpy(np.array(dataset.wide, dtype=np.float32))).float()
        X_d = Variable(torch.from_numpy(np.array(dataset.deep, dtype=np.float32)))

        X_w, X_d = X_w.to(device), X_d.to(device)

        # set the model in evaluation mode so dropout is not applied
        net = self.eval()
        pred = net(X_w,X_d).cpu()
        if self.method == "regression":
            return pred.squeeze(1).data.numpy()
        if self.method == "logistic":
            return (pred > 0.5).squeeze(1).data.numpy()
        if self.method == "multiclass":
            _, pred_cat = torch.max(pred, 1)
            return pred_cat.data.numpy()


 

    def get_embeddings(self, col_name):
        params = list(self.named_parameters())
        emb_layers = [p for p in params if 'emb_layer' in p[0]]
        emb_layer  = [layer for layer in emb_layers if col_name in layer[0]][0]
        embeddings = emb_layer[1].cpu().data.numpy()
        col_label_encoding = self.encoding_dict[col_name]
        inv_dict = {v:k for k,v in col_label_encoding.items()}
        embeddings_dict = {}
        for idx,value in inv_dict.items():
            embeddings_dict[value] = embeddings[idx]

        return embeddings_dict

In [14]:
# Network set up
wide_dim = data_processed['train_dataset'].wide.shape[1]
n_unique = len(np.unique(data_processed['train_dataset'].labels))
n_class = 1

deep_column_idx = data_processed['deep_column_idx']
embeddings_input= data_processed['embeddings_input']
encoding_dict   = data_processed['encoding_dict']
hidden_layers = [100,50]
dropout = [0.5,0.5]

model = NeuralNet(wide_dim, embeddings_input, continuous_cols, deep_column_idx, hidden_layers, dropout, encoding_dict, n_class)
model.compile(optimizer='Adam')
model.to(device)

NeuralNet(
  (emb_layer_genres): Embedding(18, 20)
  (linear_1): Linear(in_features=24, out_features=100, bias=True)
  (linear_1_drop): Dropout(p=0.5, inplace=False)
  (linear_2): Linear(in_features=100, out_features=50, bias=True)
  (linear_2_drop): Dropout(p=0.5, inplace=False)
  (output): Linear(in_features=74, out_features=1, bias=True)
)

In [16]:
train_dataset = data_processed['train_dataset']
test_dataset  = data_processed['test_dataset']

In [15]:
#train the model
model.fit(dataset=train_dataset, n_epochs=20, batch_size=32)

Epoch 1 of 20, Loss: 1.848
Epoch 2 of 20, Loss: 2.235
Epoch 3 of 20, Loss: 1.445
Epoch 4 of 20, Loss: 0.939
Epoch 5 of 20, Loss: 3.265
Epoch 6 of 20, Loss: 1.629
Epoch 7 of 20, Loss: 2.336
Epoch 8 of 20, Loss: 2.051
Epoch 9 of 20, Loss: 1.645
Epoch 10 of 20, Loss: 1.582
Epoch 11 of 20, Loss: 0.772
Epoch 12 of 20, Loss: 1.738
Epoch 13 of 20, Loss: 2.072
Epoch 14 of 20, Loss: 2.082
Epoch 15 of 20, Loss: 0.801
Epoch 16 of 20, Loss: 1.88
Epoch 17 of 20, Loss: 1.379
Epoch 18 of 20, Loss: 3.207
Epoch 19 of 20, Loss: 1.038
Epoch 20 of 20, Loss: 1.751


#### Generating Recommendations by Ranking based on ratings

In [17]:
def data_processing_unlabeled(df, wide_cols, embeddings_cols, continuous_cols, scale=False, def_dim=8):
    """
    Processes the input data (unlabeled data) for prediction, creating the necessary wide and deep embeddings.
    """
    if type(embeddings_cols[0]) is tuple:
        emb_dim = dict(embeddings_cols)
        embeddings_cols = [emb[0] for emb in embeddings_cols]
    else:
        emb_dim = {e:def_dim for e in embeddings_cols}
    deep_cols = embeddings_cols + continuous_cols

    # Copy the dataframe so we don't mutate the original one
    df_tmp = df.copy()

    # Extract the categorical column names for one-hot encoding
    categorical_columns = list(df_tmp.select_dtypes(include=['object']).columns)

    # Encoding the categorical columns (if any)
    encoding_dict, df_tmp = encoder(df_tmp)
    encoding_dict = {k: encoding_dict[k] for k in encoding_dict if k in deep_cols}
    embeddings_input = []
    for k, v in encoding_dict.items():
        embeddings_input.append((k, len(v), emb_dim[k]))

    df_deep = df_tmp[deep_cols]
    deep_column_idx = {k: v for v, k in enumerate(df_deep.columns)}

    # Scaling the continuous columns if required
    if scale:
        scaler = StandardScaler()
        for cc in continuous_cols:
            df_deep[cc] = scaler.fit_transform(df_deep[cc].values.reshape(-1, 1))

    df_wide = df_tmp[wide_cols]
    dummy_cols = [c for c in wide_cols if c in categorical_columns]
    df_wide = pd.get_dummies(df_wide, columns=dummy_cols)

    # Prepare the dataset (without labels for unlabeled data)
    dataset = namedtuple('dataset', 'wide, deep, labels')(
        wide=df_wide.values,
        deep=df_deep.values,
        labels=np.zeros(len(df_deep))  # No target labels, so we set this to zero
    )

    # Return all the processed data (wide, deep, embeddings, etc.)
    return {
        'dataset': dataset,
        'embeddings_input': embeddings_input,
        'deep_column_idx': deep_column_idx,
        'encoding_dict': encoding_dict
    }

In [18]:
# Assume predict user with this user id
predict_user = 6040

# rated and unrated movies by user
rated_movies = final_df[final_df['userId'] == predict_user]['movieId'].unique()
unrated_movies = movies[~movies['movieId'].isin(rated_movies)]
# user data/info
user_data = final_df[final_df['userId'] == predict_user].iloc[0]
user_features = {col: user_data[col] for col in ['userId', 'gender', 'age', 'occupation']}
# prediction input
prediction_input = unrated_movies.copy()
for col, val in user_features.items():
    prediction_input[col] = val
prediction_df = pd.DataFrame(prediction_input)

In [19]:
# Process the prediction data using the adjusted data_processing function for unlabeled data
processed = data_processing_unlabeled(
    prediction_df, 
    wide_cols, 
    embeddings_cols, 
    continuous_cols, 
    scale=True
)

In [20]:
# Generate the dataset for prediction
dataset = processed['dataset']

In [21]:
# Get the predicted ratings for all unrated movies
predicted_ratings = model.predict(dataset)

In [22]:
unrated_movies = np.array(unrated_movies)  # Convert to numpy array if it's a DataFrame or list
predicted_ratings = np.array(predicted_ratings)
unrated_movie_ids = unrated_movies[:, 0]  # This gives the movieId (first column)

unrated_movie_ids = np.ravel(unrated_movie_ids)  # Flatten to 1D if necessary
predicted_ratings = np.ravel(predicted_ratings)  # Flatten to 1D if necessary

# Check the shapes of both arrays to ensure they match
print("Shape of unrated_movie_ids:", unrated_movie_ids.shape)
print("Shape of predicted_ratings:", predicted_ratings.shape)

Shape of unrated_movie_ids: (3542,)
Shape of predicted_ratings: (3542,)


In [24]:
recommendations = pd.DataFrame({
    'movieId': unrated_movie_ids,
    'predicted_rating': predicted_ratings
}).sort_values(by='predicted_rating', ascending=False)

# Get the top N recommendations
top_recommendations = recommendations.head(50)
top_recommendations

Unnamed: 0,movieId,predicted_rating
53,66,3.6366
63,76,3.633712
165,185,3.621214
725,792,3.616106
301,338,3.60367
15,18,3.603591
20,23,3.603017
4,6,3.596178
7,9,3.595834
8,10,3.59572


In [29]:
#Movies
movie_records = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, encoding='latin-1', engine='python')
movie_records.columns = ['movieId','title','genres']

In [31]:
# Convert unrated_movies array into a DataFrame
unrated_movies_df = pd.DataFrame(unrated_movies, columns=['movieId', 'genres', 'movie_year'])

# Merge unrated_movies_df with movie_records to get the movie titles
unrated_movies_df = unrated_movies_df.merge(movie_records[['movieId', 'title']], on='movieId', how='left')

# top movies with details
top_movies_with_details = top_recommendations.merge(unrated_movies_df[['movieId', 'title', 'genres', 'movie_year']], 
                                                    on='movieId', how='left')

top_movies_with_details

Unnamed: 0,movieId,predicted_rating,title,genres,movie_year
0,66,3.6366,Lawnmower Man 2: Beyond Cyberspace (1996),Sci-Fi,1996
1,76,3.633712,Screamers (1995),Sci-Fi,1995
2,185,3.621214,"Net, The (1995)",Sci-Fi,1995
3,792,3.616106,"Hungarian Fairy Tale, A (1987)",Fantasy,1987
4,338,3.60367,Virtuosity (1995),Sci-Fi,1995
5,18,3.603591,Four Rooms (1995),Thriller,1995
6,23,3.603017,Assassins (1995),Thriller,1995
7,6,3.596178,Heat (1995),Action,1995
8,9,3.595834,Sudden Death (1995),Action,1995
9,10,3.59572,GoldenEye (1995),Action,1995


#### Finalize Recommendation function by top k algorithm

In [93]:
def search_query(query, movie_records):
    """
    This function filters the movie records based on a search query.
    The query is case-insensitive and can match substrings in the title or genres.
    
    Args:
    query (str): The search term to filter movies.
    movie_records (DataFrame): The DataFrame containing movie data.

    Returns:
    filtered_movies (DataFrame): The DataFrame of movies that match the query.
    """
    # Normalize the query to lowercase for case-insensitive matching
    query = query.lower()
    
    # Filter by title and genre, if query matches any part of them
    filtered_movies = movie_records[
        movie_records['title'].str.lower().str.contains(query) |
        movie_records['genres'].str.lower().str.contains(query)
    ]
    
    return filtered_movies

In [98]:
def recommend_top_k_movies(predict_user, final_df, movie_records, model, wide_cols, embeddings_cols, continuous_cols, k = 10, search_term = None):
    # If a search term is provided, filter the movies
    #if search_term:
    #    movie_records = search_query(search_term, movie_records)

    # Preprocess movie_records data
    movies = movie_records.copy()
    movie_records['genres'] = movie_records.apply(lambda row : row['genres'].split("|")[0],axis=1)
    movie_records['movie_year'] = movie_records.apply(lambda row : int(row['title'].split("(")[-1][:-1]),axis=1)
    movie_records.drop(['title'],axis=1,inplace=True)
    
    # rated and unrated movies by user
    rated_movies = final_df[final_df['userId'] == predict_user]['movieId'].unique()
    unrated_movies = movie_records[~movie_records['movieId'].isin(rated_movies)]
    # user data/info
    user_data = final_df[final_df['userId'] == predict_user].iloc[0]
    user_features = {col: user_data[col] for col in ['userId', 'gender', 'age', 'occupation']}
    # prediction input
    prediction_input = unrated_movies.copy()
    for col, val in user_features.items():
        prediction_input[col] = val
    prediction_df = pd.DataFrame(prediction_input)

    # Process the prediction data using the adjusted data_processing function for unlabeled data
    processed = data_processing_unlabeled(
        prediction_df, 
        wide_cols, 
        embeddings_cols, 
        continuous_cols, 
        scale=True
    )

    # Generate the dataset for prediction and get prediction from model
    dataset = processed['dataset']
    predicted_ratings = model.predict(dataset)

    unrated_movies = np.array(unrated_movies)  # Convert to numpy array if it's a DataFrame or list
    predicted_ratings = np.array(predicted_ratings)
    unrated_movie_ids = unrated_movies[:, 0]  # This gives the movieId (first column)
    
    unrated_movie_ids = np.ravel(unrated_movie_ids)  # Flatten to 1D if necessary
    predicted_ratings = np.ravel(predicted_ratings)  # Flatten to 1D if necessary

    top_k_recommendations = pd.DataFrame({
        'movieId': unrated_movie_ids,
        'predicted_rating': predicted_ratings
    }).sort_values(by='predicted_rating', ascending=False).head(k)

    # Convert unrated_movies array into a DataFrame and Merge unrated_movies_df with movie_records to get the movie titles
    unrated_movies_df = pd.DataFrame(unrated_movies, columns=['movieId', 'genres', 'movie_year'])
    unrated_movies_df = unrated_movies_df.merge(movies[['movieId', 'title']], on='movieId', how='left')
    
    # top movies with details
    top_movies_with_details = top_k_recommendations.merge(unrated_movies_df[['movieId', 'title', 'genres', 'movie_year']], 
                                                        on='movieId', how='left')

    return top_movies_with_details

In [99]:
predict_user = 6030
final_df = pd.read_csv("final_data.csv")
term = "Action"

#Movies
movie_records = pd.read_csv('ml-1m/movies.dat', sep='::', header=None, encoding='latin-1', engine='python')
movie_records.columns = ['movieId','title','genres']

#settings for the data
wide_cols = ['movie_year','gender','age', 'occupation','genres','userId','movieId']
embeddings_cols = [('genres',20), ('userId',100), ('movieId',100)]
continuous_cols = ["movie_year","gender","age","occupation"]

In [100]:
top_movies = recommend_top_k_movies(predict_user, final_df, movie_records, model, wide_cols, embeddings_cols, continuous_cols, k = 20, search_term = term)

In [101]:
top_movies

Unnamed: 0,movieId,predicted_rating,title,genres,movie_year
0,66,3.613442,Lawnmower Man 2: Beyond Cyberspace (1996),Sci-Fi,1996
1,76,3.610554,Screamers (1995),Sci-Fi,1995
2,185,3.598055,"Net, The (1995)",Sci-Fi,1995
3,792,3.592948,"Hungarian Fairy Tale, A (1987)",Fantasy,1987
4,338,3.580512,Virtuosity (1995),Sci-Fi,1995
5,18,3.580432,Four Rooms (1995),Thriller,1995
6,23,3.579859,Assassins (1995),Thriller,1995
7,6,3.57302,Heat (1995),Action,1995
8,9,3.572676,Sudden Death (1995),Action,1995
9,10,3.572561,GoldenEye (1995),Action,1995


#### Save and Load Model

In [32]:
def save_model(model, filepath):
    """
    Save the model's state dictionary and additional information to a file.
    
    Args:
        model: The instance of the NeuralNet model to save.
        filepath: The path where the model will be saved.
    """
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'encoding_dict': model.encoding_dict,
        'loss_values': model.loss_values
    }
    torch.save(checkpoint, filepath)
    print(f"Model saved to {filepath}")


In [40]:
def load_model(model_class, model_args, filepath, device):
    """
    Load a model from a saved file.

    Args:
        model_class: The class definition of the model.
        model_args: A dictionary of arguments to initialize the model.
        filepath: The path to the saved model file.
        device: The device on which to load the model (e.g., "cpu" or "cuda").

    Returns:
        An instance of the model with the loaded state and additional data.
    """
    # Load the checkpoint
    checkpoint = torch.load(filepath, map_location=device)
    
    # Initialize the model
    model = model_class(**model_args)
    
    # Load the state dictionary into the model
    model.load_state_dict(checkpoint['model_state_dict'])
    
    # Restore additional information
    model.encoding_dict = checkpoint.get('encoding_dict', {})
    model.loss_values = checkpoint.get('loss_values', [])
    
    model.to(device)
    print(f"Model loaded from {filepath}")
    return model

In [34]:
save_model(model, "./model/movie_recommendation_model.pth")

Model saved to ./model/movie_recommendation_model.pth


In [41]:
model_args = {
    'wide_dim': wide_dim,
    'embeddings_input': embeddings_input,
    'continuous_cols': continuous_cols,
    'deep_column_idx': deep_column_idx,
    'hidden_layers': hidden_layers,
    'dropout': dropout,
    'encoding_dict': encoding_dict,  # Will be updated during loading
    'n_class': 1
}

loaded_model = load_model(NeuralNet, model_args, "./model/movie_recommendation_model.pth", device)

Model loaded from ./model/movie_recommendation_model.pth
