In [3]:
!pip install numpy==1.24.4

Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m101.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blosc2 3.5.1 requires numpy>=1.26, but you have numpy 1.24.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.24.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1

In [1]:
# Install required packages
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2469550 sha256=0d40a7db76064ed2b9091023d4bf7a4ec04aa0562e3d07f240e2e56cc3145130
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163

In [1]:
import numpy as np
# Verify numpy version
print("NumPy version:", np.__version__)

NumPy version: 1.24.4


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gc

In [3]:
# Improved memory optimization function
def reduce_mem_usage(df):
    """ Iterate through all columns of a dataframe and modify the data type
        to reduce memory usage, handling categorical data properly.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            # Convert object columns to category if they have low cardinality
            if len(df[col].unique()) / len(df[col]) < 0.5:
                df[col] = df[col].astype('category')
        elif col_type.name == 'category':
            # Skip already categorical columns
            continue
        else:
            # Numeric columns optimization
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {100 * (start_mem - end_mem) / start_mem:.1f}%')

    return df


In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d parasharmanas/movie-recommendation-system
!unzip movie-recommendation-system.zip

Dataset URL: https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system
License(s): ODbL-1.0
Downloading movie-recommendation-system.zip to /content
 50% 83.0M/165M [00:00<00:00, 867MB/s]
100% 165M/165M [00:00<00:00, 715MB/s] 
Archive:  movie-recommendation-system.zip
  inflating: movies.csv              
  inflating: ratings.csv             


In [4]:
# Load the datasets with memory optimization
print("Loading movies data...")
movies_df = pd.read_csv('movies.csv')

# Preprocess data before memory optimization
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)')
movies_df['year'] = pd.to_numeric(movies_df['year'], errors='coerce')
movies_df['clean_title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '').str.strip()

Loading movies data...


In [5]:
# Now apply memory optimization
movies_df = reduce_mem_usage(movies_df)

Memory usage of dataframe is 2.38 MB
Memory usage after optimization is: 1.50 MB
Decreased by 36.8%


In [6]:
print("\nLoading ratings data...")
# Load ratings in chunks to manage memory
chunksize = 1000000
ratings_chunks = pd.read_csv('ratings.csv', chunksize=chunksize)
ratings_df = pd.concat([reduce_mem_usage(chunk) for chunk in ratings_chunks])
del ratings_chunks
gc.collect()


Loading ratings data...
Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 11.44 MB
Decreased by 62.5%
Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 11.44 MB
Decreased by 62.5%
Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 11.44 MB
Decreased by 62.5%
Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 11.44 MB
Decreased by 62.5%
Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 11.44 MB
Decreased by 62.5%
Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 13.35 MB
Decreased by 56.2%
Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 13.35 MB
Decreased by 56.2%
Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 13.35 MB
Decreased by 56.2%
Memory usage of dataframe is 30.52 MB
Memory usage after optimization is: 13.35 MB
Decreased by 56.2%
Memory usage of dataframe is 30.52 MB
Memory usage after 

13

In [7]:
# Sample a subset of the data for faster prototyping
print("\nSampling data for faster prototyping...")
sample_frac = 0.1  # Use 10% of data - adjust based on your system's memory
ratings_df = ratings_df.sample(frac=sample_frac, random_state=42)
gc.collect()


Sampling data for faster prototyping...


0

In [8]:
# Prepare data for Surprise
print("\nPreparing data for modeling...")
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)


Preparing data for modeling...


In [9]:
# Split data
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [12]:
# Build SVD model
print("\nTraining SVD model...")
svd = SVD(n_factors=100, n_epochs=30, lr_all=0.007, reg_all=0.05)
svd.fit(trainset)


Training SVD model...


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e1b93df1790>

In [13]:
# Evaluate
print("\nEvaluating model...")
predictions = svd.test(testset)
print("RMSE:", accuracy.rmse(predictions))
print("MAE:", accuracy.mae(predictions))


Evaluating model...
RMSE: 0.8896
RMSE: 0.889623385633858
MAE:  0.6801
MAE: 0.6801035806498711


In [14]:
# Save model
print("\nSaving model...")
model_path = 'movie_recommender_svd_model.pkl'
joblib.dump(svd, model_path)


Saving model...


['movie_recommender_svd_model.pkl']

In [15]:
# Content-based filtering components
print("\nPreparing content-based components...")
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['clean_title'].fillna(''))


Preparing content-based components...


In [16]:
# Save TF-IDF components
joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [17]:
# Recommendation functions
def hybrid_recommendations(user_id, title_query=None, n=10):
    """Hybrid recommendation function combining collaborative and content-based filtering"""
    if title_query:
        # Content-based filtering
        query_vec = tfidf.transform([title_query])
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
        similar_indices = np.argpartition(similarity_scores, -n)[-n:]
        similar_movies = movies_df.iloc[similar_indices][['movieId', 'clean_title']]
        similar_movies['predicted_rating'] = similar_movies['movieId'].apply(
            lambda x: svd.predict(user_id, x).est
        )
        return similar_movies.sort_values('predicted_rating', ascending=False).head(n)
    else:
        # Collaborative filtering
        all_movie_ids = ratings_df['movieId'].unique()
        rated_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].values
        unrated_movies = [mid for mid in all_movie_ids if mid not in rated_movies]

        if len(unrated_movies) > 10000:
            unrated_movies = np.random.choice(unrated_movies, 10000, replace=False)

        predictions = []
        for movie_id in unrated_movies:
            predictions.append((movie_id, svd.predict(user_id, movie_id).est))

        predictions.sort(key=lambda x: x[1], reverse=True)
        top_n = predictions[:n]

        recommendations = []
        for movie_id, pred_rating in top_n:
            title = movies_df[movies_df['movieId'] == movie_id]['clean_title'].values[0]
            recommendations.append({'title': title, 'predicted_rating': pred_rating})

        return pd.DataFrame(recommendations)

# Test the system
print("\nTesting recommendation system...")
user_id = 1
print(f"\nTop 10 recommendations for user {user_id}:")
print(hybrid_recommendations(user_id))

query = "Toy Story"
print(f"\nMovies similar to '{query}' that user {user_id} might like:")
print(hybrid_recommendations(user_id, title_query=query))

print("\nRecommendation system implementation complete!")


Testing recommendation system...

Top 10 recommendations for user 1:
                                               title  predicted_rating
0                   Shawshank Redemption, The (1994)          4.032677
1                         Au Hasard Balthazar (1966)          4.030592
2                                        DiG! (2004)          4.029182
3            Winter Light (Nattvardsgästerna) (1963)          4.026713
4  My Mother's Castle (Château de ma mère, Le) (1...          4.016335
5                                 Dersu Uzala (1975)          4.008282
6                                Planet Earth (2006)          3.978574
7                          Harakiri (Seppuku) (1962)          3.967903
8                         Straight Story, The (1999)          3.967163
9                                   Inception (2010)          3.961637

Movies similar to 'Toy Story' that user 1 might like:
       movieId                        clean_title  predicted_rating
14813    78499            