In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/alx-movie-recommendation-project-2025/sample_submission.csv
/kaggle/input/alx-movie-recommendation-project-2025/movies.csv
/kaggle/input/alx-movie-recommendation-project-2025/imdb_data.csv
/kaggle/input/alx-movie-recommendation-project-2025/genome_tags.csv
/kaggle/input/alx-movie-recommendation-project-2025/genome_scores.csv
/kaggle/input/alx-movie-recommendation-project-2025/train.csv
/kaggle/input/alx-movie-recommendation-project-2025/test.csv
/kaggle/input/alx-movie-recommendation-project-2025/tags.csv
/kaggle/input/alx-movie-recommendation-project-2025/links.csv


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Load data
train_df = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2025/train.csv')#.sample(frac=0.9, random_state=42)
test_df = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2025/test.csv')#.sample(frac=None, random_state=None)
movie_df = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2025/movies.csv')#.sample(frac=None, random_state=None)
tag = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2025/tags.csv')#.sample(frac=None, random_state=None)
links_df = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2025/links.csv')#.sample(frac=None, random_state=None)
imdb_df = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2025/imdb_data.csv')#.sample(frac=None, random_state=None)
genome_scores = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2025/genome_scores.csv')#.sample(frac=None, random_state=None)
genome_tags = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2025/genome_tags.csv')#.sample(frac=None, random_state=None)

def preprocess_data():
    """Preprocess movie features for recommendations."""
    movie_df.dropna(inplace=True)  
    imdb_df.fillna("", inplace=True)
    
    # Convert genres to space-separated words
    movie_df['genres'] = movie_df['genres'].apply(lambda x: x.replace('|', ' '))
    
    # Merge IMDB data
    movies_imdb = movie_df.merge(imdb_df, on="movieId", how="left")
    movies_imdb['plot_keywords'].fillna("", inplace=True)
    
    return movies_imdb

def recommend_movies(movie_title, movies_imdb, top_n=10):
    """Recommend movies using nearest neighbors."""
    tfidf_genre = TfidfVectorizer(stop_words='english')
    genre_matrix = tfidf_genre.fit_transform(movies_imdb['genres']) 
    nn = NearestNeighbors(n_neighbors=top_n+1, metric='cosine')
    nn.fit(genre_matrix)
    
    idx = movies_imdb.index[movies_imdb['title'].str.contains(movie_title, case=False, na=False)].tolist()
    if not idx:
        return []
    idx = idx[0]
    
    distances, indices = nn.kneighbors(genre_matrix[idx])
    return movies_imdb.iloc[indices[0][1:]][['title', 'genres']]

def train_surprise_model(train_df):
    """Train an SVD model using the Surprise library."""
    reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
    data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
    trainset = data.build_full_trainset()
    
    model = SVD()
    model.fit(trainset)
    
    return model 
def predict_ratings(test_df, model):
    """Generate predictions using the trained SVD model."""
    test_df['Id'] = test_df['userId'].astype(str) + "_" + test_df['movieId'].astype(str)
    test_df['rating'] = test_df.apply(lambda row: model.predict(row['userId'], row['movieId']).est, axis=1)
    return test_df[['Id', 'rating']]

def evaluate_rmse(model, test_df):
    """Calculate RMSE using the test dataset."""
    actual_ratings = np.array(test_df['rating'])
    predicted_ratings = np.array([model.predict(uid, iid).est for uid, iid in zip(test_df['userId'], test_df['movieId'])])
    rmse = np.sqrt(np.mean((actual_ratings - predicted_ratings) ** 2))
    return rmse

# Process data
movies_imdb = preprocess_data()

# Train the Surprise SVD model
model = train_surprise_model(train_df)

# Example recommendation
print(recommend_movies("Toy Story", movies_imdb)) 

# Generate predictions using the trained model
predictions_df = predict_ratings(test_df, model)
predictions_df.to_csv('submission77.csv', index=False)

# Evaluate RMSE
rmse_score = evaluate_rmse(model, test_df)
print(f"RMSE Score: {rmse_score}")

  imdb_df.fillna("", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies_imdb['plot_keywords'].fillna("", inplace=True)


                                                   title  \
30472         Scooby-Doo! Mask of the Blue Falcon (2012)   
58039                        Here Comes the Grump (2018)   
57586          Dragons: Dawn Of The Dragon Racers (2014)   
17431  Asterix and the Vikings (Ast√©rix et les Viking...   
52826                  Tangled: Before Ever After (2017)   
22353                              Boxtrolls, The (2014)   
60800                                   UglyDolls (2019)   
55898                             Penguin Highway (2018)   
48614       Puss in Book: Trapped in an Epic Tale (2017)   
43614                                       Moana (2016)   

                                            genres  
30472  Adventure Animation Children Comedy Fantasy  
58039  Adventure Animation Children Comedy Fantasy  
57586  Adventure Animation Children Comedy Fantasy  
17431  Adventure Animation Children Comedy Fantasy  
52826  Adventure Animation Children Comedy Fantasy  
22353  Adventure Ani

In [5]:
sub_df2=pd.read_csv('/kaggle/working/submission5.csv')

In [6]:
sub_df2.shape

(5000019, 2)