In [5]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import pickle

# 📂 Load the dataset
df = pd.read_csv("fullDataset.csv")

# 🧹 Clean and Weight Features
# (Make sure the weighted columns exist)
for col in ['directors_count', 'actor1_count', 'actor2_count', 'rating']:
    if col in df.columns:
        if col == 'directors_count':
            df[col] *= 3
        elif col in ['actor1_count', 'actor2_count', 'rating']:
            df[col] *= 2

# 🔎 Keep a reference to IMDb IDs (tconst) and save it separately
tconsts = df['tconst']

# Set tconst as index, but it will not be included in the feature scaling
df.set_index('tconst', inplace=True)

# 🧼 Drop non-numeric columns before training
non_numeric_cols = ['primaryTitle', 'startYear']
df_train = df.drop(columns=non_numeric_cols, errors='ignore')

# ⚖️ Scale features (without including tconst)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_train)

# 🧾 Create scaled DataFrame and preserve index
df_scaled = pd.DataFrame(scaled_features, columns=df_train.columns, index=df_train.index)


# 🤖 Train KNN model (only on scaled features, without tconst)
knn_model = NearestNeighbors(n_neighbors=6, metric='cosine')
knn_model.fit(scaled_features)

# ✅ Function to get similar movies based on any number of inputs
def get_similar_movies(input_ids, k=10):
    if not isinstance(input_ids, list):
        input_ids = [input_ids]

    # Get the feature vectors for the selected movies
    input_vectors = df_scaled.loc[df_scaled.index.isin(input_ids)]

    if input_vectors.empty:
        raise ValueError("None of the provided movie IDs were found in the dataset.")

    # Average vector of input movies
    avg_vector = np.mean(input_vectors.values, axis=0).reshape(1, -1)

    # Get nearest neighbors
    distances, indices = knn_model.kneighbors(avg_vector, n_neighbors=k + len(input_ids))

    # Get the corresponding IMDb IDs
    similar_ids = df_scaled.index[indices[0]]

    # Remove input movies from the results
    results = [movie_id for movie_id in similar_ids if movie_id not in input_ids][:k]

    return results

# 🧪 Test Example
test_ids = ['tt0111161', 'tt0068646']  # The Shawshank Redemption & The Godfather
try:
    results = get_similar_movies(test_ids)
    print("Top similar movies:", results)
except ValueError as e:
    print("Error:", e)

# 💾 Save model and data
df_train.reset_index(inplace=True)  # Reset index so that tconst can be saved in movies.csv
df_train.to_csv("movies.csv", index=False)
df_scaled.to_csv("scaled_features.csv", index=False)

with open("knn_model.pkl", "wb") as f:
    pickle.dump(knn_model, f)

print("✅ Model, data, and scaled features saved for Flask API.")


Top similar movies: ['tt0071562', 'tt7286456', 'tt0499549', 'tt1392190', 'tt4154756', 'tt0169547', 'tt0076759', 'tt0114369', 'tt3498820', 'tt3315342']
✅ Model, data, and scaled features saved for Flask API.
