In [13]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import pickle

# 📂 Load the dataset
df = pd.read_csv("movieDataset.csv")

# 🔎 Keep a reference to IMDb IDs (tconst) before the column becomes the index
tconsts = df['tconst']

# Set tconst as index so it is carried along but never enters the feature matrix
df.set_index('tconst', inplace=True)

# 🧼 Drop the columns that must not be used as training features
non_numeric_cols = ['primaryTitle', 'startYear']
df_train = df.drop(columns=non_numeric_cols, errors='ignore')

# ⚖️ Scale features (tconst is the index, so it is not scaled)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_train)

# 🏋️ Weight features AFTER scaling.
# Multiplying a column by a constant *before* StandardScaler has no effect:
# standardisation divides by the column's standard deviation, which grows by
# the same constant, so the weight cancels out. Applying the weights to the
# standardised matrix is what actually changes the cosine distances.
# The raw frame is deliberately left untouched, so the values written out
# from df_train stay the real ratings/counts rather than inflated ones.
feature_weights = {
    'directors_count': 3.0,
    'actor1_count': 2.0,
    'actor2_count': 2.0,
    'rating': 2.0,
}
# Columns that are absent from the dataset are simply skipped; every column
# without an explicit weight keeps a weight of 1.
column_weights = np.array(
    [feature_weights.get(col, 1.0) for col in df_train.columns]
)
# Broadcasts the per-column weight vector across all rows.
scaled_features = scaled_features * column_weights

# 🧾 Create scaled DataFrame (no 'tconst' column).
# It holds the weighted features, i.e. exactly the space the model is fitted
# on, so rows taken from it can be passed straight to knn_model.kneighbors.
# NOTE: `scaler` alone no longer reproduces this space; new samples must be
# transformed with `scaler` and then multiplied by `column_weights`.
df_scaled = pd.DataFrame(scaled_features, columns=df_train.columns)

# 🤖 Train KNN model (only on scaled + weighted features, without tconst)
knn_model = NearestNeighbors(n_neighbors=6, metric='cosine')
knn_model.fit(scaled_features)

# Sanity check: show the feature columns of the training frame and of the
# scaled frame so they can be compared by eye.
for label, frame in (
    ("Training columns:", df_train),
    ("Scaled data columns:", df_scaled),
):
    print(label, frame.columns)

# 💾 Persist the artefacts

# tconst currently lives in the index; move it back to a regular column so it
# is written to movies.csv (the file is saved without the index).
df_train.reset_index(inplace=True)
df_train.to_csv("movies.csv", index=False)

# Scaled feature matrix, one row per movie, no index column.
df_scaled.to_csv("scaled_features.csv", index=False)

# Serialise the fitted KNN model.
with open("knn_model.pkl", "wb") as model_file:
    model_file.write(pickle.dumps(knn_model))

print("✅ Model, data, and scaled features saved for Flask API.")


Training columns: Index(['runtimeMinutes', 'rating', 'primaryGenre_Adult',
       'primaryGenre_Adventure', 'primaryGenre_Animation',
       'primaryGenre_Biography', 'primaryGenre_Comedy', 'primaryGenre_Crime',
       'primaryGenre_Documentary', 'primaryGenre_Drama', 'primaryGenre_Family',
       'primaryGenre_Fantasy', 'primaryGenre_Film-Noir',
       'primaryGenre_History', 'primaryGenre_Horror', 'primaryGenre_Music',
       'primaryGenre_Musical', 'primaryGenre_Mystery', 'primaryGenre_News',
       'primaryGenre_Reality-TV', 'primaryGenre_Romance',
       'primaryGenre_Sci-Fi', 'primaryGenre_Short', 'primaryGenre_Sport',
       'primaryGenre_Thriller', 'primaryGenre_War', 'primaryGenre_Western',
       'primaryGenre_\N', 'directors_count', 'actor1_count', 'actor2_count'],
      dtype='object')
Scaled data columns: Index(['runtimeMinutes', 'rating', 'primaryGenre_Adult',
       'primaryGenre_Adventure', 'primaryGenre_Animation',
       'primaryGenre_Biography', 'primaryGenre_Comedy'