In [168]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import pickle

Loading the dataset

In [169]:
# Read the full dataset from a CSV file located in the working directory.
df = pd.read_csv("fullDataset.csv")

In [170]:
# Inspect the column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16038 entries, 0 to 16037
Data columns (total 36 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tconst                        16038 non-null  object 
 1   primaryTitle                  16038 non-null  object 
 2   startYear                     16038 non-null  int64  
 3   runtimeMinutes                16038 non-null  int64  
 4   rating                        16038 non-null  float64
 5   primaryGenre_Adult            16038 non-null  bool   
 6   primaryGenre_Adventure        16038 non-null  bool   
 7   primaryGenre_Animation        16038 non-null  bool   
 8   primaryGenre_Biography        16038 non-null  bool   
 9   primaryGenre_Comedy           16038 non-null  bool   
 10  primaryGenre_Crime            16038 non-null  bool   
 11  primaryGenre_Documentary      16038 non-null  bool   
 12  primaryGenre_Drama            16038 non-null  bool   
 13  p

Weighting selected features in the dataframe (every multiplier is currently 1, so the values are unchanged)

In [83]:
# Disabled step: remove the "primaryGenre_\N" column.
# NOTE(review): as written, drop() would look for a row label with that name;
# dropping the column would need columns=... (or axis=1).
#df = df.drop("primaryGenre_\\N")

In [171]:
# Per-feature multipliers. All of them are 1, so this is currently a no-op
# placeholder for weighting these columns against the others.
# Note: "rating" is standardized further down, which removes any constant
# factor applied to it here.
for weighted_col in ("directors_count", "actor1_count", "actor2_count", "rating"):
    df[weighted_col] *= 1

Changing the IMDB IDs to an index

In [172]:
# Keep a handle on the ID column, then promote it to the row index so that
# each row can be looked up by its ID.
tconsts = df['tconst']
df = df.set_index('tconst')

Dropping columns that are not used as features (the title and the release year)

In [173]:
# Columns excluded from the feature matrix. The name is historical:
# 'startYear' is an integer column, it is simply not used as a feature.
non_numeric_cols = ['primaryTitle', 'startYear']
# errors='ignore' skips any listed column that is not present in df.
df_train = df.drop(non_numeric_cols, axis=1, errors='ignore')

Standardizing the runtimeMinutes and rating features (all other features are left unscaled)

In [174]:
# Only these columns are standardized (zero mean, unit variance).
columns_to_scale = ['runtimeMinutes', 'rating']

# Learn each column's mean and standard deviation, then apply the transform.
scaler = StandardScaler().fit(df_train[columns_to_scale])
scaled_features = scaler.transform(df_train[columns_to_scale])

# Re-wrap the resulting array so it carries the original row and column labels.
scaled_df = pd.DataFrame(scaled_features, index=df_train.index, columns=columns_to_scale)

# Everything else (the one-hot genre flags and the *_count columns) passes
# through untouched.
# NOTE(review): the *_count columns stay on their raw scale; confirm that is
# intended given the cosine metric used by the nearest-neighbour model.
non_scaled_df = df_train.drop(columns_to_scale, axis=1)

# Stitch both parts together column-wise; the standardized columns end up last.
df_scaled = pd.concat((non_scaled_df, scaled_df), axis=1)

Training KNN model using scaled features

In [175]:
# Preview the first ten rows of the feature matrix the model is fitted on.
df_scaled.head(10)

Unnamed: 0_level_0,primaryGenre_Adult,primaryGenre_Adventure,primaryGenre_Animation,primaryGenre_Biography,primaryGenre_Comedy,primaryGenre_Crime,primaryGenre_Documentary,primaryGenre_Drama,primaryGenre_Family,primaryGenre_Fantasy,...,primaryGenre_TV Movie,primaryGenre_Thriller,primaryGenre_War,primaryGenre_Western,primaryGenre_\N,directors_count,actor1_count,actor2_count,runtimeMinutes,rating
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0111161,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,4,7,22,1.096833,3.694201
tt0068646,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,14,14,11,2.121268,3.543144
tt0468569,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,13,24,4,1.407268,3.249903
tt0167260,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,14,16,6,2.928398,3.247007
tt0108052,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,34,32,9,2.742138,3.243606
tt0071562,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,14,25,16,2.959442,3.242783
tt0050083,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,17,12,2,-0.331167,3.235308
tt0252487,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,14,9,4,-0.672646,3.131219
tt0110912,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,11,28,9,1.469355,3.1
tt0120737,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,14,16,6,2.214398,3.099162


In [179]:
# Cosine-distance nearest-neighbour model fitted on every row of df_scaled.
# n_neighbors=11 is only the default query size; get_similar_movies passes
# its own n_neighbors on each call.
knn_model = NearestNeighbors(metric='cosine', n_neighbors=11)
knn_model.fit(df_scaled)

Function to get movie recommendations based on any number of inputs

In [180]:
def get_similar_movies(input_ids, k=10):
    """Recommend up to ``k`` movies similar to the given movie(s).

    The feature rows of the input movies are averaged into one query vector,
    the fitted ``knn_model`` is asked for that vector's nearest neighbours,
    and the input movies themselves are removed from the result.

    Parameters
    ----------
    input_ids : str or iterable of str
        One ID, or several IDs, taken from the index of ``df_scaled``.
        IDs that are not in the index are ignored.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Index labels of ``df_scaled`` in the order returned by the model,
        at most ``k`` long. Fewer are returned when the dataset does not
        contain ``k`` other rows.

    Raises
    ------
    ValueError
        If none of the provided IDs is present in ``df_scaled``.
    """
    # A bare string is not list-like and would be rejected by ``isin``;
    # treat it as a single ID. Other iterables are materialized once so
    # that generators can be both measured and searched.
    if isinstance(input_ids, str):
        input_ids = [input_ids]
    else:
        input_ids = list(input_ids)
    requested = set(input_ids)

    input_vectors = df_scaled.loc[df_scaled.index.isin(input_ids)]
    if input_vectors.empty:
        raise ValueError("None of the provided movie IDs were found.")

    # Cast to float before averaging so mixed bool/int/float columns are
    # handled numerically, and keep the result as a one-row DataFrame so the
    # query carries the same column labels as the data the model was fitted on.
    avg_vector = input_vectors.astype(float).mean(axis=0).to_frame().T

    # Ask for extra neighbours to make up for the inputs that get filtered
    # out below, but never for more rows than the data holds (the model
    # rejects such a request).
    n_neighbors = min(k + len(input_ids), len(df_scaled))
    _, indices = knn_model.kneighbors(avg_vector, n_neighbors=n_neighbors)

    # Filter out inputs from result
    similar_ids = df_scaled.index[indices[0]]
    return [mid for mid in similar_ids if mid not in requested][:k]


Testing the recommendation function (the dataframes are saved as csv files in the cell after this one)

In [181]:
# Smoke test: ask for recommendations based on two known titles.
test_ids = ['tt0111161', 'tt0068646']  # The Shawshank Redemption & The Godfather
try:
    results = get_similar_movies(test_ids)
except ValueError as e:
    # Raised when none of the test IDs exists in the feature matrix.
    print("Error:", e)
else:
    print("Top similar movies:", results)

Top similar movies: ['tt2358913', 'tt29515802', 'tt0831888', 'tt0120789', 'tt0329575', 'tt0499549', 'tt0031455', 'tt0066763', 'tt0314412', 'tt1232829']




In [182]:
# Export the unscaled movie table with tconst restored as an ordinary column.
# reset_index() returns a new frame, so df_train itself is left unchanged and
# this cell can be re-run without adding an extra "index" column to the file.
df_train.reset_index().to_csv("movies.csv", index=False)
# The scaled features are written without the tconst index, so their rows
# correspond to the rows of movies.csv by position only.
df_scaled.to_csv("scaled_features.csv", index=False)


Saving the model as a pkl

In [183]:
# Serialize the fitted nearest-neighbour model to disk.
# Pickle files should only ever be loaded from trusted sources.
with open("knn_model.pkl", "wb") as model_file:
    pickle.dump(knn_model, model_file)
print("Model saved as knn_model.pkl")

Model saved as knn_model.pkl


In [78]:
#looking for something different?
#focus on age and personal info
