In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv(r"E:\TakeYourFilm\Cleaned_dataset\merged_movies.csv")

# Define features and target variable
features = ['genre', 'rating', 'votes']
target = ['normalized_rating']

# Separate features (X) and target variable (y)
X = df[features]
y = df[target]

# Display initial data
print("Features (X):")
print(X.head())

print("\nTarget Variable (Y):")
print(y.head())


Features (X):
    genre  rating     votes
0  Action     6.9  204835.0
1  Action     7.8  295119.0
2  Action     6.5   26220.0
3  Action     8.0  327858.0
4  Action     0.0       0.0

Target Variable (Y):
   normalized_rating
0               0.69
1               0.78
2               0.65
3               0.80
4               0.00


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Handle missing values
X.fillna({'year': 'Unknown', 'runtime': 0, 'rating': 0, 'votes': 0}, inplace=True)

# Encode categorical 'genre' using One-Hot Encoding
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
genre_encoded = encoder.fit_transform(X[['genre']])
genre_columns = encoder.get_feature_names_out(['genre'])

# Convert encoding to DataFrame
genre_df = pd.DataFrame(genre_encoded, columns=genre_columns)

# Drop original 'genre' column and merge encoded values
X = X.drop(columns=['genre'])
X = pd.concat([X, genre_df], axis=1)

# Normalize numerical features
scaler = MinMaxScaler()
X[['rating', 'votes']] = scaler.fit_transform(X[['rating', 'votes']])

# Split data into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("✅ Preprocessing completed. Data is ready for training!")


✅ Preprocessing completed. Data is ready for training!


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna({'year': 'Unknown', 'runtime': 0, 'rating': 0, 'votes': 0}, inplace=True)


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📊 Model Evaluation:\nMSE: {mse:.4f}\nR² Score: {r2:.4f}")


📊 Model Evaluation:
MSE: 0.0000
R² Score: 1.0000


In [5]:
import numpy as np

def recommend_movies(user_genre, model, encoder, scaler, df):
    # One-hot encode the user-input genre
    genre_encoded = encoder.transform([[user_genre]])
    genre_df = pd.DataFrame(genre_encoded, columns=encoder.get_feature_names_out(['genre']))

    # Create input DataFrame with default numerical values
    input_data = pd.DataFrame([[0, 0]], columns=['rating', 'votes'])

    # Merge genre encoding
    input_data = pd.concat([input_data, genre_df], axis=1)

    # Ensure column order matches trained model input
    input_data = input_data.reindex(columns=X_train.columns, fill_value=0)

    # Normalize rating and votes
    input_data[['rating', 'votes']] = scaler.transform(input_data[['rating', 'votes']])

    # Handle any NaN values
    input_data.fillna(0, inplace=True)

    # Predict IMDb rating
    predicted_rating = model.predict(input_data)[0][0]

    # Ensure genre column exists in dataset
    genre_column = f'genre_{user_genre}'
    if genre_column not in df.columns:
        raise ValueError(f"❌ Genre '{user_genre}' not found in dataset!")

    # Filter movies based on genre and predicted rating
    recommended_movies = df[(df[genre_column] == 1) & (df['normalized_rating'] >= predicted_rating)]
    
    # Sort and return top 10 movies
    return recommended_movies[['movie_name', 'rating', 'votes']].sort_values(by='normalized_rating', ascending=False).head(10)


In [6]:
# Example: Recommend top 'Action' movies
recommend_movies('Fantasy', model, encoder, scaler, df)




ValueError: ❌ Genre 'Fantasy' not found in dataset!

In [31]:
# Check available genre columns
print("Available Genre Columns:", [col for col in df.columns if col.startswith('genre_')])
print("Unique genres before encoding:", X['genre'].unique())


Available Genre Columns: []


KeyError: 'genre'

In [32]:
# Check if 'genre' exists before accessing
if 'genre' in X.columns:
    print("Unique genres before encoding:", X['genre'].unique())
else:
    print("❌ 'genre' column is missing in X! Available columns:", X.columns)


❌ 'genre' column is missing in X! Available columns: Index(['year', 'runtime', 'rating', 'votes', 'genre_Action', 'genre_Family',
       'genre_Fantasy', 'genre_Horror', 'genre_Mystery', 'genre_Romance',
       'genre_Sci-Fi', 'genre_Thriller'],
      dtype='object')
