In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from feature_engineering import engineer_features
from model_training import train_and_evaluate


In [20]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.Defaulting to user installation because normal site-packages is not writeable




[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
def preprocess_data(df):
    df.columns = ['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

    df = df.copy()
    label_encoders = {}
    categorical_cols = ['Name', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']


    # First fill NaN values with 'Unknown' and replace empty or whitespace-only strings with 'Unknown'
    df['Name'] = df['Name'].fillna('Unknown')  # Replace NaN with 'Unknown'
    df['Name'] = df['Name'].replace('', 'Unknown')  # Replace empty strings with 'Unknown'
    df['Name'] = df['Name'].apply(lambda x: 'Unknown' if x.strip() == '' else x)  # Replace strings with only spaces with 'Unknown'

    # Clean 'Duration' column
    if df['Duration'].dtype == 'object':
        df['Duration'] = df['Duration'].str.extract('(\d+)').astype(float)

# Fill missing Duration values with median
    df['Duration'] = df['Duration'].fillna(df['Duration'].median())

    # Clean 'Votes' column
    df['Votes'] = df['Votes'].astype(str).str.replace(',', '')
    df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

    # Fill other missing values
    df.fillna({
        'Year': df['Year'].median(),
        'Genre': 'Unknown',
        'Rating': df['Rating'].median(),
        'Votes': df['Votes'].median(),
        'Director': 'Unknown',
        'Actor 1': 'Unknown',
        'Actor 2': 'Unknown',
        'Actor 3': 'Unknown',
    }, inplace=True)

    # Clean the 'Name' column
    df['Name'] = df['Name'].str.replace(r'\(.*?\)', '', regex=True)  # Remove text inside parentheses
    df['Name'] = df['Name'].str.replace(r'[^\w\s]', '', regex=True)  # Remove non-alphanumeric characters
    df['Name'] = df['Name'].str.strip()  # Remove leading/trailing spaces

    # Encode categorical columns like 'Genre' and 'Director'
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    # Rename columns to standard format
    df.columns = ['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

    print("Columns:", df.columns.tolist())
    return df, label_encoders


In [10]:
df = pd.read_csv(r"c:\Users\Lalitha\OneDrive\Desktop\dataaa.csv")
df_cleaned, encoders = preprocess_data(df)
df_engineered = engineer_features(df_cleaned) 
print(df_engineered.columns.tolist())
print(df_engineered.isnull().sum())
mse, r2 = train_and_evaluate(df_engineered)
print("MSE:", mse)
print("R² Score:", r2)

Columns: ['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'director_success_rate', 'genre_avg_rating']
Name                        0
Year                        0
Duration                 8269
Genre                       0
Rating                      0
Votes                       0
Director                    0
Actor 1                     0
Actor 2                     0
Actor 3                     0
director_success_rate       0
genre_avg_rating            0
dtype: int64


ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values