In [1]:
import pandas as pd
import numpy as np
import os

# Location of the raw dataset. Point this at a full path when the CSV does not
# live in the notebook's working directory.
file_path = 'kent.csv'

if not os.path.exists(file_path):
    # Missing input: report where we looked rather than raising.
    print(
        f"❌ Error: The file '{file_path}' was not found in the current directory: {os.getcwd()}",
        "📌 Please check if the file exists and is named correctly.",
        sep="\n",
    )
else:
    # Read the CSV into the working frame.
    df = pd.read_csv(file_path)
    print("✔ File loaded successfully!\n")

    # Show which columns were parsed from the file.
    print("Columns in dataset:", df.columns)

    # When an 'id' column is present, keep only the first row seen for each id
    # (keep='first' is the pandas default).
    if 'id' in df.columns:
        df = df.drop_duplicates(subset='id')

    # ✅ Function to remove outliers using IQR
    def remove_outliers_iqr(df, column):
        """Drop rows whose value in ``column`` falls outside the IQR fences.

        The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
        where the quartiles are computed from ``df[column]`` itself.

        Rows whose value in ``column`` is missing (NaN) are kept: a missing value
        is not an outlier, and comparing NaN against the fences would otherwise
        silently discard those rows.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to filter. It is not modified.
        column : str
            Name of the column to test against the fences.

        Returns
        -------
        pandas.DataFrame
            The filtered rows, with their original index labels. ``df`` itself is
            returned unchanged (after printing a warning) when ``column`` is
            absent or does not hold numeric data.
        """
        if column not in df.columns:
            print(f"⚠ Warning: Column '{column}' not found in dataset.")
            return df  # Nothing to filter on; hand the frame back untouched.

        values = df[column]

        # Quantiles and fence arithmetic are only meaningful for numeric data;
        # booleans count as "numeric" for pandas but cannot be interpolated.
        is_numeric = (
            pd.api.types.is_numeric_dtype(values)
            and not pd.api.types.is_bool_dtype(values)
        )
        if not is_numeric:
            print(f"⚠ Warning: Column '{column}' is not numeric; skipping outlier removal.")
            return df

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # between() is inclusive on both ends by default and yields False for NaN,
        # so missing values are re-admitted explicitly.
        within_fences = values.between(lower_bound, upper_bound)
        return df[within_fences | values.isna()]

    # Numeric features that are candidates for IQR-based trimming.
    numerical_columns = ['budget', 'popularity', 'runtime', 'vote_average', 'vote_count']

    # Restrict the candidates to columns the loaded frame actually contains.
    existing_numerical_columns = [name for name in numerical_columns if name in df.columns]

    if existing_numerical_columns:
        # The filters run one after another, so each column is evaluated on the
        # rows that survived the filters of the columns before it.
        for column in existing_numerical_columns:
            df = remove_outliers_iqr(df, column)
    else:
        print("⚠ Warning: No numerical columns found for outlier removal.")

    # Summary of the cleaned dataset.
    print("\nSummary of the cleaned dataset:")
    # DataFrame.info() writes its report to stdout itself and returns None, so it
    # is called directly; wrapping it in print() would append a stray "None".
    df.info()

    # Save the cleaned frame to a relative path (resolved against the current
    # working directory); index=False leaves the row index out of the CSV.
    cleaned_file_path = 'cleaned_kent.csv'
    df.to_csv(cleaned_file_path, index=False)
    print(f"\n✔ Cleaned dataset saved as '{cleaned_file_path}'.")


✔ File loaded successfully!

Columns in dataset: Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

Summary of the cleaned dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 3673 entries, 83 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                3673 non-null   int64  
 1   genres                3673 non-null   object 
 2   homepage              1125 non-null   object 
 3   id                    3673 non-null   int64  
 4   keywords              3673 non-null   object 
 5   original_language     3673 non-null   object 
 6   original_title        3673 non-null   object 
 7   overview    