In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv("movies_updated.csv")

In [3]:
df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,"runtime,,"
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000,46998772.0,Warner Bros.,"146.0,"
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000,58853106.0,Columbia Pictures,"104.0,"
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000,538375067.0,Lucasfilm,"124.0,"
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000,83453539.0,Paramount Pictures,"88.0,"
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000,39846344.0,Orion Pictures,"98.0,"


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       4000 non-null   object 
 1   rating     3960 non-null   object 
 2   genre      4000 non-null   object 
 3   year       4000 non-null   int64  
 4   released   4000 non-null   object 
 5   score      4000 non-null   float64
 6   votes      4000 non-null   int64  
 7   director   4000 non-null   object 
 8   writer     3999 non-null   object 
 9   star       3999 non-null   object 
 10  country    4000 non-null   object 
 11  budget     4000 non-null   int64  
 12  gross      3831 non-null   float64
 13  company    3990 non-null   object 
 14  runtime,,  4000 non-null   object 
dtypes: float64(2), int64(3), object(10)
memory usage: 468.9+ KB


In [5]:
df.isnull().sum() # df.is.na().sum()

name           0
rating        40
genre          0
year           0
released       0
score          0
votes          0
director       0
writer         1
star           1
country        0
budget         0
gross        169
company       10
runtime,,      0
dtype: int64

In [6]:
df.columns

Index(['name', 'rating', 'genre', 'year', 'released', 'score', 'votes',
       'director', 'writer', 'star', 'country', 'budget', 'gross', 'company',
       'runtime,,'],
      dtype='object')

In [7]:
df['rating'].unique()

array(['R', 'PG', 'G', nan, 'Not Rated', 'NC-17', 'Approved', 'TV-PG',
       'PG-13', 'Unrated', 'X', 'TV-MA'], dtype=object)

In [8]:
df['rating'] = df['rating'].fillna('Not Rated')

In [9]:
df['rating_simplified'] = df['rating'].replace({
    'Unrated': 'Not Rated', 'Not Rated': 'Not Rated', 'Approved': 'Not Rated',
    'TV-PG': 'TV', 'TV-MA': 'TV', 'X': 'NC-17'
})

In [10]:
df['rating_simplified'] = df['rating_simplified'].replace({'NC-17':'Other', 'TV':'Other'})

In [11]:
df['rating_simplified'].unique()

array(['R', 'PG', 'G', 'Not Rated', 'Other', 'PG-13'], dtype=object)

In [12]:
df['release_date'] = df['released'].str.extract(r'^(.*)\s\(')[0]
df['release_country'] = df['released'].str.extract(r'\((.*)\)')[0]

In [13]:
df = df.dropna(subset=['star', 'writer', 'company'])

In [14]:
df.columns

Index(['name', 'rating', 'genre', 'year', 'released', 'score', 'votes',
       'director', 'writer', 'star', 'country', 'budget', 'gross', 'company',
       'runtime,,', 'rating_simplified', 'release_date', 'release_country'],
      dtype='object')

In [15]:
# Extract main star/writer
df['main_star'] = df['star'].str.split(',').str[0]
df['main_writer'] = df['writer'].str.split(',').str[0]
df['main_director'] = df['director'].str.split(',').str[0]


# Encode main star/writer
le_star = LabelEncoder()
df['main_star_encoded'] = le_star.fit_transform(df['main_star'])

le_writer = LabelEncoder()
df['main_writer_encoded'] = le_writer.fit_transform(df['main_writer'])

le_star = LabelEncoder()
df['main_director_encoded'] = le_star.fit_transform(df['main_director'])


In [16]:
# Extract primary genre
df['primary_genre'] = df['genre'].str.split(',').str[0]

# Multi-hot encoding for top genres (optional)
top_genres = df['primary_genre'].value_counts().nlargest(10).index
for genre in top_genres:
    df[f'genre_{genre}'] = df['primary_genre'].apply(lambda x: 1 if x == genre else 0)


In [17]:
# Top 10 countries
top_countries = df['country'].value_counts().nlargest(10).index
df['country_clean'] = df['country'].apply(lambda x: x if x in top_countries else 'Other')

# Encode
from sklearn.preprocessing import LabelEncoder
le_country = LabelEncoder()
df['country_encoded'] = le_country.fit_transform(df['country_clean'])


In [18]:
# Top 20 companies
top_companies = df['company'].value_counts().nlargest(20).index
df['company_clean'] = df['company'].apply(lambda x: x if x in top_companies else 'Other')

# Encode
le_company = LabelEncoder()
df['company_encoded'] = le_company.fit_transform(df['company_clean'])


In [19]:
df.head(5)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,...,genre_Adventure,genre_Biography,genre_Horror,genre_Animation,genre_Fantasy,genre_Mystery,country_clean,country_encoded,company_clean,company_encoded
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000,Stanley Kubrick,Stephen King,Jack Nicholson,...,0,0,0,0,0,0,United Kingdom,9,Warner Bros.,20
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,...,1,0,0,0,0,0,United States,10,Columbia Pictures,2
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000,Irvin Kershner,Leigh Brackett,Mark Hamill,...,0,0,0,0,0,0,United States,10,Other,12
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000,Jim Abrahams,Jim Abrahams,Robert Hays,...,0,0,0,0,0,0,United States,10,Paramount Pictures,13
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000,Harold Ramis,Brian Doyle-Murray,Chevy Chase,...,0,0,0,0,0,0,United States,10,Orion Pictures,11


In [20]:
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df['gross'] = pd.to_numeric(df['gross'], errors='coerce')

In [21]:
median_gross = df['gross'].median()
df['gross'].fillna(median_gross, inplace=True)

In [22]:
df['profit'] = df['gross'] - df['budget']

In [23]:
df.columns

Index(['name', 'rating', 'genre', 'year', 'released', 'score', 'votes',
       'director', 'writer', 'star', 'country', 'budget', 'gross', 'company',
       'runtime,,', 'rating_simplified', 'release_date', 'release_country',
       'main_star', 'main_writer', 'main_director', 'main_star_encoded',
       'main_writer_encoded', 'main_director_encoded', 'primary_genre',
       'genre_Comedy', 'genre_Drama', 'genre_Action', 'genre_Crime',
       'genre_Adventure', 'genre_Biography', 'genre_Horror', 'genre_Animation',
       'genre_Fantasy', 'genre_Mystery', 'country_clean', 'country_encoded',
       'company_clean', 'company_encoded', 'profit'],
      dtype='object')

In [24]:
x = df[['year','score','votes','budget', 'gross','main_star_encoded','main_writer_encoded', 'main_director_encoded',
       'genre_Comedy', 'genre_Drama', 'genre_Action', 'genre_Crime','genre_Adventure', 'genre_Biography', 'genre_Horror', 'genre_Animation',
       'genre_Fantasy', 'genre_Mystery', 'country_encoded', 'company_encoded', 'profit']]

y = df['rating_simplified']

smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x, y)

In [25]:
x_train,x_test,y_train,y_test=train_test_split(x_res, y_res,test_size=0.2,random_state=42)

In [186]:
classifier = RandomForestClassifier(n_estimators = 800,max_depth=None,max_features='sqrt',bootstrap=True, criterion= 'gini', random_state=42)
classifier.fit(x_train,y_train)

In [187]:
y_pred = classifier.predict(x_test)

In [188]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           G       0.94      0.97      0.96       399
   Not Rated       0.87      0.96      0.91       393
       Other       0.97      1.00      0.99       413
          PG       0.82      0.77      0.80       428
       PG-13       0.77      0.75      0.76       404
           R       0.75      0.71      0.73       409

    accuracy                           0.86      2446
   macro avg       0.86      0.86      0.86      2446
weighted avg       0.86      0.86      0.86      2446

