In [3386]:
# Import All Necessary Libraries

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import re


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3387]:
# pd.set_option('display.max_rows', None)

In [3388]:
# pd.reset_option('display.max_rows')

In [3389]:
# Load Dataset

df = pd.read_csv('../scraping/allmovies.csv')

In [3390]:
# Display the cleaned dataset

df.head()

Unnamed: 0,Movie Name,Genres,Time,Rating,Top Cast
0,Venom: The Last Dance\r\n(2024),"Action, Science Fiction, Adventure, Thriller",1h 49m,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...
1,Moana 2\r\n(2024),"Animation, Adventure, Family, Comedy",1h 40m,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...
2,Elevation\r\n(2024),"Action, Science Fiction, Thriller",1h 31m,65,एंथनी मैकीWill Morena BaccarinNina M...
3,Absolution\r\n(2024),"Action, Crime, Thriller",1h 52m,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...
4,Red One\r\n(2024),"Action, Comedy, Fantasy",2h 4m,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...


In [3391]:
# 1. Remove newline characters from 'Movie Name'

df['Movie Name'] = df['Movie Name'].str.replace(r'\r\n', ' ', regex=True)

# Time Parsing

(Converting Time into Total Minutes)

In [3392]:
# 2. Clean the 'Time' column

def convert_time_to_minutes(time_str):
    """Convert time in format '1h 49m' to total minutes."""
    
    time_str = time_str.replace(" ",'')
    time_str = time_str.replace('m', '')

    time = time_str.split('h')

    hours = 0
    mins = 0

    if 'h' in time_str :
        hours = int(time[0])
        if time[1] == '' :
            mins = 0
        else :
            mins = int(time[1])
    elif len(time) == 1 :
        mins = int(time[0])
    else :
        hours = 0
        mins = 0

    return hours*60 + mins


In [3393]:
# Apply the conversion function

df['Time'] = df['Time'].apply(convert_time_to_minutes)

# Check for remaining NaN values in the 'Time' column

nan_times = df[df['Time'].isna()]
print("Entries with NaN in 'Time':")
print(nan_times)

Entries with NaN in 'Time':
Empty DataFrame
Columns: [Movie Name, Genres, Time, Rating, Top Cast]
Index: []


In [3394]:
# Concise Summary of Dataframe

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Movie Name  95 non-null     object
 1   Genres      95 non-null     object
 2   Time        95 non-null     int64 
 3   Rating      95 non-null     int64 
 4   Top Cast    95 non-null     object
dtypes: int64(2), object(3)
memory usage: 3.8+ KB


In [3395]:
# Checking for Null Values

df.isnull().sum()

Movie Name    0
Genres        0
Time          0
Rating        0
Top Cast      0
dtype: int64

In [3396]:
# Checking for Outliers

df.describe()

Unnamed: 0,Time,Rating
count,95.0,95.0
mean,104.831579,67.263158
std,28.209193,10.929377
min,11.0,0.0
25%,92.0,62.0
50%,105.0,68.0
75%,116.5,72.0
max,198.0,97.0


In [3397]:
# Check for Duplicates

df.duplicated().sum()

0

# Removing Movie Names which are in Hindi Language

( Want to See if a movie name is in English lang or not

Searching for a row in a dataset :
if false remove whole row from the dataset )

In [3398]:
# 3. Cleaning the 'Movie Name' Column

def is_english(var) :
    for i in range(0,len(var)) :
        if ord(var[i]) > 200 :
            return False 
    return True   
        
is_english("Eल")

False

In [3399]:
# Checking if it Works

df["Movie Name"].apply(is_english)

0      True
1      True
2      True
3      True
4      True
      ...  
90     True
91     True
92    False
93     True
94     True
Name: Movie Name, Length: 95, dtype: bool

In [3463]:
len(df)

95

In [3400]:
# Apply the function and filter rows

df["Is_English"] = df["Movie Name"].apply(is_english)
df_filtered = df[df["Is_English"]]  # Keep only rows where Is_English is True

# Drop the Is_English column

df_filtered = df_filtered.drop(columns=["Is_English"])

print("Filtered Dataset:")
print(df_filtered)

Filtered Dataset:
                                         Movie Name  \
0                      Venom: The Last Dance (2024)   
1                                    Moana 2 (2024)   
2                                  Elevation (2024)   
3                                 Absolution (2024)   
4                                    Red One (2024)   
..                                              ...   
87                              F Marry Kill (2024)   
90                   Beetlejuice Beetlejuice (2024)   
91             My Hero Academia: You're Next (2024)   
93  Female Teacher: In Front of the Students (1982)   
94                        365 Days: This Day (2022)   

                                           Genres  Time  Rating  \
0    Action, Science Fiction, Adventure, Thriller   109      68   
1            Animation, Adventure, Family, Comedy   100      69   
2               Action, Science Fiction, Thriller    91      65   
3                         Action, Crime, Thriller   1

In [3401]:
# Printing After Removing Hindi 'Movie Names'

df_filtered

Unnamed: 0,Movie Name,Genres,Time,Rating,Top Cast
0,Venom: The Last Dance (2024),"Action, Science Fiction, Adventure, Thriller",109,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...
1,Moana 2 (2024),"Animation, Adventure, Family, Comedy",100,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...
2,Elevation (2024),"Action, Science Fiction, Thriller",91,65,एंथनी मैकीWill Morena BaccarinNina M...
3,Absolution (2024),"Action, Crime, Thriller",112,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...
4,Red One (2024),"Action, Comedy, Fantasy",124,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...
...,...,...,...,...,...
87,F Marry Kill (2024),"Comedy, Thriller",97,67,Lucy HaleEva Vaugh Virginia GardnerKelly ...
90,Beetlejuice Beetlejuice (2024),"Comedy, Fantasy, Horror",105,71,Michael KeatonBeetlejuice विनोना रायडरLyd...
91,My Hero Academia: You're Next (2024),"Animation, Action, Adventure, Science Fiction",110,68,Daiki YamashitaIzuku Midoriya (voice) Nob...
93,Female Teacher: In Front of the Students (1982),"Drama, Mystery, Horror",69,66,Rushia SantôReiko / Teacher Rina OkaKumi ...


In [3464]:
df_filtered = df_filtered.reset_index(drop=True)

In [3465]:
df_filtered

Unnamed: 0,Movie Name,Genres,Time,Rating,Top Cast
0,Venom: The Last Dance (2024),"Action, Science Fiction, Adventure, Thriller",109,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...
1,Moana 2 (2024),"Animation, Adventure, Family, Comedy",100,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...
2,Elevation (2024),"Action, Science Fiction, Thriller",91,65,एंथनी मैकीWill Morena BaccarinNina M...
3,Absolution (2024),"Action, Crime, Thriller",112,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...
4,Red One (2024),"Action, Comedy, Fantasy",124,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...
...,...,...,...,...,...
72,F Marry Kill (2024),"Comedy, Thriller",97,67,Lucy HaleEva Vaugh Virginia GardnerKelly ...
73,Beetlejuice Beetlejuice (2024),"Comedy, Fantasy, Horror",105,71,Michael KeatonBeetlejuice विनोना रायडरLyd...
74,My Hero Academia: You're Next (2024),"Animation, Action, Adventure, Science Fiction",110,68,Daiki YamashitaIzuku Midoriya (voice) Nob...
75,Female Teacher: In Front of the Students (1982),"Drama, Mystery, Horror",69,66,Rushia SantôReiko / Teacher Rina OkaKumi ...


In [3404]:
len(df_filtered)

77

# Genre - One Hot Encoding 

In [3402]:

pd.get_dummies(df_filtered, columns=['Genres'])

Unnamed: 0,Movie Name,Time,Rating,Top Cast,"Genres_Action, Adventure, Animation, Fantasy","Genres_Action, Adventure, Drama","Genres_Action, Adventure, Fantasy, Animation","Genres_Action, Adventure, History","Genres_Action, Adventure, Thriller","Genres_Action, Comedy, Crime, Thriller, Adventure",...,Genres_Music,Genres_Romance,"Genres_Romance, Drama","Genres_Science Fiction, Adventure, Action","Genres_Science Fiction, Horror, Thriller","Genres_Science Fiction, Thriller, Horror","Genres_TV Movie, Family, Comedy, Horror, Animation","Genres_Thriller, Adventure, Action, Science Fiction","Genres_Thriller, Horror, Crime, Mystery",Genres_War
0,Venom: The Last Dance (2024),109,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Moana 2 (2024),100,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Elevation (2024),91,65,एंथनी मैकीWill Morena BaccarinNina M...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Absolution (2024),112,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Red One (2024),124,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,F Marry Kill (2024),97,67,Lucy HaleEva Vaugh Virginia GardnerKelly ...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
90,Beetlejuice Beetlejuice (2024),105,71,Michael KeatonBeetlejuice विनोना रायडरLyd...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
91,My Hero Academia: You're Next (2024),110,68,Daiki YamashitaIzuku Midoriya (voice) Nob...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
93,Female Teacher: In Front of the Students (1982),69,66,Rushia SantôReiko / Teacher Rina OkaKumi ...,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [3403]:
# Contains ONLY Genres (For One - Hot Encoding)

df_genres = df_filtered['Genres']

In [3405]:
len(df_genres)

77

In [3406]:
# Printing 

df_genres

0      Action, Science Fiction, Adventure, Thriller
1              Animation, Adventure, Family, Comedy
2                 Action, Science Fiction, Thriller
3                           Action, Crime, Thriller
4                           Action, Comedy, Fantasy
                          ...                      
87                                 Comedy, Thriller
90                          Comedy, Fantasy, Horror
91    Animation, Action, Adventure, Science Fiction
93                           Drama, Mystery, Horror
94                                   Romance, Drama
Name: Genres, Length: 77, dtype: object

In [3407]:
# Extracting All Unique Genres

unique_genres = []

for genres in df_genres :
    genres = genres.split(",")
    for j in genres :
        j = j.replace("\xa0", '')
        if j not in unique_genres :
            unique_genres.append(j)
        else :
            continue 

In [3408]:
unique_genres

['Action',
 'Science Fiction',
 'Adventure',
 'Thriller',
 'Animation',
 'Family',
 'Comedy',
 'Crime',
 'Fantasy',
 'Mystery',
 'History',
 'Drama',
 'Romance',
 'TV Movie',
 'Horror',
 'War',
 'Music']

In [3409]:
# Converting it to Data Frame

temp_genres = df_genres.to_frame()
temp_genres

Unnamed: 0,Genres
0,"Action, Science Fiction, Adventure, Thriller"
1,"Animation, Adventure, Family, Comedy"
2,"Action, Science Fiction, Thriller"
3,"Action, Crime, Thriller"
4,"Action, Comedy, Fantasy"
...,...
87,"Comedy, Thriller"
90,"Comedy, Fantasy, Horror"
91,"Animation, Action, Adventure, Science Fiction"
93,"Drama, Mystery, Horror"


In [3410]:
# Inserting All Unique Genres to Data Frame

for i in unique_genres :
    temp_genres.insert(0, i, 0)

In [3411]:
# Printing

temp_genres

Unnamed: 0,Music,War,Horror,TV Movie,Romance,Drama,History,Mystery,Fantasy,Crime,Comedy,Family,Animation,Thriller,Adventure,Science Fiction,Action,Genres
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Action, Science Fiction, Adventure, Thriller"
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Animation, Adventure, Family, Comedy"
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Action, Science Fiction, Thriller"
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Action, Crime, Thriller"
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Action, Comedy, Fantasy"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Comedy, Thriller"
90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Comedy, Fantasy, Horror"
91,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Animation, Action, Adventure, Science Fiction"
93,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Drama, Mystery, Horror"


In [3412]:
temp_genres.isna().sum() # Check the missing values in each column

Music              0
War                0
Horror             0
TV Movie           0
Romance            0
Drama              0
History            0
Mystery            0
Fantasy            0
Crime              0
Comedy             0
Family             0
Animation          0
Thriller           0
Adventure          0
Science Fiction    0
Action             0
Genres             0
dtype: int64

28/12/24 

In [3417]:
len(temp_genres)

77

In [3418]:
# Dont need to do it it , already did this in df_filtered

# temp_genres = temp_genres.reset_index(drop=True)

In [3419]:
# Assigning 1 to the Genre which Exists

for i in range(len(temp_genres)):
    # Get the genres for the current movie
    movie_genres = temp_genres["Genres"].iloc[i].split(",")
    # Check each genre and update its column to 1
    for genre in movie_genres:
        genre = genre.strip()  # Remove spaces
        temp_genres.at[i, genre] = 1
        
len(temp_genres)

77

In [3421]:
temp_genres

Unnamed: 0,Music,War,Horror,TV Movie,Romance,Drama,History,Mystery,Fantasy,Crime,Comedy,Family,Animation,Thriller,Adventure,Science Fiction,Action,Genres
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,"Action, Science Fiction, Adventure, Thriller"
1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,"Animation, Adventure, Family, Comedy"
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,"Action, Science Fiction, Thriller"
3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,"Action, Crime, Thriller"
4,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,"Action, Comedy, Fantasy"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,"Comedy, Thriller"
73,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,"Comedy, Fantasy, Horror"
74,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,"Animation, Action, Adventure, Science Fiction"
75,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,"Drama, Mystery, Horror"


In [3469]:
temp_genres = temp_genres.drop(columns=['Genres'])

In [3470]:
temp_genres

Unnamed: 0,Music,War,Horror,TV Movie,Romance,Drama,History,Mystery,Fantasy,Crime,Comedy,Family,Animation,Thriller,Adventure,Science Fiction,Action
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
4,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
73,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0
74,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1
75,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [3422]:
temp_genres.isna().sum() # Check the missing values in each column

Music              0
War                0
Horror             0
TV Movie           0
Romance            0
Drama              0
History            0
Mystery            0
Fantasy            0
Crime              0
Comedy             0
Family             0
Animation          0
Thriller           0
Adventure          0
Science Fiction    0
Action             0
Genres             0
dtype: int64

In [3423]:
temp_genres.shape  # This will show the number of rows and columns.

(77, 18)

In [3424]:
temp_genres.duplicated().sum()  # It will show how many duplicate rows exist.

15

In [3425]:
temp_genres.head()  # Check the first few rows

Unnamed: 0,Music,War,Horror,TV Movie,Romance,Drama,History,Mystery,Fantasy,Crime,Comedy,Family,Animation,Thriller,Adventure,Science Fiction,Action,Genres
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,"Action, Science Fiction, Adventure, Thriller"
1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,"Animation, Adventure, Family, Comedy"
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,"Action, Science Fiction, Thriller"
3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,"Action, Crime, Thriller"
4,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,"Action, Comedy, Fantasy"


In [3426]:
temp_genres.tail()  # Check the last few rows

Unnamed: 0,Music,War,Horror,TV Movie,Romance,Drama,History,Mystery,Fantasy,Crime,Comedy,Family,Animation,Thriller,Adventure,Science Fiction,Action,Genres
72,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,"Comedy, Thriller"
73,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,"Comedy, Fantasy, Horror"
74,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,"Animation, Action, Adventure, Science Fiction"
75,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,"Drama, Mystery, Horror"
76,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,"Romance, Drama"


In [3472]:
df_main = pd.concat([df_filtered, temp_genres], axis=1)

In [3473]:
df_main

Unnamed: 0,Movie Name,Genres,Time,Rating,Top Cast,Music,War,Horror,TV Movie,Romance,...,Mystery,Fantasy,Crime,Comedy,Family,Animation,Thriller,Adventure,Science Fiction,Action
0,Venom: The Last Dance (2024),"Action, Science Fiction, Adventure, Thriller",109,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
1,Moana 2 (2024),"Animation, Adventure, Family, Comedy",100,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...,0,0,0,0,0,...,0,0,0,1,1,1,0,1,0,0
2,Elevation (2024),"Action, Science Fiction, Thriller",91,65,एंथनी मैकीWill Morena BaccarinNina M...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
3,Absolution (2024),"Action, Crime, Thriller",112,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
4,Red One (2024),"Action, Comedy, Fantasy",124,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,F Marry Kill (2024),"Comedy, Thriller",97,67,Lucy HaleEva Vaugh Virginia GardnerKelly ...,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
73,Beetlejuice Beetlejuice (2024),"Comedy, Fantasy, Horror",105,71,Michael KeatonBeetlejuice विनोना रायडरLyd...,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
74,My Hero Academia: You're Next (2024),"Animation, Action, Adventure, Science Fiction",110,68,Daiki YamashitaIzuku Midoriya (voice) Nob...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1
75,Female Teacher: In Front of the Students (1982),"Drama, Mystery, Horror",69,66,Rushia SantôReiko / Teacher Rina OkaKumi ...,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [3474]:
df_main.to_csv('main_cleaned_movies.csv', index=False)

Visualization

Have to Work on it

In [3427]:

# # Count the number of movies in each genre
# genre_counts = genres.sum().sort_values(ascending=False)

# # Plot the distribution of genres
# plt.figure(figsize=(10, 6))
# genre_counts.plot(kind='bar', color='skyblue')
# plt.title('Distribution of Movie Genres')
# plt.xlabel('Genres')
# plt.ylabel('Number of Movies')
# plt.xticks(rotation=90)
# plt.show()


In [3428]:
# # Assuming 'Rating' is the column for ratings
# df_genre_ratings = pd.concat([df['Rating'], genres], axis=1)

# # Reshape data to long format for each genre
# df_genre_ratings_long = df_genre_ratings.melt(id_vars=['Rating'], var_name='Genre', value_name='Has_Genre')

# # Filter only the rows where a genre is present (Has_Genre is 1)
# df_genre_ratings_long = df_genre_ratings_long[df_genre_ratings_long['Has_Genre'] == 1]

# # Create a box plot for ratings grouped by genre
# plt.figure(figsize=(12, 8))
# sns.boxplot(data=df_genre_ratings_long, x='Genre', y='Rating', palette='Set2')
# plt.title('Average Ratings by Genre')
# plt.xlabel('Genre')
# plt.ylabel('Ratings')
# plt.xticks(rotation=45)
# plt.show()


In [3429]:
# # Sort movies by rating and select top 10
# top_movies = df[['Movie Name', 'Rating']].sort_values(by='Rating', ascending=False).head(10)

# # Plot the top 10 movies by rating
# plt.figure(figsize=(10, 6))
# plt.barh(top_movies['Movie Name'], top_movies['Rating'], color='lightgreen')
# plt.xlabel('Ratings')
# plt.title('Top 10 Movies by Rating')
# plt.gca().invert_yaxis()
# plt.show()


In [3430]:
# # Calculate correlation matrix between genres and ratings
# genre_columns = ['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama', 'Family', 'Fantasy', 
#                  'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War']
# genre_corr = df[genre_columns].corr()

# # Plot heatmap of correlations
# plt.figure(figsize=(12, 8))
# sns.heatmap(genre_corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
# plt.title('Correlation Heatmap of Genres')
# plt.show()


In [3431]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Sample data (replace with your actual dataframe)
# data = pd.read_csv('../scraping/allmovies.csv')
# df = pd.DataFrame(data)

# # Function to clean and convert 'Time' column to hours
# def convert_time_to_hours(time_str):
#     # Initialize hours and minutes
#     hours = 0
#     minutes = 0
    
#     # Extract hours and minutes using regex
#     hour_match = re.search(r'(\d+)\s*h', time_str)
#     minute_match = re.search(r'(\d+)\s*m', time_str)
    
#     if hour_match:
#         hours = int(hour_match.group(1))
    
#     if minute_match:
#         minutes = int(minute_match.group(1))
    
#     # Return total time in hours
#     return hours + minutes / 60

# # Apply the conversion function to the 'Time' column
# df['Time_in_hours'] = df['Time'].apply(convert_time_to_hours)

# # Check the conversion result
# print(df[['Time', 'Time_in_hours']])

# # Define time bins and labels
# bins = [0, 1, 1.5, 2, 2.5, 3, 5]
# labels = ['0-1 hrs', '1-1.5 hrs', '1.5-2 hrs', '2-2.5 hrs', '2.5-3 hrs', '3+ hrs']
# df['Time_bins'] = pd.cut(df['Time_in_hours'], bins=bins, labels=labels, right=False)

# # Check the binning result
# print(df[['Time_in_hours', 'Time_bins']])

# # Plot the distribution of ratings by movie duration (in the time bins)
# plt.figure(figsize=(12, 8))
# sns.boxplot(data=df, x='Time_bins', y='Rating', palette='coolwarm')
# plt.title('Movie Ratings vs. Duration (Grouped by Time Intervals)')
# plt.xlabel('Movie Duration (Time Interval)')
# plt.ylabel('Rating')
# plt.xticks(rotation=45)
# plt.show()


In [3432]:
# # Plotting a scatter plot of Movie Ratings vs. Movie Duration
# plt.figure(figsize=(12, 8))
# sns.scatterplot(data=df, x='Time_in_hours', y='Rating', hue='Rating', palette='coolwarm', s=100, edgecolor='black')
# plt.title('Movie Ratings vs. Duration')
# plt.xlabel('Movie Duration (Time in Hours)')
# plt.ylabel('Rating')
# plt.show()


In [3433]:
print(df.columns)

Index(['Movie Name', 'Genres', 'Time', 'Rating', 'Top Cast', 'Is_English'], dtype='object')


## Above : Everything Feels right (I Assume)

What I did ?

- Data Cleaning
  - Divided Genres into subfield - action, comedy, crime, thriller, etc
  - Converted Time into Time-Frames (0.5 - 1.0, 1.0 - 1.5, and so on)
- Some Visualizations

# Below : tried but wrong 

In [3434]:
# # Apply the function and filter rows
# df["Is_English"] = df["Top Cast"].apply(is_english)
# df_cast_filtered = df[df["Is_English"]]  # Keep only rows where Is_English is True

# # Drop the Is_English column
# df_cast_filtered = df_cast_filtered.drop(columns=["Is_English"])

# print("Filtered Dataset:")
# print(df_cast_filtered)

In [3435]:

# # 2. Clean the 'Top Cast' column
# def clean_top_cast(top_cast):
#     if isinstance(top_cast, str):  # Check if the value is a string
#         # Split by spaces to separate cast names
#         actors = []
#         for actor in top_cast.split(' '):
#             if len(actor) > 1:  # Include only valid names
#                 actors.append(actor.strip())
#         return ', '.join(actors)
#     return top_cast

# df['Top Cast'] = df['Top Cast'].apply(clean_top_cast)

# df['Top Cast']

In [3436]:

# Sample data (you can replace this with your actual dataset)
# data = {
#     'Movie Name': ['Venom: The Last Dance\r\n(2024)', 'Moana 2\r\n(2024)', 'Elevation\r\n(2024)', 'Absolution\r\n(2024)', 'Red One\r\n(2024)'],
#     'Time': [109.0, 100.0, 91.0, 112.0, 124.0],
#     'Rating': [68.0, 69.0, 65.0, 61.0, 70.0],
#     'Top Cast': ['टॉम हार्डीEddie Brock / Venom Chiwetel Ej...', 'Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...', 
#                  'एंथनी मैकीWill Morena BaccarinNina M...', 'लियाम नीसनThug Ron PerlmanCharlie Conner ...', 
#                  'ड्वेन जॉनसनCallum Drift Chris EvansJack O...']
# }

# data = pd.read_csv('../scraping/allmovies.csv')

# df = pd.DataFrame(data)

# # 1. Remove newline characters from 'Movie Name'
# df['Movie Name'] = df['Movie Name'].str.replace(r'\r\n', ' ', regex=True)

# # 2. Clean the 'Top Cast' column: Separate cast names correctly
# def clean_top_cast(top_cast):
#     if isinstance(top_cast, str):  # Check if the value is a string
#         # Split by known patterns (e.g., names in Hindi or English followed by roles or positions)
#         actors = []
#         for actor in top_cast.split(' '):
#             # Assuming a mix of Hindi and English actor names (basic rule: check for names without roles)
#             if len(actor) > 1:  # basic check for actor name length (can be adjusted)
#                 actors.append(actor.strip())
#         return ', '.join(actors)
#     return top_cast  # Return as is if it's not a string

# df['Top Cast'] = df['Top Cast'].apply(clean_top_cast)

# # 3. Clean the 'Time' column: Ensure it only contains numbers (no NaN or strings)
# df['Time'] = pd.to_numeric(df['Time'], errors='coerce')

# # Display cleaned dataset
# df.head()


In [3437]:
# import pandas as pd

# # Sample data (you can replace this with your actual dataset)
# data = pd.read_csv('../scraping/allmovies.csv')

# # Load data into a pandas DataFrame
# df = pd.DataFrame(data)

# # Clean the 'Genres' column by splitting and creating dummy variables for each genre
# df['Genres'] = df['Genres'].str.split(',').apply(lambda x: [genre.strip() for genre in x])

# # One-hot encoding for 'Genres'
# genre_dummies = df['Genres'].apply(pd.Series).stack().str.get_dummies().sum()

# # Combine back with the original dataset
# df = pd.concat([df.drop(columns='Genres'), genre_dummies], axis=1)

# # Clean the 'Time' column: Convert '1h 49m' to total minutes (e.g., 109 minutes)
# def convert_time_to_minutes(time_str):
#     if isinstance(time_str, str):  # Check if the time is a string
#         hours = 0
#         minutes = 0
#         time_str = time_str.replace(' ', '')  # Remove any spaces
#         if 'h' in time_str:
#             hours = int(time_str.split('h')[0])
#         if 'm' in time_str:
#             minutes = int(time_str.split('m')[0].split('h')[-1])
#         return hours * 60 + minutes
#     else:
#         return None  # Return None for non-string values

# # Apply the conversion function to the 'Time' column
# df['Time'] = df['Time'].apply(convert_time_to_minutes)

# # Clean the 'Top Cast' column: Ensure it has proper spacing and format
# def clean_top_cast(top_cast):
#     if isinstance(top_cast, str):  # Check if the value is a string
#         return ', '.join([actor.strip() for actor in top_cast.split(',')])
#     return top_cast  # Return as is if it's not a string

# df['Top Cast'] = df['Top Cast'].apply(clean_top_cast)

# # Display cleaned dataset
# df.head()


In [3438]:
# # Plot distribution of ratings by movie duration (Time)
# plt.figure(figsize=(12, 8))
# sns.scatterplot(data=df, x='Time', y='Rating', hue='Rating', palette='coolwarm', s=100, edgecolor='black')
# plt.title('Movie Ratings vs. Duration')
# plt.xlabel('Movie Duration (Time)')
# plt.ylabel('Rating')
# plt.show()

In [3439]:
# dataset = pd.read_csv('../scraping/allmovies.csv', delimiter=';', encoding="utf-8")
# print(dataset.head())

In [3440]:
# from io import StringIO

# df = pd.read_csv(StringIO('../scraping/allmovies.csv'), sep=",")

In [3441]:
# print(df.head())  # Display first few rows

In [3442]:
# # Ensure there are no trailing or leading spaces
# df.columns = df.columns.str.strip()

# # Split the data into columns if it's concatenated
# df[['Movie Name', 'Genres', 'Time', 'Rating', 'Top Cast']] = df['Movie Name,Genres,Time,Rating,Top Cast'].str.split(',', n=4, expand=True)

# # Drop the old concatenated column
# df.drop('Movie Name,Genres,Time,Rating,Top Cast', axis=1, inplace=True)

# # Preview the cleaned dataset
# df.head()

In [3443]:
# df = df.applymap(lambda x: x.strip().replace("\r\n", " ") if isinstance(x, str) else x)
# df.head()

In [3444]:
# import pandas as pd

# # Load the dataset (replace 'filename.csv' with your actual file path)
# df = pd.read_csv('../scraping/allmovies.csv', encoding='utf-8', header=None, names=['Movie_Data'])

# # Split the single column into multiple columns
# df_split = df['Movie_Data'].str.split(',', n=4, expand=True)
# df_split.columns = ['Movie Name', 'Genres', 'Time', 'Rating', 'Top Cast']

# # Remove newlines and trim spaces
# for col in df_split.columns:
#     df_split[col] = df_split[col].str.replace(r'\r\n', '', regex=True).str.strip()

# # Convert columns to appropriate data types (e.g., Rating to numeric)
# df_split['Rating'] = pd.to_numeric(df_split['Rating'], errors='coerce')

# # Preview the cleaned data
# print(df_split.head())

# # Save the cleaned data to a new CSV (optional)
# df_split.to_csv('cleaned_movies_dataset.csv', index=False)


In [3445]:
# import pandas as pd

# # Load the dataset (replace with your actual file path)
# df = pd.read_csv('../scraping/allmovies.csv', encoding='utf-8', header=None, names=['Movie_Data'])

# # Split the single column into multiple columns
# df_split = df['Movie_Data'].str.split(',', n=4, expand=True)
# df_split.columns = ['Movie Name', 'Genres', 'Time', 'Rating', 'Top Cast']

# # Remove newlines and trim spaces
# for col in df_split.columns:
#     df_split[col] = df_split[col].str.replace(r'\r\n', '', regex=True).str.strip()

# # Convert 'Rating' to numeric
# df_split['Rating'] = pd.to_numeric(df_split['Rating'], errors='coerce')

# # Handle 'Time' (convert to minutes if in "1h 40m" format)
# def convert_time_to_minutes(time_str):
#     if isinstance(time_str, str):
#         hours, minutes = 0, 0
#         if 'h' in time_str:
#             parts = time_str.split('h')
#             hours = int(parts[0].strip())
#             if 'm' in parts[1]:
#                 minutes = int(parts[1].strip('m').strip())
#         elif 'm' in time_str:
#             minutes = int(time_str.strip('m').strip())
#         return hours * 60 + minutes
#     return None

# df_split['Time'] = df_split['Time'].apply(convert_time_to_minutes)

# # Handle missing values (optional)
# df_split.fillna({'Rating': df_split['Rating'].mean(), 'Time': df_split['Time'].mean()}, inplace=True)

# # Ensure column names are as desired
# df_split = df_split[['Movie Name', 'Genres', 'Time', 'Rating', 'Top Cast']]

# # Preview the cleaned dataset
# print(df_split.head())

# # Save the cleaned data to a new CSV (optional)
# df_split.to_csv('cleaned_movies_dataset.csv', index=False)


In [3446]:
# import pandas as pd

# # Load the dataset (replace 'filename.csv' with your actual file path)
# df = pd.read_csv('../scraping/allmovies.csv', encoding='utf-8', header=None, names=['Movie_Data'])

# # Split the single column into multiple columns
# df_split = df['Movie_Data'].str.split(',', n=4, expand=True)
# df_split.columns = ['Movie Name', 'Genres', 'Time', 'Rating', 'Top Cast']

# # Remove newlines and trim spaces from all columns
# for col in df_split.columns:
#     df_split[col] = df_split[col].str.replace(r'\r\n', '', regex=True).str.strip()

# # Convert 'Rating' to numeric (since it's currently a string)
# df_split['Rating'] = pd.to_numeric(df_split['Rating'], errors='coerce')

# # Now, run describe to get the statistical summary
# print(df_split.describe(include='all'))  # 'include=all' will give you both numeric and object summary


In [3447]:
# import pandas as pd

# # Load the dataset
# df = pd.read_csv('../scraping/allmovies.csv', encoding='utf-8', header=None, names=['Movie_Data'])

# # Check the first few rows of the dataset to understand its structure
# print(df.head())

# # Split the 'Movie_Data' column into multiple columns
# df_split = df['Movie_Data'].str.split(',', n=4, expand=True)

# # Check the first few rows of the split data
# print(df_split.head())

# # Assign appropriate column names
# df_split.columns = ['Movie Name', 'Genres', 'Time', 'Rating', 'Top Cast']

# # Check the columns and data of df_split
# print("Columns in df_split:", df_split.columns)
# print(df_split.head())

# # Remove newlines and trim spaces from all columns
# for col in df_split.columns:
#     df_split[col] = df_split[col].str.replace(r'\r\n', '', regex=True).str.strip()

# # Convert 'Rating' to numeric
# df_split['Rating'] = pd.to_numeric(df_split['Rating'], errors='coerce')

# # Check the description of the cleaned DataFrame
# print(df_split.describe(include='all'))


In [3448]:
# import pandas as pd

# # Load the dataset
# df = pd.read_csv('../scraping/allmovies.csv')

# # Remove extra spaces in column names
# df.columns = df.columns.str.strip()

# # Preprocessing 'Genres' column - ensuring no extra spaces and splitting by commas
# df['Genres'] = df['Genres'].apply(lambda x: ', '.join(sorted(set(x.split(',')))) if isinstance(x, str) else x)

# # Clean 'Time' column to remove extra spaces and unwanted characters (like 'h' and 'm')
# df['Time'] = df['Time'].str.replace('h', ' hours').str.replace('m', ' minutes').str.strip()

# # Clean 'Rating' column - converting to numeric and handling any potential errors
# df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# # Remove leading/trailing spaces in 'Top Cast'
# df['Top Cast'] = df['Top Cast'].apply(lambda x: ' '.join([name.strip() for name in str(x).split() if name.strip()]))

# # Apply pd.get_dummies to the 'Genres' column, drop the first to avoid multicollinearity
# df = pd.get_dummies(df, columns=['Genres'], drop_first=True)

# # Save the cleaned and processed data to a new CSV file
# df.to_csv('cleaned_movies.csv', index=False)

# # Show the cleaned dataset
# df.head()  # Display the first few rows to check the changes


In [3449]:
# print(df.columns)

In [3450]:
# print(df_split.columns)

In [3451]:
# print(df_split.info())  # Get column data types and missing values

In [3452]:
# df_split.isnull().sum()

In [3453]:
# df_split.describe()

In [3454]:
# df_split.duplicated().sum()

In [3455]:
# dataset = pd.get_dummies(df_split, drop_first=True).astype(str,float)
# dataset.head()

In [3456]:
# dataset = pd.get_dummies(df_split, columns=['Genres'], drop_first=True)
# df_split.head()

In [3457]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Set up the plot style
# sns.set(style="whitegrid")

# # Plot the distribution of movie ratings
# plt.figure(figsize=(8, 6))
# sns.histplot(df_split['Rating'], bins=10, kde=True, color='skyblue')
# plt.title('Distribution of Movie Ratings')
# plt.xlabel('Rating')
# plt.ylabel('Frequency')
# plt.show()


In [3458]:
# # Extract the genres and split them into separate genres
# df_split['Genres'] = df_split['Genres'].str.split(',').apply(lambda x: [genre.strip() for genre in x])

# # Unnest the genres and count the frequency
# genre_list = [genre for sublist in df_split['Genres'] for genre in sublist]
# genre_counts = pd.Series(genre_list).value_counts()

# # Plot the top genres
# plt.figure(figsize=(10, 6))
# genre_counts.head(10).plot(kind='bar', color='salmon')
# plt.title('Top 10 Genres in the Movie Dataset')
# plt.xlabel('Genre')
# plt.ylabel('Frequency')
# plt.xticks(rotation=45)
# plt.show()


In [3459]:
# # Convert time to minutes
# def time_to_minutes(time_str):
#     if 'h' in time_str and 'm' in time_str:
#         hours, minutes = time_str.split('h')
#         minutes = minutes.replace('m', '').strip()
#         return int(hours) * 60 + int(minutes)
#     elif 'h' in time_str:
#         return int(time_str.replace('h', '').strip()) * 60
#     elif 'm' in time_str:
#         return int(time_str.replace('m', '').strip())
#     else:
#         return None

# df_split['Duration (minutes)'] = df_split['Time'].apply(time_to_minutes)

# # Plot movie duration vs rating
# plt.figure(figsize=(10, 6))
# sns.scatterplot(x='Duration (minutes)', y='Rating', data=df_split, color='purple')
# plt.title('Movie Duration vs Rating')
# plt.xlabel('Duration (Minutes)')
# plt.ylabel('Rating')
# plt.show()


In [3460]:
# from wordcloud import WordCloud

# # Join all the top cast names into a single string
# top_cast_str = ' '.join(df_split['Top Cast'].dropna())

# # Generate the word cloud
# wordcloud = WordCloud(width=800, height=400, background_color='white').generate(top_cast_str)

# # Plot the word cloud
# plt.figure(figsize=(10, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.title('Most Frequent Top Cast')
# plt.show()


In [3461]:
# # Sort movies by rating
# top_rated_movies = df_split.sort_values('Rating', ascending=False).head(10)

# # Plot the top 10 movies by rating
# plt.figure(figsize=(10, 6))
# sns.barplot(x='Rating', y='Movie Name', data=top_rated_movies, palette='coolwarm')
# plt.title('Top 10 Movies by Rating')
# plt.xlabel('Rating')
# plt.ylabel('Movie Name')
# plt.show()


In [3462]:
# # Calculate the average rating by movie duration
# avg_rating_by_duration = df_split.groupby('Duration (minutes)')['Rating'].mean()

# # Plot average rating by duration
# plt.figure(figsize=(12, 6))
# avg_rating_by_duration.plot(kind='line', color='orange')
# plt.title('Average Rating by Movie Duration')
# plt.xlabel('Duration (Minutes)')
# plt.ylabel('Average Rating')
# plt.show()
