In [1]:
import pandas as pd

In [2]:
# Read the scraped movie listing into a DataFrame,
# then show it as the cell output for a first look.

SCRAPED_CSV_PATH = '../scraping/allmovies.csv'
df = pd.read_csv(SCRAPED_CSV_PATH)
df

Unnamed: 0,Movie Name,Genres,Time,Rating,Top Cast
0,Venom: The Last Dance\r\n(2024),"Action, Science Fiction, Adventure, Thriller",1h 49m,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...
1,Moana 2\r\n(2024),"Animation, Adventure, Family, Comedy",1h 40m,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...
2,Elevation\r\n(2024),"Action, Science Fiction, Thriller",1h 31m,65,एंथनी मैकीWill Morena BaccarinNina M...
3,Absolution\r\n(2024),"Action, Crime, Thriller",1h 52m,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...
4,Red One\r\n(2024),"Action, Comedy, Fantasy",2h 4m,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...
...,...,...,...,...,...
90,Beetlejuice Beetlejuice\r\n(2024),"Comedy, Fantasy, Horror",1h 45m,71,Michael KeatonBeetlejuice विनोना रायडरLyd...
91,My Hero Academia: You're Next\r\n(2024),"Animation, Action, Adventure, Science Fiction",1h 50m,68,Daiki YamashitaIzuku Midoriya (voice) Nob...
92,स्पाइडर-मैन: एक्रोस दा स्पाईडर-वर्स\r\n(2023),"Animation, Action, Adventure, Science Fiction",2h 20m,84,Shameik MooreMiles Morales (voice) हैली स...
93,Female Teacher: In Front of the Students\r\n(1...,"Drama, Mystery, Horror",1h 9m,66,Rushia SantôReiko / Teacher Rina OkaKumi ...


In [3]:
# summarise the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Movie Name  95 non-null     object
 1   Genres      95 non-null     object
 2   Time        95 non-null     object
 3   Rating      95 non-null     int64 
 4   Top Cast    95 non-null     object
dtypes: int64(1), object(4)
memory usage: 3.8+ KB


## Preprocessing the data for better model building

#### Movie Name
- remove `\r\n`
- drop movies whose names are not in English

#### Time
- it will be easier to work with if all the values are in the same format
- transform all time values to minutes

#### Genres
- the genres are in the form of `genre1,genre2,genre3`
- one hot encode it, so that each genre is a column
- will help in model building

#### Top Cast
- PENDING

----

#### Preprocessing the `Movie Name` column
- replacing `\r\n` with ` `
- dropping non-English movie names

In [4]:
# The scraped titles carry a '\r\n' line break between the name and the year;
# swap it for a single space (plain substring replacement, not a regex).
movie_names = df['Movie Name']
df['Movie Name'] = movie_names.str.replace('\r\n', ' ', regex=False)
df

Unnamed: 0,Movie Name,Genres,Time,Rating,Top Cast
0,Venom: The Last Dance (2024),"Action, Science Fiction, Adventure, Thriller",1h 49m,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...
1,Moana 2 (2024),"Animation, Adventure, Family, Comedy",1h 40m,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...
2,Elevation (2024),"Action, Science Fiction, Thriller",1h 31m,65,एंथनी मैकीWill Morena BaccarinNina M...
3,Absolution (2024),"Action, Crime, Thriller",1h 52m,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...
4,Red One (2024),"Action, Comedy, Fantasy",2h 4m,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...
...,...,...,...,...,...
90,Beetlejuice Beetlejuice (2024),"Comedy, Fantasy, Horror",1h 45m,71,Michael KeatonBeetlejuice विनोना रायडरLyd...
91,My Hero Academia: You're Next (2024),"Animation, Action, Adventure, Science Fiction",1h 50m,68,Daiki YamashitaIzuku Midoriya (voice) Nob...
92,स्पाइडर-मैन: एक्रोस दा स्पाईडर-वर्स (2023),"Animation, Action, Adventure, Science Fiction",2h 20m,84,Shameik MooreMiles Morales (voice) हैली स...
93,Female Teacher: In Front of the Students (1982),"Drama, Mystery, Horror",1h 9m,66,Rushia SantôReiko / Teacher Rina OkaKumi ...


In [5]:
# dropping every movie whose name contains a Hindi letter

def is_english(string):
    """Return True when every character of ``string`` has a code point <= 255.

    Code points 0-255 are the Latin-1 range (plain ASCII is only 0-127), so
    accented Latin letters are accepted, while Hindi (Devanagari) letters,
    whose code points lie far above 255, make the result False.
    An empty string yields True.
    """
    return all(ord(char) <= 255 for char in string)


# Boolean Series aligned with df: True where the title passes is_english,
# False where it contains a character outside the 0-255 range.
english_mask = df['Movie Name'].apply(is_english)

shape_before = df.shape
print("Before applying mask", shape_before)
# keep only the rows flagged True
df = df.loc[english_mask]
print("After applying mask", df.shape)

df

Before applying mask (95, 5)
After applying mask (78, 5)


Unnamed: 0,Movie Name,Genres,Time,Rating,Top Cast
0,Venom: The Last Dance (2024),"Action, Science Fiction, Adventure, Thriller",1h 49m,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...
1,Moana 2 (2024),"Animation, Adventure, Family, Comedy",1h 40m,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...
2,Elevation (2024),"Action, Science Fiction, Thriller",1h 31m,65,एंथनी मैकीWill Morena BaccarinNina M...
3,Absolution (2024),"Action, Crime, Thriller",1h 52m,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...
4,Red One (2024),"Action, Comedy, Fantasy",2h 4m,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...
...,...,...,...,...,...
87,F Marry Kill (2024),"Comedy, Thriller",1h 37m,67,Lucy HaleEva Vaugh Virginia GardnerKelly ...
90,Beetlejuice Beetlejuice (2024),"Comedy, Fantasy, Horror",1h 45m,71,Michael KeatonBeetlejuice विनोना रायडरLyd...
91,My Hero Academia: You're Next (2024),"Animation, Action, Adventure, Science Fiction",1h 50m,68,Daiki YamashitaIzuku Midoriya (voice) Nob...
93,Female Teacher: In Front of the Students (1982),"Drama, Mystery, Horror",1h 9m,66,Rushia SantôReiko / Teacher Rina OkaKumi ...


In [6]:
# we just dropped some rows, so the index now has gaps
# reset it to a fresh 0..n-1 range (drop=True discards the old index), so it's easier to work with
df = df.reset_index(drop=True)

#### Preprocessing the `Time` column
- transforming the time column to a numerical format
- uniform format

In [7]:
# converting time to minutes

def convert(time_str):
    """Convert a runtime such as '1h 49m' into a total number of minutes.

    Accepted text forms are '<H>h <M>m', '<M>m' and '<H>h'; any whitespace
    is ignored. A value that is already an integer is returned unchanged, so
    applying the conversion again to an already-converted column is a no-op
    rather than an AttributeError.

    Parameters
    ----------
    time_str : str or int
        Runtime text, or an integer number of minutes.

    Returns
    -------
    int
        Total number of minutes.

    Raises
    ------
    ValueError
        If the text is empty or its hour/minute parts are not integers.
    TypeError
        If the value is neither text nor an integer (e.g. a float NaN).
    """
    if not isinstance(time_str, str):
        # Integer-like values (Python or numpy ints) expose __index__;
        # floats do not. bool is excluded: it is never a valid runtime.
        if isinstance(time_str, bool) or not hasattr(time_str, '__index__'):
            raise TypeError(f"cannot convert {time_str!r} to minutes")
        return int(time_str)

    # '1h 10m' -> '1h10', '20m' -> '20', '1h' -> '1h'
    compact = ''.join(time_str.split()).replace('m', '')
    if not compact:
        raise ValueError(f"empty runtime string: {time_str!r}")

    hours_part, has_hours, minutes_part = compact.partition('h')
    if not has_hours:
        # no 'h' present: the whole string is the minutes
        return int(compact)

    # the minutes are optional after the hours (e.g. '1h')
    minutes = int(minutes_part) if minutes_part else 0
    return int(hours_part) * 60 + minutes


# overwrite the Time column with convert() applied to each of its values
df['Time'] = df['Time'].apply(convert)
df

Unnamed: 0,Movie Name,Genres,Time,Rating,Top Cast
0,Venom: The Last Dance (2024),"Action, Science Fiction, Adventure, Thriller",109,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...
1,Moana 2 (2024),"Animation, Adventure, Family, Comedy",100,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...
2,Elevation (2024),"Action, Science Fiction, Thriller",91,65,एंथनी मैकीWill Morena BaccarinNina M...
3,Absolution (2024),"Action, Crime, Thriller",112,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...
4,Red One (2024),"Action, Comedy, Fantasy",124,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...
...,...,...,...,...,...
73,F Marry Kill (2024),"Comedy, Thriller",97,67,Lucy HaleEva Vaugh Virginia GardnerKelly ...
74,Beetlejuice Beetlejuice (2024),"Comedy, Fantasy, Horror",105,71,Michael KeatonBeetlejuice विनोना रायडरLyd...
75,My Hero Academia: You're Next (2024),"Animation, Action, Adventure, Science Fiction",110,68,Daiki YamashitaIzuku Midoriya (voice) Nob...
76,Female Teacher: In Front of the Students (1982),"Drama, Mystery, Horror",69,66,Rushia SantôReiko / Teacher Rina OkaKumi ...


#### Preprocessing the `Genres` column
- getting unique genres
- making a new df with each genre as a column
- filling in the new df from the original df's `Genres` values
- merging the two dfs

In [8]:
# getting unique genres

# dict keys keep insertion order and give an O(1) "seen before?" check,
# so the genres come out in order of first appearance without rescanning a list
seen_genres = {}

for genres in df['Genres']:
    # genres is a comma-separated string such as 'Action,Drama,Romance'
    for g in genres.split(','):
        # g may carry surrounding whitespace, e.g. '\xa0Action';
        # strip() removes it (including the non-breaking space '\xa0'),
        # which is the same cleaning used when the one-hot columns are filled,
        # so the names collected here match the names written there
        seen_genres.setdefault(g.strip(), None)

unique_genres = list(seen_genres)
unique_genres

['Action',
 'Science Fiction',
 'Adventure',
 'Thriller',
 'Animation',
 'Family',
 'Comedy',
 'Crime',
 'Fantasy',
 'Mystery',
 'History',
 'Drama',
 'Romance',
 'TV Movie',
 'Horror',
 'War',
 'Music']

In [9]:
# Start the one-hot frame from a copy of the 'Genres' column, then add one
# all-zero column per genre. Each new column is inserted at position 0, so the
# genre columns end up in the reverse of unique_genres' order, with 'Genres' last.
genres_df = df[['Genres']].copy()

for genre_name in unique_genres:
    genres_df.insert(loc=0, column=genre_name, value=0)

genres_df

Unnamed: 0,Music,War,Horror,TV Movie,Romance,Drama,History,Mystery,Fantasy,Crime,Comedy,Family,Animation,Thriller,Adventure,Science Fiction,Action,Genres
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Action, Science Fiction, Adventure, Thriller"
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Animation, Adventure, Family, Comedy"
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Action, Science Fiction, Thriller"
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Action, Crime, Thriller"
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Action, Comedy, Fantasy"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Comedy, Thriller"
74,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Comedy, Fantasy, Horror"
75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Animation, Action, Adventure, Science Fiction"
76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"Drama, Mystery, Horror"


In [10]:
# filling the genres df with proper values
# then dropping the old column

# Walk the 'Genres' column by index *label*, so the label handed to .at always
# names the row the text came from. Pairing a positional counter with the
# label-based .at is only correct while the index is exactly 0..n-1; on any
# other index it would write to the wrong row or silently append new rows.
for row_label, genres in genres_df['Genres'].items():
    # genres is of form "Action,Drama,Romance"
    for genre in genres.split(","):
        # strip() removes surrounding whitespace (incl. non-breaking spaces)
        genres_df.at[row_label, genre.strip()] = 1

genres_df = genres_df.drop(columns=['Genres'])
genres_df

Unnamed: 0,Music,War,Horror,TV Movie,Romance,Drama,History,Mystery,Fantasy,Crime,Comedy,Family,Animation,Thriller,Adventure,Science Fiction,Action
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
3,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
4,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
74,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0
75,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1
76,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [11]:
# Swap the raw 'Genres' text column for its one-hot encoding: remove it from
# df, then append the genre indicator columns side by side (aligned on index).
df_without_genres = df.drop(columns=['Genres'])
df = pd.concat([df_without_genres, genres_df], axis=1)
df

Unnamed: 0,Movie Name,Time,Rating,Top Cast,Music,War,Horror,TV Movie,Romance,Drama,...,Mystery,Fantasy,Crime,Comedy,Family,Animation,Thriller,Adventure,Science Fiction,Action
0,Venom: The Last Dance (2024),109,68,टॉम हार्डीEddie Brock / Venom Chiwetel Ej...,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
1,Moana 2 (2024),100,69,Auliʻi CravalhoMoana (voice) ड्वेन जॉनसनM...,0,0,0,0,0,0,...,0,0,0,1,1,1,0,1,0,0
2,Elevation (2024),91,65,एंथनी मैकीWill Morena BaccarinNina M...,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
3,Absolution (2024),112,61,लियाम नीसनThug Ron PerlmanCharlie Conner ...,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
4,Red One (2024),124,70,ड्वेन जॉनसनCallum Drift Chris EvansJack O...,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,F Marry Kill (2024),97,67,Lucy HaleEva Vaugh Virginia GardnerKelly ...,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
74,Beetlejuice Beetlejuice (2024),105,71,Michael KeatonBeetlejuice विनोना रायडरLyd...,0,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
75,My Hero Academia: You're Next (2024),110,68,Daiki YamashitaIzuku Midoriya (voice) Nob...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1
76,Female Teacher: In Front of the Students (1982),69,66,Rushia SantôReiko / Teacher Rina OkaKumi ...,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0


#### Preprocessing done
- saving the df in a csv file

In [12]:
# final look at the frame's dimensions and column labels before saving
print('df shape:', df.shape)
print('df columns:', df.columns)

# write the cleaned frame to disk; index=False leaves the row index out of the file
df.to_csv("cleaned-movies.csv", index=False)

df shape: (78, 21)
df columns: Index(['Movie Name', 'Time', 'Rating', 'Top Cast', 'Music', 'War', 'Horror',
       'TV Movie', 'Romance', 'Drama', 'History', 'Mystery', 'Fantasy',
       'Crime', 'Comedy', 'Family', 'Animation', 'Thriller', 'Adventure',
       'Science Fiction', 'Action'],
      dtype='object')
