In [None]:
import pandas as pd

In [None]:
# Load the scraped movie data from CSV and display it for a first look.
# NOTE(review): the '---' file name looks like a placeholder — confirm the
# real CSV name before running this cell.

df = pd.read_csv('../scraping/---.csv')
df

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Movie names contain Windows line breaks ('\r\n');
# swap each one for a single space (plain literal replacement, no regex).
movie_names = df['Movie Name'].str.replace('\r\n', ' ', regex=False)
df['Movie Name'] = movie_names
df

In [None]:
# dropping the movie that has any hindi letter

def is_english(string):
    """Return True when every character of ``string`` has a code point <= 255.

    Code points 0-255 cover ASCII plus the Latin-1 supplement, so accented
    Latin letters are accepted. Devanagari (Hindi) letters sit far above 255,
    so any title containing one is rejected. An empty string is accepted.
    """
    return all(ord(ch) <= 255 for ch in string)


# Boolean mask over the rows: True where the movie name passes is_english,
# False where it contains a character outside the 0-255 range
english_mask = df['Movie Name'].apply(is_english)

shape_before = df.shape
df = df.loc[english_mask]

print("Before applying mask", shape_before)
print("After applying mask", df.shape)

df

In [None]:
# We just dropped some rows, which leaves gaps in the index.
# Reset it to 0..n-1 (discarding the old labels) so it's easier to work with.
df = df.reset_index(drop=True)

In [None]:
# converting time to minutes

def convert(time_str):
    """Convert a duration such as '1h 10m', '20m' or '1h' to total minutes.

    Spaces and the 'm' suffix are removed first, leaving '1h10', '20' or
    '1h'. The remainder is split on 'h': the piece before it is the hour
    count and the piece after it (possibly empty) is the minute count.
    Raises ValueError if a piece that should be a number is not one.
    """
    compact = time_str.replace(' ', '').replace('m', '')
    pieces = compact.split('h')

    # no 'h' present: the whole string is a minute count
    if len(pieces) == 1:
        return int(pieces[0])

    hours = int(pieces[0])
    # '1h' splits into ['1', '']; an empty tail means zero extra minutes
    minutes = int(pieces[1]) if pieces[1] != '' else 0
    return hours * 60 + minutes


# Replace the textual durations in 'Time' with integer minutes.
# NOTE: this cell is not re-runnable — convert() calls .replace on its
# argument, so a second run (when 'Time' already holds ints) would fail.
df['Time'] = df['Time'].apply(convert)
df

In [None]:
# getting unique genres, in first-seen order

# dict keys keep insertion order and give constant-time membership checks
seen_genres = {}

for genres in df['Genres']:
    # genres is of form 'Action,Drama,Romance'
    for g in genres.split(','):
        # g may be 'Action', '\xa0Action' or ' Action'.
        # Drop non-breaking spaces AND trim ordinary whitespace: the later
        # fill step trims each genre before using it as a column label, so an
        # untrimmed name here would create a column that never gets filled.
        g = g.replace('\xa0', '').strip()
        seen_genres.setdefault(g, None)

unique_genres = list(seen_genres)
unique_genres

In [None]:
# Start from the raw 'Genres' column and add one zero-filled indicator
# column per unique genre.
genres_df = df['Genres'].to_frame()

for genre in unique_genres:
    # Each column is inserted at position 0, so the genre columns end up in
    # the reverse order of unique_genres, with 'Genres' as the last column.
    genres_df.insert(0, genre, 0)

genres_df

In [None]:
# filling the genres df with proper values
# then dropping the old column

# Iterate (index label, value) pairs so the row that is read and the row that
# is written are always the same one, whatever the index looks like.
for row_label, raw_genres in genres_df['Genres'].items():
    # raw_genres is of form "Action,Drama,Romance"
    for genre in raw_genres.split(","):
        # The unique-genre pass removes '\xa0' wherever it occurs, so do the
        # same here and then trim surrounding whitespace; otherwise the label
        # would not match an existing column and .at would add a new one.
        genre = genre.replace('\xa0', '').strip()
        genres_df.at[row_label, genre] = 1

genres_df = genres_df.drop(columns=['Genres'])
genres_df

In [None]:
# Swap the raw 'Genres' text column for the one-hot encoded genre columns:
# drop the text column, then attach the indicator columns on the right.

df = pd.concat([df.drop(columns=['Genres']), genres_df], axis=1)
df

In [None]:
# Report the final shape and column labels, then persist the cleaned frame
# (without the index) for the following cast-cleaning steps.
print('df shape:', df.shape)
print('df columns:', df.columns)

df.to_csv("cleaned-movies-2k.csv", index=False)

In [None]:
import pandas as pd
import re

In [None]:
# Load the cleaned dataset back from the CSV file
movies_df = pd.read_csv('cleaned-movies-2k.csv')

# Work on an independent copy holding only the 'Top Cast' column, so edits
# to it do not touch movies_df
top_cast_df = movies_df[['Top Cast']].copy()

In [None]:
# Function to keep only English names
def eng_top_cast(cast):
    """Return ``cast`` with every Devanagari character (U+0900-U+097F) removed.

    Whitespace is deliberately left untouched: the gaps that remain after the
    Hindi text is removed are what separate the different characters/actors.
    """
    devanagari_run = re.compile(r'[\u0900-\u097F]+')
    return devanagari_run.sub('', cast)

In [None]:
# Strip the Hindi characters from every entry of the 'Top Cast' column
top_cast_df['Top Cast'] = top_cast_df['Top Cast'].apply(eng_top_cast)

# Save the English-only cast column to a CSV file (index not written)
top_cast_df.to_csv('cleaned_top_cast.csv', index=False)

# Display the resulting frame
top_cast_df

In [None]:
# Replace the original 'Top Cast' column with the English-only version:
# drop the old column, then append the cleaned one as the last column.

df = pd.concat([df.drop(columns=['Top Cast']), top_cast_df], axis=1)
df

In [None]:
# Reordering the dataframe: keep the first three columns in place and reverse
# everything after them, which brings the last column to position 3.
# The order is computed from the actual width instead of a hard-coded list of
# 21 positions, so a frame with more or fewer columns is handled the same way
# (for exactly 21 columns the result is [0, 1, 2, 20, 19, ..., 3]).

n_leading = 3
n_cols = df.shape[1]
column_order = list(range(min(n_leading, n_cols))) + list(range(n_cols - 1, n_leading - 1, -1))
df = df.iloc[:, column_order]
df

In [None]:
# Report the shape and column labels of the reordered frame
print('df shape:', df.shape)
print('df columns:', df.columns)

# Save df to a CSV file (index not written)

df.to_csv("full-cleaned-movies-2k.csv", index=False)

In [None]:
# Load the English-only cast column saved earlier in this notebook.
# That step writes 'cleaned_top_cast.csv'; the name previously read here,
# 'cleaned_top_cast-2k.csv', is not produced by any visible cell, so a fresh
# Restart & Run All would fail or pick up a stale file.
df_top_cast = pd.read_csv("cleaned_top_cast.csv")

# Extract only the 'Top Cast' column into a new, independent DataFrame
top_cast_df = df_top_cast[['Top Cast']].copy()

In [None]:
# Function to get up to 7 cast members separated by ';'
def get_top_seven_cast(cast):
    """Return up to the first seven cast entries of ``cast`` joined by '; '.

    Entries in ``cast`` are separated by runs of two or more whitespace
    characters. The string is trimmed BEFORE splitting: leading or trailing
    whitespace would otherwise yield empty entries that show up in the output
    and use up some of the seven slots. An empty or all-whitespace input
    returns an empty string.
    """
    # split by multiple spaces, discarding any empty piece
    cast_list = [name for name in re.split(r'\s{2,}', cast.strip()) if name]
    # Take only the first 7 names
    return "; ".join(cast_list[:7])

In [None]:
# Apply the function to the 'Top Cast' column
top_cast_df['Top 7 Cast'] = top_cast_df['Top Cast'].apply(get_top_seven_cast)

# Keep only the new column, print it, and save it to a new CSV file
top_cast_df = top_cast_df[['Top 7 Cast']]
print(top_cast_df)  # prints the whole frame as text, not just the first few rows
top_cast_df.to_csv("top_seven_cast-2k.csv", index=False) 

In [None]:
# Replace 'Top Cast' with the shortened 'Top 7 Cast' column:
# drop the old column, then append the new one as the last column.
df = pd.concat([df.drop(columns=['Top Cast']), top_cast_df], axis=1)
df

In [None]:
# Keep the first three columns in place and reverse everything after them,
# which brings the last column to position 3.
# The order is computed from the actual width instead of a hard-coded list of
# 21 positions, so a frame with more or fewer columns is handled the same way
# (for exactly 21 columns the result is [0, 1, 2, 20, 19, ..., 3]).
n_leading = 3
n_cols = df.shape[1]
column_order = list(range(min(n_leading, n_cols))) + list(range(n_cols - 1, n_leading - 1, -1))
df = df.iloc[:, column_order]
df

In [None]:
# Report the shape and column labels of the final frame
print('df shape:', df.shape)
print('df columns:', df.columns)

# Save df to a CSV file (index not written)

df.to_csv("full-cleaned-movies-7-top-cast-2k.csv", index=False)