In [1]:
import pandas as pd
from datetime import timedelta
import ast

In [2]:
# Load the raw movie table; the path is relative to the notebook's working directory.
movies_df = pd.read_csv('imdb_movies.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
movies_df.head()

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
movies_df.info()

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too).
movies_df.describe(include='all')

In [6]:
# Set aside the rows that have no duration, export them for inspection,
# then continue with only the rows that do have one.
duration_is_missing = movies_df['duration'].isna()
movies_with_no_duration = movies_df[duration_is_missing]
movies_with_no_duration.to_csv(
    'movies_with_no_duration.csv',
    index=False,
)
movies_df = movies_df.dropna(subset=['duration'])

In [7]:
# Same split for the rating column: export the rows without an IMDb rating,
# then keep only the rated rows in the working frame.
rating_is_missing = movies_df['imdb_rating'].isna()
movies_with_no_rating = movies_df[rating_is_missing]
movies_with_no_rating.to_csv(
    'movies_with_no_rating.csv',
    index=False,
)
movies_df = movies_df.dropna(subset=['imdb_rating'])

In [None]:
# Re-check non-null counts after dropping rows without a duration or rating.
movies_df.info()

In [None]:
# Summary statistics of the remaining rows (all columns).
movies_df.describe(include='all')

In [10]:
# Ratings strictly above this value are exported separately and removed from
# the working frame.
RATING_THRESHOLD = 8

is_above_threshold = movies_df['imdb_rating'] > RATING_THRESHOLD
movies_with_more_than_8_rating = movies_df[is_above_threshold]
movies_with_more_than_8_rating.to_csv('movies_with_more_than_8_rating.csv', index=False)

# .copy() detaches the remainder from the frame it was sliced from, so the
# whole-column assignments in later cells do not raise SettingWithCopyWarning
# on pandas versions without copy-on-write.
movies_df = movies_df[movies_df['imdb_rating'] <= RATING_THRESHOLD].copy()

In [None]:
# Persist the frame as filtered so far (rows without duration/rating removed,
# ratings above 8 split out), then show its dtypes and non-null counts.
# NOTE(review): this file is written before the duration parsing and the
# list-column exploding done in later cells, so it holds those columns in
# their raw string form — confirm that is the intended "cleaned" output.
movies_df.to_csv('cleaned_imdb_movies.csv', index=False)
movies_df.info()

In [None]:
# Summary statistics after the rating filter (all columns).
movies_df.describe(include='all')

In [None]:
# Preview the filtered frame before transforming its columns.
movies_df.head()

In [14]:
# Remove every 'PT' occurrence from the duration strings (regex substring
# replacement), leaving only the hour/minute part for str_to_time.
# NOTE(review): presumably the raw values look like 'PT2H22M' — confirm
# against the data.
movies_df['duration'] = movies_df['duration'].replace('PT', '', regex=True)

In [15]:
def str_to_time(duration):
    """Convert an hour/minute duration string to ``'HH:MM:00'`` text.

    Accepts values such as ``'2H22M'``, ``'45M'`` or ``'2H'``. A leading
    ``'PT'`` prefix is tolerated, so the function also works on values that
    have not been stripped yet. Minute counts of 60 or more are carried over
    into the hour field (``'90M'`` -> ``'01:30:00'``) so the result is always
    a valid clock-style string.

    Args:
        duration: Duration text with optional ``<n>H`` and ``<n>M`` parts.

    Returns:
        The duration formatted as ``'HH:MM:00'``. Any seconds component in
        the input is ignored; the seconds field is always ``00``. Input with
        neither an ``H`` nor an ``M`` part yields ``'00:00:00'``.

    Raises:
        ValueError: If the text before ``H`` or ``M`` is not an integer.
    """
    remainder = duration.strip()
    if remainder.startswith('PT'):
        remainder = remainder[2:]

    hours = 0
    minutes = 0
    if 'H' in remainder:
        # Everything before the first 'H' is the hour count; what follows
        # may still hold the minutes.
        hours_text, remainder = remainder.split('H', 1)
        hours = int(hours_text)
    if 'M' in remainder:
        minutes = int(remainder.split('M', 1)[0])

    # Carry minute overflow into hours so e.g. 90 minutes is not rendered as
    # the invalid '00:90:00'.
    extra_hours, minutes = divmod(minutes, 60)
    hours += extra_hours
    return f"{hours:02}:{minutes:02}:00"

In [16]:
# Build the converted values first and assign the column once. If
# pd.to_datetime raises, the column keeps its original strings and the cell
# can be fixed and re-run; overwriting it with the intermediate 'HH:MM:00'
# text would make a re-run silently turn every duration into '00:00:00'.
duration_as_text = movies_df['duration'].apply(str_to_time)
movies_df['duration'] = pd.to_datetime(duration_as_text, format='%H:%M:%S').dt.time

In [None]:
# Check the converted duration values.
movies_df.head()

In [None]:
# Dtypes and non-null counts after the duration conversion.
movies_df.info()

In [19]:
def explode_column(df, column):
    """Return a new frame with list-valued ``column`` expanded to one row per element.

    String cells are parsed with ``ast.literal_eval`` (e.g. ``"['a', 'b']"``
    becomes ``['a', 'b']``). Cells that are not strings — lists that were
    already parsed, or missing values — are passed through unchanged, so the
    function can be called again on already-parsed data without raising.

    The input frame is left untouched; the parsed column only exists in the
    returned frame.

    Args:
        df: Source DataFrame.
        column: Name of the column holding lists or their string form.

    Returns:
        A DataFrame with one row per list element and a fresh 0..n-1 index.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python literal.
    """
    def _parse_cell(value):
        # Only text needs parsing; literal_eval rejects non-string objects.
        return ast.literal_eval(value) if isinstance(value, str) else value

    parsed = df[column].apply(_parse_cell)
    return df.assign(**{column: parsed}).explode(column).reset_index(drop=True)

In [None]:
# Expand each list-valued column in turn; every pass multiplies the rows of a
# movie by the number of entries in that column.
list_columns = [
    'directors',
    'writers',
    'genres',
    'languages',
    'production_companies',
]
for list_column in list_columns:
    movies_df = explode_column(movies_df, list_column)

movies_df.head()

In [None]:
# Row count and dtypes after exploding the list columns.
movies_df.info()

In [22]:
# Turn the stringified cast lists into real Python lists.
parsed_cast = movies_df['cast'].apply(ast.literal_eval)
movies_df['cast'] = parsed_cast

# Create the two target columns with an empty-string placeholder.
for placeholder_column in ('actor', 'actress'):
    movies_df[placeholder_column] = ''

In [23]:
# Reference name lists, one CSV each, read from the working directory.
actors_table = pd.read_csv('Actors.csv')
actors = list(actors_table['Actors'])

actresses_table = pd.read_csv('Actresses.csv')
actresses = list(actresses_table['Actresses'])

In [24]:
# Use sets for the lookups: testing membership against the plain lists scans
# the whole list for every cast member of every row.
actor_names = frozenset(actors)
actress_names = frozenset(actresses)

# Split each cast list into the names found in the actor / actress references.
# Cast order is preserved; a name present in both references lands in both.
movies_df["actor"] = movies_df["cast"].apply(lambda x: [name for name in x if name in actor_names])
movies_df["actress"] = movies_df["cast"].apply(lambda x: [name for name in x if name in actress_names])

In [25]:
# The cast lists have been split into 'actor' and 'actress'; drop the source column.
movies_df = movies_df.drop(columns=['cast'])

In [26]:
# One row per (actor, actress) pairing; the index is renumbered after each
# explode so it stays unique.
movies_df = (
    movies_df
    .explode('actor')
    .reset_index(drop=True)
    .explode('actress')
    .reset_index(drop=True)
)

In [None]:
# Display the final, fully exploded frame.
# NOTE(review): consider movies_df.head() here to keep the saved output small.
movies_df