In [1]:
import pandas as pd
import re
from datetime import datetime
import numpy as np

In [2]:
# Load the raw film financials export. low_memory=False asks pandas to infer
# column dtypes from the whole file rather than from internal chunks.
df = pd.read_csv('films_financials.csv', low_memory=False)

In [3]:
# Pull the "tt<digits>" identifier out of each movie URL.
# The pattern is a raw string: "\d" inside a normal string literal is an
# invalid escape sequence (SyntaxWarning on recent Python versions).
df.loc[:, 'movie_id'] = df.loc[:, 'movie_url'].str.extract(r'(tt\d+)', expand=False)

In [4]:
# Columns that are split on ',' and exploded into one-row-per-value tables.
list_columns = ['top_cast', 'director', 'writer', 'country_of_origin', 'production_company']

In [5]:
def table_creator(df, columns):
    """Build one long-format table per comma-separated column.

    For every name in ``columns`` the matching column of ``df`` is split on
    ',' and exploded so that each value sits on its own row, keyed by
    ``movie_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``movie_id`` column plus every column in ``columns``.
    columns : iterable of str
        Names of the comma-separated string columns to explode.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``columns``, in the same order, indexed by
        ``movie_id`` and holding the single exploded column.
    """
    dfs = []
    # Iterate the *argument*; the previous version looped over the global
    # ``list_columns`` and silently ignored ``columns``.
    for column in columns:
        exploded = (
            df[['movie_id', column]]
            .assign(**{column: df[column].str.split(',')})
            .explode(column)
            .set_index('movie_id')
        )
        dfs.append(exploded)
    return dfs

In [6]:
# Work on a copy so the frame loaded from CSV (df) is left untouched by the
# cleaning steps that follow.
df_copy = df.copy()

In [8]:
# One exploded, movie_id-indexed table per entry of list_columns.
list_of_dataframes = table_creator(df, list_columns)

In [9]:
# Give each exploded table a readable name; positions 0-4 follow the order of
# list_columns.
(cast,
 director,
 writer,
 country_of_origin,
 production_company) = (list_of_dataframes[position] for position in range(5))

In [10]:
# The multi-valued columns now live in their own tables, so remove them from
# the working frame.
df_copy.drop(
    columns=[
        'top_cast',
        'director',
        'writer',
        'country_of_origin',
        'production_company',
    ],
    inplace=True,
)

In [11]:
# Monetary columns whose values have their ',' characters removed by comma_cleaner.
digit_columns = ['budget_local_currency', 'gross_us_canada', 'opening_weekend_us_canada_usd', 'gross_worldwide']

In [12]:
def comma_cleaner(df, columns):
    """Remove every ',' from the string values of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    columns : iterable of str
        Names of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in columns:
        # Assign the result back instead of calling replace(..., inplace=True)
        # on ``df[column]``: that chained form acts on a temporary object and
        # leaves ``df`` unchanged when pandas Copy-on-Write is active.
        df[column] = df[column].replace(',', '', regex=True)
    return df

In [13]:
# comma_cleaner returns the frame it was given, so df2 and df_copy refer to the
# same object from here on.
df2 = comma_cleaner(df_copy, digit_columns)

In [14]:
# First run of non-digit characters in the budget text becomes local_currency.
# Raw string: "\D" in a normal literal is an invalid escape sequence.
df2.loc[:, 'local_currency'] = df2.loc[:, 'budget_local_currency'].str.extract(r'(\D+)', expand=False)

In [15]:
# First run of digits in the budget text becomes the amount (kept as a string
# here). Raw string: "\d" in a normal literal is an invalid escape sequence.
df2.loc[:, 'local_currency_budget'] = df2.loc[:, 'budget_local_currency'].str.extract(r'(\d+)', expand=False)

In [16]:
# The combined budget text has been split into local_currency and
# local_currency_budget, so the original column is dropped.
df2.drop('budget_local_currency', axis=1, inplace=True)

In [17]:
# release_country is the text after the last "(" of release_date, with ")"
# stripped from both ends. Values that contain no "(" have no such part, so
# they are left missing instead of copying the whole date text into
# release_country.
release_text = df2['release_date'].astype(str)
df2['release_country'] = (
    release_text.str.split('(').str[-1].str.strip(')')
    .where(release_text.str.contains('(', regex=False))
)

In [18]:
# Keep only the part of release_date that precedes the first "(".
df2['release_date'] = (
    df2['release_date']
    .astype(str)
    .str.split('(')
    .str[0]
)

In [19]:
# Remove commas from the date text, then collapse double spaces to single ones.
df2['release_date'] = (
    df2['release_date']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('  ', ' ', regex=False)
)

In [20]:
# Keep only rows whose release_date text is longer than 7 characters.
has_long_date_text = [len(date_text) > 7 for date_text in df2['release_date']]
df2 = df2[has_long_date_text].copy()

In [21]:
# Re-wrap df2 in the DataFrame constructor.
# NOTE(review): df2 already comes from DataFrame indexing plus .copy() in the
# previous cell, so this looks like a no-op — confirm before removing.
df2 = pd.DataFrame(df2)

In [22]:
def month_replacement(row):
    """Swap full month names in ``row['release_date']`` for two-digit numbers.

    The date text is split on whitespace; every token that is a full English
    month name is replaced by its zero-padded month number, all other tokens
    are kept verbatim, and the tokens are glued back together with no
    separator.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row['release_date']`` get/set with a string
        value (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns
    -------
    The same ``row`` object, updated in place.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    number_for = {
        name: str(position).zfill(2)
        for position, name in enumerate(month_names, start=1)
    }
    converted = []
    for token in row['release_date'].split():
        converted.append(number_for.get(token, token))
    row['release_date'] = ''.join(converted)
    return row

In [23]:
# Run month_replacement on every row (axis=1) so full month names in
# release_date become two-digit numbers.
df2 = df2.apply(month_replacement, axis=1)

In [24]:
# Parse the concatenated month/day/year text; errors='coerce' turns anything
# that does not match the format into NaT instead of raising.
df2['release_date'] = pd.to_datetime(df2['release_date'], format='%m%d%Y', errors='coerce')

In [25]:
# Remove commas from the opening-weekend date text, then collapse double
# spaces to single ones.
df2['opening_weekend_date'] = (
    df2['opening_weekend_date']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('  ', ' ', regex=False)
)

In [26]:
def short_month_replacement(row):
    """Swap three-letter month abbreviations in ``row['opening_weekend_date']``
    for two-digit numbers.

    The date text is split on whitespace; every token that is one of the
    abbreviations ``Jan`` … ``Dec`` is replaced by its zero-padded month
    number, all other tokens are kept verbatim, and the tokens are joined
    back together with no separator.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row['opening_weekend_date']`` get/set with a
        string value (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns
    -------
    The same ``row`` object, updated in place.
    """
    abbreviations = (
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )
    number_for = dict(
        (abbreviation, str(position).zfill(2))
        for position, abbreviation in enumerate(abbreviations, start=1)
    )
    tokens = row['opening_weekend_date'].split()
    row['opening_weekend_date'] = ''.join(
        number_for[token] if token in number_for else token
        for token in tokens
    )
    return row

In [27]:
# Run short_month_replacement on every row (axis=1) so month abbreviations in
# opening_weekend_date become two-digit numbers.
df2 = df2.apply(short_month_replacement, axis=1)

In [28]:
# Parse the concatenated month/day/year text; values that do not match the
# format (including the 'nan' strings produced by astype(str) on missing
# values) are coerced to NaT.
df2['opening_weekend_date'] = pd.to_datetime(df2['opening_weekend_date'], format='%m%d%Y', errors='coerce')

In [29]:
# Text columns whose missing values are labelled 'Not defined'.
# ('filming_locations' used to be listed twice; the duplicate only repeated
# the same fill.)
categories = ['language', 'filming_locations', 'aspect_ratio', 'local_currency', 'release_country']

In [30]:
# Label missing values in the categorical text columns.
# The result is assigned back rather than using fillna(..., inplace=True) on
# df2[column]: that chained form acts on a temporary object and leaves df2
# unchanged when pandas Copy-on-Write is active.
for column in categories:
    df2[column] = df2[column].fillna('Not defined')

In [31]:
def currency_replacement(row):
    """Replace known currency symbols in ``row['local_currency']`` with codes.

    The value is split on whitespace; each token found in the symbol table
    is replaced by its code, unknown tokens are kept as they are, and the
    tokens are re-joined with single spaces (so surrounding whitespace is
    dropped and a whitespace-only value becomes ``''``).

    Parameters
    ----------
    row : mapping
        Any object supporting ``row['local_currency']`` get/set with a string
        value (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns
    -------
    The same ``row`` object, updated in place.
    """
    code_for_symbol = {
        '$': 'USD',
        '€': 'EUR',
        '£': 'GBP',
        'CA$': 'CAD',
        '₹': 'INR',
        'RUR': 'RUB',
    }
    normalized = []
    for token in row['local_currency'].split():
        if token in code_for_symbol:
            normalized.append(code_for_symbol[token])
        else:
            normalized.append(token)
    row['local_currency'] = ' '.join(normalized)
    return row

In [32]:
# Run currency_replacement on every row (axis=1) to turn currency symbols in
# local_currency into codes.
df2 = df2.apply(currency_replacement, axis=1)

In [33]:
# Columns that get their '$' removed and are cast to float.
money_cols = ['gross_us_canada', 'opening_weekend_us_canada_usd', 'gross_worldwide', 'local_currency_budget']

In [34]:
# Strip the dollar sign and convert each money column to float.
# regex=False is required: as a regular expression '$' is the end-of-string
# anchor, so on pandas >= 2.0 regex=True removes nothing and the following
# float conversion fails on values that still start with '$'.
for col in money_cols:
    df2[col] = df2[col].str.replace('$', '', regex=False).astype(float)

In [35]:
def filming_location_split(row):
    """Split ``row['filming_locations']`` into a country and the remaining places.

    The value is split on ','. The last part becomes ``row['filming_country']``
    and the preceding parts are re-joined with ', ' into
    ``row['filming_locations']``. Each part is stripped of surrounding
    whitespace first, so the country no longer starts with a space and the
    re-joined text no longer contains doubled spaces.

    Rows whose location is the placeholder ``'Not defined'`` keep it and get
    ``filming_country = 'Not defined'``, so the key exists on every row.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row[...]`` get/set (e.g. a DataFrame row
        passed by ``apply(axis=1)``) with a string ``'filming_locations'``.

    Returns
    -------
    The same ``row`` object, updated in place.
    """
    raw_locations = row['filming_locations']
    if raw_locations == 'Not defined':
        row['filming_country'] = 'Not defined'
        return row

    parts = [part.strip() for part in raw_locations.split(',')]
    row['filming_country'] = parts[-1]
    # A single-part value has no places left over and yields ''.
    row['filming_locations'] = ', '.join(parts[:-1])
    return row

In [36]:
# Run filming_location_split on every row (axis=1); this is where the
# filming_country column comes from.
df2 = df2.apply(filming_location_split, axis=1)

In [37]:
# Keep only the columns wanted in the final table, in this order.
output_columns = [
    'movie_id',
    'title',
    'filming_country',
    'filming_locations',
    'language',
    'release_country',
    'release_date',
    'aspect_ratio',
    'local_currency_budget',
    'local_currency',
    'opening_weekend_date',
    'opening_weekend_us_canada_usd',
    'gross_us_canada',
    'gross_worldwide',
    'movie_url',
]
df2 = df2[output_columns]

In [38]:
# filming_location_split only sets filming_country on rows whose location is
# not 'Not defined'; the remaining rows are presumably left missing by the
# row-wise apply, so label them here.
df2['filming_country'] = df2['filming_country'].fillna('Not defined')

In [39]:
# Sanity check: column dtypes and non-null counts of the cleaned frame.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1184 entries, 0 to 1259
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   movie_id                       1184 non-null   object        
 1   title                          1184 non-null   object        
 2   filming_country                1184 non-null   object        
 3   filming_locations              1184 non-null   object        
 4   language                       1184 non-null   object        
 5   release_country                1184 non-null   object        
 6   release_date                   1163 non-null   datetime64[ns]
 7   aspect_ratio                   1184 non-null   object        
 8   local_currency_budget          277 non-null    float64       
 9   local_currency                 1184 non-null   object        
 10  opening_weekend_date           225 non-null    datetime64[ns]
 11  opening_weekend_u

In [None]:
# Write the cleaned financial table. With default arguments the row index is
# written as the first, unnamed column.
df2.to_csv('financial_data.csv')

In [None]:
# Write each movie_id-indexed lookup table to its own CSV file.
for table, csv_name in (
    (cast, 'top_cast.csv'),
    (director, 'directors.csv'),
    (writer, 'writers.csv'),
    (country_of_origin, 'countries_of_origin.csv'),
    (production_company, 'production_companies.csv'),
):
    table.to_csv(csv_name)