In [122]:
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd

In [123]:
# Load the film financials CSV; low_memory=False makes pandas read the file
# in a single pass instead of in chunks.
df = pd.read_csv(filepath_or_buffer='films_financials.csv', low_memory=False)

In [125]:
# Pull the identifier ("tt" followed by digits) out of each movie URL.
# The pattern is a raw string so that `\d` reaches the regex engine verbatim
# instead of being an invalid string escape sequence.
df['movie_id'] = df['movie_url'].str.extract(r'([t]{2}\d+)', expand=False)

In [126]:
# Columns whose cells hold several comma-separated values per film.
list_columns = [
    'top_cast',
    'director',
    'writer',
    'country_of_origin',
    'production_company',
]

In [127]:
def table_creator(df, columns):
    """Build one long-format lookup table per multi-valued column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain 'movie_id' and every name in `columns`.
    columns : iterable of str
        Columns whose cells hold comma-separated strings.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of `columns`, in the same order. Each frame is
        indexed by 'movie_id' and has one row per comma-separated value.
        `df` itself is not modified.
    """
    dfs = []
    # Iterate over the `columns` argument (not a notebook-level global) so the
    # function honours whatever the caller passes in.
    for column in columns:
        table = df[['movie_id', column]].copy()
        table[column] = table[column].str.split(',')
        table = table.explode(column).set_index('movie_id')
        dfs.append(table)
    return dfs

In [128]:
# Work on an independent (deep) copy so `df` keeps every original column.
df_copy = df.copy(deep=True)

In [129]:
# One exploded lookup table per entry of `list_columns`, in the same order.
list_of_dataframes = table_creator(df=df, columns=list_columns)

In [130]:
# Name the first five lookup tables positionally.
(
    cast,
    director,
    writer,
    country_of_origin,
    production_company,
) = list_of_dataframes[:5]

In [131]:
# Remove the multi-valued columns from the working copy.
df_copy = df_copy.drop(columns=[
    'top_cast',
    'director',
    'writer',
    'country_of_origin',
    'production_company',
])

In [132]:
# Columns passed to comma_cleaner to have their commas removed.
digit_columns = [
    'budget_local_currency',
    'gross_us_canada',
    'opening_weekend_us_canada_usd',
    'gross_worldwide',
]

In [133]:
def comma_cleaner(df, columns):
    """Remove every comma from the given columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    columns : iterable of str
        Names of the columns to strip commas from.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.
    """
    for column in columns:
        # Assign the cleaned Series back to the frame. Calling
        # `df[column].replace(..., inplace=True)` mutates a temporary Series:
        # pandas warns about that chained pattern and, with Copy-on-Write
        # enabled, the frame is left unchanged.
        df[column] = df[column].replace(',', '', regex=True)
    return df

In [134]:
# Strip commas from the columns listed in `digit_columns`.
df2 = comma_cleaner(df=df_copy, columns=digit_columns)

In [135]:
# Currency marker = first run of non-digit characters in the budget string.
# Raw string so `\D` is passed to the regex engine rather than being treated
# as an (invalid) string escape sequence.
df2['local_currency'] = df2['budget_local_currency'].str.extract(
    r'(\D+)', expand=False)

In [136]:
# Budget amount = first run of digits in the budget string.
# Raw string so `\d` is passed to the regex engine rather than being treated
# as an (invalid) string escape sequence.
df2['local_currency_budget'] = df2['budget_local_currency'].str.extract(
    r'(\d+)', expand=False)

In [137]:
# The combined budget column has been split into currency and amount above.
df2.drop(columns=['budget_local_currency'], inplace=True)

In [138]:
# Release country = text inside the trailing "(...)" of the release date.
# A regex extract yields NaN when there is no parenthesised suffix; splitting
# on '(' and taking the last piece would instead copy the whole date (or the
# string 'nan') into the country column, which a later fillna cannot detect.
df2['release_country'] = df2['release_date'].astype(str).str.extract(
    r'\(([^()]*)\)\s*$', expand=False)

In [139]:
# Keep only the text before the first '(' of the release date.
df2['release_date'] = (
    df2['release_date']
    .astype(str)
    .str.split('(')
    .str[0]
)

In [140]:
# Remove commas, then replace each pair of spaces with a single space.
df2['release_date'] = (
    df2['release_date']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('  ', ' ', regex=False)
)

In [141]:
# Keep only rows whose release_date text is longer than 7 characters.
df2 = df2.loc[df2['release_date'].str.len().gt(7)].copy()

In [142]:
# Re-wrap the filtered result as a DataFrame.
df2 = pd.DataFrame(data=df2)

In [143]:
def month_replacement(row):
    """Rewrite row['release_date'] as an unambiguous 'MMDDYYYY' string.

    Accepts "<Month> <day> <year>" or "<day> <Month> <year>" with a full
    English month name. The day is zero-padded to two digits so the result
    always has a fixed layout for the '%m%d%Y' format.

    Any other shape (for example a month-and-year-only date) is left
    untouched. Concatenating such tokens would give strings like "122020",
    which strptime-style matching of '%m%d%Y' can read as a different, valid
    date; leaving the text as is lets a later coercing parse mark it missing.

    Parameters
    ----------
    row : mapping with a string 'release_date' entry (e.g. a DataFrame row).

    Returns
    -------
    The same `row` object, modified in place.
    """
    months = {
        "January": '01',
        "February": '02',
        "March": '03',
        "April": '04',
        "May": '05',
        "June": '06',
        "July": '07',
        "August": '08',
        "September": '09',
        "October": '10',
        "November": '11',
        "December": '12'
    }
    parts = row['release_date'].split()
    if len(parts) == 3:
        if parts[0] in months:
            month, day, year = parts  # "March 3 2020"
        else:
            day, month, year = parts  # "3 March 2020"
        if month in months and day.isdigit() and year.isdigit():
            row['release_date'] = months[month] + day.zfill(2) + year
    return row

In [144]:
# Run month_replacement on every row (axis='columns' is the same as axis=1).
df2 = df2.apply(month_replacement, axis='columns')

In [145]:
# Parse the month/day/year digit strings; anything that does not match the
# format becomes NaT because of errors='coerce'.
df2['release_date'] = pd.to_datetime(
    df2['release_date'],
    errors='coerce',
    format='%m%d%Y',
)

In [146]:
# Remove commas, then replace each pair of spaces with a single space.
df2['opening_weekend_date'] = (
    df2['opening_weekend_date']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('  ', ' ', regex=False)
)

In [147]:
def short_month_replacement(row):
    """Rewrite row['opening_weekend_date'] as an 'MMDDYYYY' string.

    Accepts "<Mon> <day> <year>" or "<day> <Mon> <year>" with a three-letter
    English month abbreviation. The day is zero-padded to two digits so the
    result always has a fixed layout for the '%m%d%Y' format.

    Any other shape (including the text 'nan') is left untouched instead of
    being concatenated into a digit string that '%m%d%Y' could read as a
    different date; a later coercing parse can then mark it missing.

    Parameters
    ----------
    row : mapping with a string 'opening_weekend_date' entry.

    Returns
    -------
    The same `row` object, modified in place.
    """
    months = {
        "Jan": '01',
        "Feb": '02',
        "Mar": '03',
        "Apr": '04',
        "May": '05',
        "Jun": '06',
        "Jul": '07',
        "Aug": '08',
        "Sep": '09',
        "Oct": '10',
        "Nov": '11',
        "Dec": '12'
    }
    parts = row['opening_weekend_date'].split()
    if len(parts) == 3:
        if parts[0] in months:
            month, day, year = parts  # "Mar 3 2020"
        else:
            day, month, year = parts  # "3 Mar 2020"
        if month in months and day.isdigit() and year.isdigit():
            row['opening_weekend_date'] = months[month] + day.zfill(2) + year
    return row

In [148]:
# Run short_month_replacement on every row (axis='columns' equals axis=1).
df2 = df2.apply(short_month_replacement, axis='columns')

In [149]:
# Parse the month/day/year digit strings; anything that does not match the
# format becomes NaT because of errors='coerce'.
df2['opening_weekend_date'] = pd.to_datetime(
    df2['opening_weekend_date'],
    errors='coerce',
    format='%m%d%Y',
)

In [150]:
# Text columns whose missing values are replaced by a placeholder.
# Each column is listed once ('filming_locations' was previously duplicated).
categories = [
    'language',
    'filming_locations',
    'aspect_ratio',
    'local_currency',
    'release_country',
]

In [151]:
# Replace missing values in the categorical columns with 'Not defined'.
# The filled Series is assigned back to the frame: calling
# `df2[column].fillna(..., inplace=True)` acts on a temporary Series, which
# pandas warns about and which leaves df2 unchanged under Copy-on-Write.
for column in categories:
    df2[column] = df2[column].fillna('Not defined')

In [152]:
def currency_replacement(row):
    """Translate currency markers in row['local_currency'] to letter codes.

    The value is split on whitespace; each token found in the lookup table is
    swapped for its code, every other token is kept, and the tokens are
    re-joined with single spaces. The row is modified in place and returned.
    """
    symbol_to_code = {
        '$': 'USD',
        '€': 'EUR',
        '£': 'GBP',
        'CA$': 'CAD',
        '₹': 'INR',
        'RUR': 'RUB'
    }
    translated = []
    for token in row['local_currency'].split():
        translated.append(symbol_to_code.get(token, token))
    row['local_currency'] = ' '.join(translated)
    return row

In [153]:
# Run currency_replacement on every row (axis='columns' equals axis=1).
df2 = df2.apply(currency_replacement, axis='columns')

In [154]:
# Columns converted from text to float below.
money_cols = [
    'gross_us_canada',
    'opening_weekend_us_canada_usd',
    'gross_worldwide',
    'local_currency_budget',
]

In [161]:
# Convert each money column to float using the first run of digits in the
# text; values without digits become NaN.
# - Raw string so `\d` is not an invalid string escape sequence.
# - Plain column assignment replaces the column, so it takes the float dtype.
#   On pandas >= 2.0, `df2.loc[:, col] = ...` writes into the existing
#   column in place where possible, which leaves it as object dtype.
for col in money_cols:
    df2[col] = df2[col].str.extract(r'(\d+)', expand=False).astype('float')

In [162]:
def filming_location_split(row):
    """Split row['filming_locations'] into locations and a filming country.

    The value is split on commas and each part is stripped of surrounding
    whitespace. The last part is stored in row['filming_country']; the
    remaining parts are re-joined with ', ' into row['filming_locations']
    (an empty string when there was only one part).

    When the value is the 'Not defined' placeholder or not a string,
    'filming_locations' is left as is and 'filming_country' is set to
    'Not defined', so the key exists on every row.

    Parameters
    ----------
    row : mapping with a 'filming_locations' entry (e.g. a DataFrame row).

    Returns
    -------
    The same `row` object, modified in place.
    """
    raw_value = row['filming_locations']
    if isinstance(raw_value, str) and raw_value != 'Not defined':
        # Strip each part so the country has no leading space and the
        # re-joined locations do not end up with doubled spaces.
        parts = [part.strip() for part in raw_value.split(',')]
        row['filming_country'] = parts[-1]
        row['filming_locations'] = ', '.join(parts[:-1])
    else:
        row['filming_country'] = 'Not defined'
    return row

In [163]:
# Run filming_location_split on every row (axis='columns' equals axis=1).
df2 = df2.apply(filming_location_split, axis='columns')

In [164]:
# Keep only the output columns, in their final order.
df2 = df2.loc[:, [
    'movie_id',
    'title',
    'filming_country',
    'filming_locations',
    'language',
    'release_country',
    'release_date',
    'aspect_ratio',
    'local_currency_budget',
    'local_currency',
    'opening_weekend_date',
    'opening_weekend_us_canada_usd',
    'gross_us_canada',
    'gross_worldwide',
    'movie_url',
]]

In [165]:
# Rows without a filming country get the 'Not defined' placeholder.
df2 = df2.assign(
    filming_country=df2['filming_country'].fillna('Not defined'))

In [None]:
# Write the cleaned table (including its index) to disk.
df2.to_csv(path_or_buf='financial_data.csv')

In [None]:
# Write the exploded lookup tables to ./support/. The directory is created
# first because to_csv raises an error when the parent directory is missing.
os.makedirs('./support', exist_ok=True)
cast.to_csv('./support/top_cast.csv')
director.to_csv('./support/directors.csv')
writer.to_csv('./support/writers.csv')
country_of_origin.to_csv('./support/countries_of_origin.csv')
production_company.to_csv('./support/production_companies.csv')