In [None]:
import csv
import pickle
import re
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

%matplotlib inline

In [None]:
def write_pickle(data, file_name):
    """Create a pickle file

    The interpreter recursion limit is raised while the data is dumped and
    put back to whatever it was before the call, even when the dump fails.

    Keyword arguments:
    data -- the information to be saved
    file_name -- the name of the file without an extension

    Returns True once the file has been written.
    """
    previous_limit = sys.getrecursionlimit()
    # Never lower a limit that is already above the working value.
    sys.setrecursionlimit(max(previous_limit, 20000))
    try:
        # The context manager closes the file even if pickle.dump raises.
        with open(file_name + '.pkl', 'wb') as pickle_file:
            pickle.dump(data, pickle_file)
    finally:
        sys.setrecursionlimit(previous_limit)

    return True


def read_pickle(file_name):
    """Import a pickle file

    Keyword arguments:
    file_name -- the name of the file to be read without an extension

    Returns the object stored in ``<file_name>.pkl``.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source.
    # The context manager closes the file even if pickle.load raises.
    with open(file_name + '.pkl', 'rb') as pickle_file:
        return pickle.load(pickle_file)


def write_csv(data, file_name):
    """Saves rows of data to a .CSV.

    Keyword arguments:
    data -- an iterable of rows; each row is either a flat dictionary
            (its values are written, in the dictionary's order) or a
            sequence of fields
    file_name -- the desired filename without an extension

    Fields are comma separated and quoted with '|' only where needed.
    Returns True once the file has been written.
    """
    with open(file_name + '.csv', 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for row in data:
            # isinstance also covers dict subclasses (OrderedDict, ...);
            # iterating those directly would write their keys instead.
            if isinstance(row, dict):
                csv_writer.writerow(list(row.values()))
            else:
                csv_writer.writerow(row)

    return True


def read_csv(file_name):
    """Load every row of a .CSV file.

    Keyword arguments:
    file_name -- the desired file to load without an extension

    Returns a list with one list of string fields per row; fields are
    comma separated and may be quoted with '|'.
    """
    with open(file_name + '.csv', newline='') as csvfile:
        # Materialize the rows before the file is closed.
        return list(csv.reader(csvfile, delimiter=',', quotechar='|'))


In [None]:
# Load and clean the 2010-2018 movies (one CSV per release year).
def _load_movies(year, date_fixes=None):
    """Read data/<year>_movies.csv and add a parsed 'datetime' column.

    Keyword arguments:
    year -- the release year; selects the file and completes the dates
    date_fixes -- optional {old: new} mapping of exact values to swap in
                  the date column (5) before it is parsed
    """
    movies = pd.DataFrame(read_csv('data/' + str(year) + '_movies'))
    if date_fixes:
        movies[5] = movies[5].replace(date_fixes)
    # Column 5 holds the date without a year (e.g. 'Feb 28'); append the
    # year so it can be parsed.
    movies['datetime'] = pd.to_datetime(movies[5] + ', ' + str(year))
    return movies


movies_2018 = _load_movies(2018)
movies_2017 = _load_movies(2017)
movies_2016 = _load_movies(2016)
movies_2015 = _load_movies(2015)
movies_2014 = _load_movies(2014)
# 2013 is not a leap year, so 'Feb 29' is moved to 'Feb 28' before parsing.
movies_2013 = _load_movies(2013, date_fixes={'Feb 29': 'Feb 28'})
movies_2012 = _load_movies(2012)
movies_2011 = _load_movies(2011)
movies_2010 = _load_movies(2010)

all_movies = pd.concat((movies_2018, movies_2017, movies_2016, movies_2015, movies_2014,
                        movies_2013, movies_2012, movies_2011, movies_2010))


def CountFrequency(my_list):
    """Tally how often each item occurs in my_list.

    Keyword arguments:
    my_list -- an iterable of hashable items

    Returns a dictionary mapping each distinct item to its number of
    occurrences, in order of first appearance.
    """
    tally = {}

    for entry in my_list:
        tally[entry] = tally.get(entry, 0) + 1

    return tally


# Count how many movies each distributor released over the data set.
distributor_count = CountFrequency(all_movies[7])

# Manually select studios to reduce the number of features.
# Count-based alternative (distributors with more than 80 releases):
# dists = list({k: v for (k, v) in distributor_count.items() if (v > 80) and (k != '-')})
dists = ['Walt Disney Studios Motion Pictures', 'Universal Pictures', 'Twentieth Century Fox',
         'Sony Pictures Entertainment (SPE)', 'Paramount Pictures', 'Warner Bros.']

# Keep only the selected studios. The explicit .copy() makes movie_set an
# independent frame, so the column assignments below never target a slice
# of all_movies.
movie_set = all_movies.loc[all_movies[7].isin(dists), [2, 3, 5, 7]].copy()
movie_set.columns = ['Gross', 'Theaters', 'Date', 'Distributor']
# Gross: strip '$' and ',' before converting to int.
movie_set['Gross'] = movie_set['Gross'].apply(
    lambda x: int(re.sub(r'[$,]', '', x)))
# Theaters: '-' becomes '0' and ',' is removed before converting to int.
movie_set['Theaters'] = movie_set['Theaters'].apply(
    lambda x: int(x.replace('-', '0').replace(',', '')))


In [None]:
# Add one 0/1 indicator column per selected distributor; the column is
# named after the distributor with its spaces removed.
for distributor in dists:
    flag_column = distributor.replace(' ', '')
    movie_set[flag_column] = movie_set['Distributor'].eq(
        distributor).astype('int64')


In [None]:
# Correlate the numeric columns only: movie_set still carries the string
# columns 'Date' and 'Distributor', which corr() cannot convert to floats.
movie_set.select_dtypes(include='number').corr()


In [None]:
plt.figure(figsize=(20, 20))

# Compute the correlation matrix once, over the numeric columns only
# ('Date' and 'Distributor' are strings and cannot be correlated).
corr_matrix = movie_set.select_dtypes(include='number').corr()

sns.heatmap(corr_matrix,
            cmap="seismic", annot=True, vmin=-1, vmax=1)
# NOTE(review): presumably a workaround for heatmap rows being clipped at the
# top and bottom in some matplotlib releases -- confirm it is still needed.
plt.gca().set_ylim(len(corr_matrix)+0.5, -0.5)

In [None]:
sns.pairplot(movie_set, height=1.5, aspect=1)
# %%
# Baseline model: regress Gross on the theater count alone.
# Selecting with a list keeps the two-dimensional (n_samples, 1) shape.
X = movie_set[['Theaters']].to_numpy()
y = movie_set['Gross']

lr = LinearRegression()
lr.fit(X, y)
# %%
# Score of the fit on the same data it was trained on.
lr.score(X, y)
# %%
# TODO: Bucket directors, studios(?), and actors by earnings/salaries.
# NOTE: Genres are not mutually exclusive.

In [None]:
# Model frame: movie_set without its two string columns.
movie_for_full = movie_set.drop(columns=['Date', 'Distributor'])

# Label slice: every column from 'Theaters' through 'WarnerBros.' inclusive,
# so the selection depends on the column order of movie_for_full.
X = movie_for_full.loc[:, 'Theaters':'WarnerBros.']
y = movie_for_full['Gross']

lr_full = LinearRegression()
lr_full.fit(X, y)

# Score of the fit on the same data it was trained on.
lr_full.score(X, y)
# %%
# Build the design matrix with the added constant column once and reuse it.
X_const = sm.add_constant(X)
X_const.head()
# %%
# Fit the same regression with statsmodels to get the full summary table.
model = sm.OLS(y, X_const)
fit = model.fit()
fit.summary()
# %%
# Residuals against the fitted values of the OLS model.
resid_fig, resid_ax = plt.subplots(figsize=(10, 7))
resid_ax.scatter(fit.predict(), fit.resid)

# Dashed reference line at zero residual.
resid_ax.axhline(0, linestyle='--', color='gray')
resid_ax.set_xlabel('Predicted Values', fontsize=18)
resid_ax.set_ylabel('Residuals', fontsize=18)
# %%
lr_full = LinearRegression()
# Select the feature columns by label from movie_set, the frame that holds
# every column listed here.
# NOTE(review): 'Distributor' holds the distributor names as strings, so it
# has to be dropped or encoded before X is passed to lr_full.fit -- confirm
# whether it is meant to be part of the feature set.
X = movie_set[['Theaters', 'Distributor', 'WaltDisneyStudiosMotionPictures', 'UniversalPictures',
               'TwentiethCenturyFox', 'SonyPicturesEntertainment(SPE)',
               'ParamountPictures', 'WarnerBros.']]
