# Group 10 - Project 1

# Members: Bryan Groves, Randy Lam, Zach Wood, Marti Reisinger

# Topic: Drivers in revenue for top 1000 movies

## Overview: We intend to use a Kaggle dataset that lists the top 1000 movies by ranking. We will analyze studio, runtime, and release date, looking at both the revenue and the number of movies that fall within each of these categories.

In [None]:
# Import 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress
import scipy.stats as stats

In [None]:
#Load CSV
# Read the raw movie dataset from the project's Resources folder.
csv_path = "Resources/movies.csv"
movie_df = pd.read_csv(csv_path)

# Preview the first rows of the loaded frame.
movie_df.head()

In [None]:
# Drop three columns that the later cells never read from `cleaned_columns`.
unused_columns = ["Movie Info", "Unnamed: 0", "Genre"]
cleaned_columns = movie_df.drop(unused_columns, axis=1)

# Preview the trimmed frame.
cleaned_columns.head()

In [None]:
# Keep only the rows that have a value in every remaining column.
final_movie_list = cleaned_columns.dropna(axis=0, how="any")

# Show the cleaned data set used by the analysis below.
final_movie_list

# Analysis

#### We would like to see how much revenue each studio is generating in total by creating a stacked bar chart of revenue by studio

In [None]:
# Index the cleaned data by distributor and discard the columns the
# revenue-by-distributor charts do not use.
non_revenue_columns = [
    "Movie Runtime",
    "Title",
    "License",
    "Release Date",
    "Domestic Sales (in $)",
    "International Sales (in $)",
]
rev_total_columns = final_movie_list.set_index('Distributor').drop(columns=non_revenue_columns)

rev_total_columns.head()

In [None]:
# Sum the remaining columns per distributor (the frame is indexed by
# 'Distributor', so group on that index level).
rev_totals = rev_total_columns.groupby(level='Distributor').sum()

rev_totals.head()

In [None]:
# One x position per distributor row in rev_totals, plus the same positions as a list.
x_axis = np.arange(len(rev_totals))
tick_locations = list(x_axis)

In [None]:
# Bar chart of total world sales, largest distributor first.
sorted_totals = rev_totals.sort_values(by="World Sales (in $)", ascending=False)
plot_pandas = sorted_totals.plot.bar(color='b')

# Label the axes returned by pandas (this is also pyplot's current axes).
plot_pandas.set_xlabel("Distributor")
plot_pandas.set_ylabel("World Sales (in $)")
plot_pandas.set_title("World Sales Revenue by Distributor")

In [None]:
# Save the total-revenue chart through the Axes object returned by the
# plotting cell (`plot_pandas`) instead of pyplot's "current figure".
# With the notebook inline backend the figure is closed when the plotting
# cell finishes, so a bare `plt.savefig` in a later cell writes an empty image.
# NOTE(review): run this before the average-revenue chart cell, which
# rebinds `plot_pandas` to a different chart.
plot_pandas.get_figure().savefig("Images/totalrev.png")
plt.show()

#### Per studio, what is their average revenue for movies that fall within the top 1000 list?

In [None]:
# Mean of the remaining columns per distributor (grouping on the
# 'Distributor' index level of rev_total_columns).
average_sales = rev_total_columns.groupby(level='Distributor').mean()

average_sales

In [None]:
# Bar chart of average world sales, largest distributor first.
sorted_averages = average_sales.sort_values(by="World Sales (in $)", ascending=False)
plot_pandas = sorted_averages.plot.bar(color='b')

# Label the axes returned by pandas (this is also pyplot's current axes).
plot_pandas.set_xlabel("Distributor")
plot_pandas.set_ylabel("Average World Sales (in $)")
plot_pandas.set_title("Average World Sales Revenue by Distributor")

#### Is there a better time of year for movie successes? We would like to look at the top 1000 movies and understand if summer blockbusters (June-Aug) fare better than holiday movies (Nov-Jan)

In [None]:
#Randy Visual 1

# Parse the release-date column into pandas datetimes, in place on movie_df.
parsed_release_dates = pd.to_datetime(movie_df['Release Date'])
movie_df['Release Date'] = parsed_release_dates

movie_df

In [None]:
# NOTE(review): set_index returns a new DataFrame and the result is not
# assigned, so movie_df itself is unchanged by this line. Later cells still
# read 'Release Date' as a column, so assigning the result would break them.
movie_df.set_index('Release Date')
# Display movie_df (still with its original index).
movie_df

In [None]:
# NOTE(review): this selects the rows whose index label is one of
# 2, 3, 4, 5, 9, 10, but the selection is neither assigned nor displayed
# (it is not the last expression), so it has no effect on movie_df.
# The intent of these particular labels is not evident from the notebook.
movie_df.loc[movie_df.index.isin([2, 3, 4, 5, 9, 10])]
# Display the full, unfiltered movie_df.
movie_df

In [None]:
#bar chart
# Drop the columns this chart does not need from the raw movie data.
year_revenue = movie_df.drop(columns = ["Title","Unnamed: 0","Distributor", "Genre", "Movie Runtime", "License", "Domestic Sales (in $)", "International Sales (in $)"])

# `month_revenue` was plotted without ever being defined (NameError).
# Build it here: total world sales per calendar month of release, matching
# this section's "time of year" question.
# NOTE(review): confirm month (rather than year) is the intended grouping.
dated_revenue = year_revenue.dropna(subset=["Release Date", "World Sales (in $)"])
# Re-parse so the cell also works if 'Release Date' currently holds strings.
release_month = pd.to_datetime(dated_revenue["Release Date"]).dt.month
# Divide by 1e9 so the values match the "in billions" axis label.
month_revenue = dated_revenue.groupby(release_month)["World Sales (in $)"].sum() / 1e9

plot_pandas= month_revenue.plot.bar(color='r')
plt.xlabel("release month")
plt.ylabel("revenue (in billions)")
plt.title("revenue by release month")

#### We would like to measure the total revenue by year for movies that are in the top 1000 list

In [None]:
#Randy Visual 2

In [None]:
# Normalise the release dates to 'YYYY-MM-DD' text, in place on movie_df.
release_dates = pd.to_datetime(movie_df['Release Date'])
movie_df['Release Date'] = release_dates.dt.strftime('%Y-%m-%d')

In [None]:
# Drop the columns this chart does not need from the raw movie data.
year_revenue = movie_df.drop(columns = ["Title","Unnamed: 0","Distributor", "Genre", "Movie Runtime", "License", "Domestic Sales (in $)", "International Sales (in $)"])

# The original plotted `month_revenue`, which is never defined (NameError),
# and nothing was aggregated. Total the world sales per release year so the
# chart matches its "revenue by year" title.
dated_revenue = year_revenue.dropna(subset=["Release Date", "World Sales (in $)"])
# 'Release Date' may hold 'YYYY-MM-DD' strings at this point, so parse it again.
release_year = pd.to_datetime(dated_revenue["Release Date"]).dt.year
# Divide by 1e9 so the values match the "in billions" axis label.
revenue_by_year = dated_revenue.groupby(release_year)["World Sales (in $)"].sum() / 1e9

plot_pandas= revenue_by_year.plot.bar(color='r')
plt.xlabel("year")
plt.ylabel("revenue (in billions)")
plt.title("revenue by year")

#### Create chart based on revenue by market

#### Provide analysis of total counts and revenue by rating

In [None]:
# `G_data` was used without being defined anywhere in this notebook
# (NameError). Build it from the License column before counting titles.
# NOTE(review): movie_df is assumed to be the intended source frame - confirm.
G_data = movie_df.loc[movie_df["License"] == "G"]

# Number of distinct G-rated titles.
G_data["Title"].nunique()

In [None]:
# `PG_data` was used without being defined anywhere in this notebook
# (NameError). Build it from the License column before counting titles.
# NOTE(review): movie_df is assumed to be the intended source frame - confirm.
PG_data = movie_df.loc[movie_df["License"] == "PG"]

# Number of distinct PG-rated titles.
PG_data["Title"].nunique()


In [None]:
# `PG13_data` was used without being defined anywhere in this notebook
# (NameError). Build it from the License column before counting titles.
# NOTE(review): movie_df is assumed to be the intended source frame, and
# "PG-13" the label used in the License column - confirm both.
PG13_data = movie_df.loc[movie_df["License"] == "PG-13"]

# Number of distinct PG-13-rated titles.
PG13_data["Title"].nunique()

In [None]:
# `R_data` was used without being defined anywhere in this notebook
# (NameError). Build it from the License column before counting titles.
# NOTE(review): movie_df is assumed to be the intended source frame - confirm.
R_data = movie_df.loc[movie_df["License"] == "R"]

# Number of distinct R-rated titles.
R_data["Title"].nunique()

In [None]:
# Hard-coded title counts per rating.
# NOTE(review): presumably copied from the nunique() outputs of the cells
# above - confirm they are still current if the data changes.
G, PG, PG13, R = 14, 173, 363, 194

In [None]:
# One-row table of title counts, one column per rating.
rating_counts = {
    "G Rating": G,
    "PG Rating": PG,
    "PG-13 Rating": PG13,
    "R Rating": R,
}
license_df = pd.DataFrame(rating_counts, index=[0])


In [None]:
# NOTE(review): this column selection is not assigned or displayed, so it
# has no effect; the columns are already in this order when license_df is built.
license_df[["G Rating","PG Rating","PG-13 Rating","R Rating"]]
# Display the one-row rating-count table.
license_df

In [None]:
# Bar chart of the rating counts, one coloured bar per rating column.
rating_colors = ['blue', 'red', 'green', 'cyan']
License_Chart = license_df.plot.bar(color=rating_colors, align= "center")

# Label the axes returned by pandas (this is also pyplot's current axes).
License_Chart.set_ylabel("Movie Count")
License_Chart.set_xlabel("Movie Rating")

plt.show()



# Hypothesis Test

#### Research Hypothesis: In our sample of top U.S. films, we believe that films with a shorter runtime will earn more on average than those with a longer runtime.

#### Null Hypothesis: There is no difference in earnings in films based on runtime.

In [None]:
#Zach Hypothesis

In [None]:
# Work on an independent copy. A plain assignment only aliases
# final_movie_list, so the in-place runtime conversion below would also
# rewrite final_movie_list, and later in-place edits to final_movie_list
# would change hypoth_df.
hypoth_df = final_movie_list.copy()

In [None]:
# Convert runtimes such as "2 hr 18 min", "2 hr" or "45 min" into total minutes.
# The original rewrote each string into an arithmetic expression and passed it
# to eval(), which executes arbitrary text from the CSV and fails if the cell is
# re-run (the column is no longer text). Parse the numbers out explicitly, and
# skip the conversion when the column is already numeric.
if not pd.api.types.is_numeric_dtype(hypoth_df["Movie Runtime"]):
    runtime_text = hypoth_df["Movie Runtime"].astype(str)
    # Each component is optional; a missing one counts as zero.
    runtime_hours = pd.to_numeric(runtime_text.str.extract(r"(\d+)\s*hr", expand=False)).fillna(0)
    runtime_minutes = pd.to_numeric(runtime_text.str.extract(r"(\d+)\s*min", expand=False)).fillna(0)
    hypoth_df["Movie Runtime"] = (runtime_hours * 60 + runtime_minutes).astype(int)

In [None]:
# Display the frame after the runtime column has been converted.
hypoth_df

In [None]:
# Plain Python lists of runtime and world sales, in row order.
hypoth_time_list = hypoth_df["Movie Runtime"].to_list()
hypoth_rev_list = hypoth_df["World Sales (in $)"].to_list()

In [None]:
# Print the median, mean, minimum and maximum runtime, in that order.
runtime_column = hypoth_df["Movie Runtime"]
for summarise in (runtime_column.median, runtime_column.mean, runtime_column.min, runtime_column.max):
    print(summarise())

In [None]:
# Quartiles of the runtime column and the inter-quartile range.
hypoth_time_series = hypoth_df["Movie Runtime"]

quartile_points = [.25, .5, .75]
hypoth_quartiles = hypoth_time_series.quantile(quartile_points)
hypoth_lower, hypoth_upper = hypoth_quartiles[.25], hypoth_quartiles[.75]
hypoth_iqr = hypoth_upper - hypoth_lower

# Print the lower quartile, then the upper quartile.
for quartile_bound in (hypoth_lower, hypoth_upper):
    print(quartile_bound)

In [None]:
# Histogram of film runtimes (default binning).
plt.hist(hypoth_time_list)

# Title and axis labels set in one call on the current axes.
plt.gca().set(
    title="Distribution of Films by Runtime",
    xlabel="Film Runtime in Minutes",
    ylabel="Count",
)

plt.show()

In [None]:
# Split the films into short (<= 101 min), long (>= 130 min) and
# in-between runtime groups.
# NOTE(review): 101 and 130 look like the quartiles printed earlier - confirm.
runtime_col = hypoth_df["Movie Runtime"]
is_short = runtime_col.le(101)
is_long = runtime_col.ge(130)
is_middle = runtime_col.gt(101) & runtime_col.lt(130)

hypoth_s_df = hypoth_df.loc[is_short]
hypoth_l_df = hypoth_df.loc[is_long]

hypoth_m_df = hypoth_df.loc[is_middle]

In [None]:
# World sales and runtimes for the short-runtime group.
hypoth_s_series = hypoth_s_df["World Sales (in $)"].squeeze()
hypoth_s_timelist = hypoth_s_df["Movie Runtime"].tolist()

# World sales and runtimes for the long-runtime group.
hypoth_l_series = hypoth_l_df["World Sales (in $)"].squeeze()
hypoth_l_timelist = hypoth_l_df["Movie Runtime"].tolist()

In [None]:
# Mean world sales: short-runtime group first, then long-runtime group.
for group_sales in (hypoth_s_series, hypoth_l_series):
    print(group_sales.mean())

In [None]:
# One-sided independent-samples t-test: is mean world sales of the
# short-runtime group greater than that of the long-runtime group?
stats.ttest_ind(
    a=hypoth_s_series,
    b=hypoth_l_series,
    alternative="greater",
)

In [None]:
# World sales and runtimes for the in-between runtime group.
hypoth_m_timelist = hypoth_m_df["Movie Runtime"].tolist()

hypoth_m_series = hypoth_m_df["World Sales (in $)"].squeeze()

In [None]:
# Scatter world sales against runtime, one fill colour per runtime group
# (drawn in the order short, long, middle).
scatter_groups = [
    (hypoth_s_timelist, hypoth_s_series, "lightblue"),
    (hypoth_l_timelist, hypoth_l_series, "lightgreen"),
    (hypoth_m_timelist, hypoth_m_series, "orange"),
]
for group_runtimes, group_sales, fill_color in scatter_groups:
    plt.scatter(group_runtimes, group_sales, marker = "o", facecolors = fill_color, edgecolors = "black")

plt.gca().set(
    title="Total Film Sales by Runtime",
    xlabel="Film Runtime in Minutes",
    ylabel="World Sales in Billions of Dollars",
)

plt.show()

In [None]:
# Build a display-only copy with the sales columns rendered as currency text.
# The original overwrote the numeric columns of final_movie_list in place,
# which left them as strings for any later calculation (including hypoth_df
# when it aliases the same frame) and made the cell fail when re-run.
formatted_movie_list = final_movie_list.copy()
for sales_column in ("Domestic Sales (in $)", "International Sales (in $)", "World Sales (in $)"):
    formatted_movie_list[sales_column] = formatted_movie_list[sales_column].map("${:,.2f}".format)

# Show the formatted table; final_movie_list keeps its numeric values.
formatted_movie_list