# Project Background

### The business objective of this project is to make three recommendations to our company, which wants to create a new movie studio to challenge its competitors. We are charged with finding out which movies are doing best at the box office and with helping our company decide what kinds of films to create.

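### To assist us, we used budget and box-office data from TheNumbers (production budget, domestic gross, worldwide gross) and movie metadata from IMDB (release year, runtime, genres, and the actors/directors attached to each film). **TODO:** explain the strengths and limitations of each data source (for example, the IMDB data used here starts in 2010).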

# Importing Libraries And Reading in Data

In [328]:
import pandas as pd
import numpy as np
import sqlite3
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline

In [329]:
# TheNumbers budgets CSV: budget, domestic_gross, worldwide_gross
# (recorded by the author as 5782 x 6).
df_movie_budgets = pd.read_csv('data/tn.movie_budgets.csv.gz')

# IMDB SQLite database. Each returned row pairs a movie (title, year, runtime,
# genres) with one credited person whose profession list mentions actor,
# actress, or director (recorded by the author as 682303 x 6).
conn = sqlite3.connect("data/im.db/im.db")
df_imdb = pd.read_sql(
    sql="""
    SELECT basic.primary_title as movie_title, basic.start_year as year, basic.runtime_minutes, basic.genres, 
    persons.primary_name as person_name, persons.primary_profession
    FROM movie_basics as basic
    JOIN principals
    ON basic.movie_id = principals.movie_id
    JOIN persons
    ON principals.person_id = persons.person_id
    WHERE persons.primary_profession LIKE '%actor%' OR persons.primary_profession LIKE '%actress%' OR persons.primary_profession LIKE '%director%'
    """,
    con=conn,
)
# Remove exact duplicate (movie, person) rows produced by the join.
df_imdb = df_imdb.drop_duplicates()

In [330]:
# release_date strings end in a four-digit year (e.g. "May 20, 2011"), so the
# last four characters are the release year. The vectorized .str accessor
# replaces the Python-level loop over the column.
df_movie_budgets["release_year"] = df_movie_budgets["release_date"].str[-4:].astype("int64")

# Merge TheNumbers with IMDB on title AND year, so that same-titled films from
# different years are not matched to each other. pd.merge defaults to an inner
# join: only movies found in both sources are kept, and each movie is repeated
# once per matching IMDB person row.
df_final = pd.merge(
    df_movie_budgets,
    df_imdb,
    left_on=["movie", "release_year"],
    right_on=["movie_title", "year"],
)

In [331]:
# 'year' and 'movie_title' were the IMDB-side join keys and duplicate
# 'release_year' and 'movie'; 'id' is carried over from the TheNumbers frame.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone.
df_final = df_final.drop(columns=['year', 'movie_title', 'id'], errors="ignore")
df_final

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_year,runtime_minutes,genres,person_name,primary_profession
0,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Geoffrey Rush,"actor,soundtrack,producer"
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Terry Rossio,"writer,director,producer"
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Ian McShane,"actor,director,producer"
3,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Stuart Beattie,"writer,director,producer"
4,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Johnny Depp,"actor,producer,soundtrack"
...,...,...,...,...,...,...,...,...,...,...
10743,"Dec 31, 2014",Stories of Our Lives,"$15,000",$0,$0,2014,60.0,Drama,Mugambi Nthiga,"actor,writer,casting_department"
10744,"Dec 31, 2014",Stories of Our Lives,"$15,000",$0,$0,2014,60.0,Drama,Paul Ogola,actor
10745,"Dec 31, 2014",Stories of Our Lives,"$15,000",$0,$0,2014,60.0,Drama,Tim Mutungi,actor
10746,"Dec 31, 2014",Stories of Our Lives,"$15,000",$0,$0,2014,60.0,Drama,Kelly Gichohi,actress


# Formatting Change and Filtering

In [333]:
# Keep only rows that have a genre. The explicit .copy() makes df_final an
# independent frame, so adding columns below cannot trigger pandas'
# SettingWithCopyWarning (assigning into a boolean-filtered slice can).
df_final = df_final[df_final["genres"].notna()].copy()

# Money columns are strings such as "$410,600,000": strip the "$", remove the
# thousands separators, and cast to 64-bit integers using vectorized string ops.
df_final["production_budget_int"] = (
    df_final["production_budget"].str.strip("$").str.replace(",", "", regex=False).astype("int64")
)
# Domestic gross is intentionally left unparsed; uncomment if it is needed.
#df_final["domestic_gross_int"] = df_final["domestic_gross"].str.strip("$").str.replace(",", "", regex=False).astype("int64")
df_final["worldwide_gross_int"] = (
    df_final["worldwide_gross"].str.strip("$").str.replace(",", "", regex=False).astype("int64")
)

In [342]:
# Profit per row: worldwide gross minus production budget, using the integer
# columns parsed from the "$"-formatted strings.
df_final["profit"] = df_final["worldwide_gross_int"].sub(df_final["production_budget_int"])
df_final

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross,release_year,runtime_minutes,genres,person_name,primary_profession,production_budget_int,worldwide_gross_int,profit
0,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Geoffrey Rush,"actor,soundtrack,producer",410600000,1045663875,635063875
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Terry Rossio,"writer,director,producer",410600000,1045663875,635063875
2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Ian McShane,"actor,director,producer",410600000,1045663875,635063875
3,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Stuart Beattie,"writer,director,producer",410600000,1045663875,635063875
4,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",2011,136.0,"Action,Adventure,Fantasy",Johnny Depp,"actor,producer,soundtrack",410600000,1045663875,635063875
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10743,"Dec 31, 2014",Stories of Our Lives,"$15,000",$0,$0,2014,60.0,Drama,Mugambi Nthiga,"actor,writer,casting_department",15000,0,-15000
10744,"Dec 31, 2014",Stories of Our Lives,"$15,000",$0,$0,2014,60.0,Drama,Paul Ogola,actor,15000,0,-15000
10745,"Dec 31, 2014",Stories of Our Lives,"$15,000",$0,$0,2014,60.0,Drama,Tim Mutungi,actor,15000,0,-15000
10746,"Dec 31, 2014",Stories of Our Lives,"$15,000",$0,$0,2014,60.0,Drama,Kelly Gichohi,actress,15000,0,-15000


# Exploratory Data Analysis

In [13]:
# NOTE(review): this exploratory scatter plot is disabled and would not run as
# written: the 'id' column is dropped from df_final earlier in the notebook,
# and 'worldwide_gross' holds "$"-formatted strings rather than numbers
# ('worldwide_gross_int' is the numeric version). TODO: replace or delete.
#plt.scatter(df_final["id"], df_final["worldwide_gross"])
#plt.show()

# Write to File

# Conclusion

### **TODO:** summarize the three recommendations and the evidence supporting each.

# Appendix

In [343]:
# Additional data sources that were explored. None of the frames loaded here is
# referenced by the analysis cells earlier in this notebook.

# 3387 x 5: domestic_gross, foreign_gross, year. The source name was missing
# from the original note; presumably Box Office Mojo, per the "bom" file
# prefix — TODO confirm.
df_movie_gross = pd.read_csv('data/bom.movie_gross.csv.gz')
# 1560 x 12: rating, genre, director, writer, box office, runtime (Rotten Tomatoes)
df_movie_info = pd.read_csv('data/rt.movie_info.tsv.gz', sep='\t')
# 54432 x 8: rating, fresh, top critic (Rotten Tomatoes)
df_reviews = pd.read_csv('data/rt.reviews.tsv.gz', sep='\t', encoding='unicode_escape')
# 26517 x 10: popularity, vote_average, vote_count. The original note says
# "TheMoviesTB"; presumably TMDB, per the "tmdb" file prefix — TODO confirm.
df_movies = pd.read_csv('data/tmdb.movies.csv.gz')

# IMDB: every director credit joined to the person's details and the movie title.
df_directors = pd.read_sql(
    """
    Select persons.primary_name, persons.primary_profession, movie_basics.primary_title
    FROM directors
    JOIN persons
    ON directors.person_id = persons.person_id
    JOIN movie_basics
    ON movie_basics.movie_id = directors.movie_id
    """
, conn
)

# IMDB: known_for rows joined to the matching person record.
df_persons = pd.read_sql(
    """
    SELECT *
    FROM known_for
    JOIN persons
    ON known_for.person_id = persons.person_id
    """
, conn
)

# Something we noticed is that Avatar is not a part of the IMDB data.
# Only a cell's last expression is rendered automatically, so display() is
# needed for this intermediate lookup to actually appear in the output.
display(df_imdb[df_imdb["person_name"] == "James Cameron"])

# The data seems to start in 2010 and starts dwindling in 2020; a handful of
# rows carry years after 2020 (up to 2115 in the recorded output).
df_imdb["year"].value_counts()

2017    80850
2018    79998
2016    78784
2015    73918
2014    72248
2013    68599
2012    65035
2011    61098
2010    56322
2019    40777
2020     4274
2021      300
2022       70
2027        9
2023        8
2025        7
2115        5
2024        1
Name: year, dtype: int64