# Data Munging

In [1]:
#Dependencies
import pandas as pd
import numpy as py
import matplotlib.pyplot as plt
from pprint import pprint
import json

#Importing NLP model
from score import *
from functools import reduce

In [2]:
# Load the MCU box-office table, then lowercase the "Title" header so the
# column can serve as the "title" join key used by the later merges.
film_data = pd.read_csv("./Resources/MCU_Film_Data.csv")
film_data = film_data.rename(columns={"Title": "title"})
film_data

Unnamed: 0,title,Worldwide_gross,Movie Run Time,Budget,IMDB_Rating
0,Iron Man 2,623933331,124,200000000,7.0
1,Guardians of the Galaxy,774176600,121,170000000,8.1
2,Black Panther,1347071259,134,200000000,7.3
3,Iron Man 3,1215439994,130,200000000,7.2
4,The Incredible Hulk,263427551,112,150000000,6.7
5,Captain Marvel,1126318317,123,152000000,7.1
6,Iron Man,585174222,126,140000000,7.9
7,Thor,449326618,115,150000000,7.0
8,Captain America: Civil War,1153304495,147,250000000,7.8
9,Spider-Man: Far from Home,1131927996,129,160000000,7.6


In [3]:
#Open and read movie data and reviews data
# Each `with` block closes its file on exit, so no explicit close() is needed.
# The handles get descriptive names (not `io`, which is also a stdlib module
# name), and the text is decoded as UTF-8 instead of the platform default.
with open("./Resources/movie_data.json", "r", encoding="utf-8") as movie_file:
    movie_data = json.load(movie_file)

with open("./Resources/reviews.json", "r", encoding="utf-8") as reviews_file:
    reviews = json.load(reviews_file)

In [4]:
#Flatten the per-movie review lists into one record per review
# `reviews` is keyed by the movie id as a string, so the ids are stringified.
movie_ids = [str(x) for x in movie_data["movie_ids"]]

m_reviews = []

# movie_data["movie_ids"] and movie_data["titles"] are parallel lists, so each
# id is paired with its title once per movie instead of being searched for
# again (list.index) on every single review.
for movie_id, title in zip(movie_ids, movie_data["titles"]):
    # A missing key here raises KeyError on purpose: it means the reviews file
    # and the movie metadata are out of sync.
    for review in reviews[movie_id]:
        m_reviews.append({"movie_id": movie_id, "review": review, "title": title})

In [5]:
# Turn the list of review records (m_reviews) into a table, one row per review
reviews_df = pd.DataFrame(data=m_reviews)

In [6]:
# Apply both scoring functions to every review text.
# NOTE(review): `score` and `normal_score` presumably come from the
# `from score import *` line in the imports cell - confirm.
review_series = reviews_df["review"]
scores = review_series.map(score)
normal_scores = review_series.map(normal_score)

In [7]:
# Attach the two score series to the reviews table as new columns
for column_name, column_values in (("score", scores), ("normal_score", normal_scores)):
    reviews_df[column_name] = column_values

In [8]:
# Keep only the title and the two score columns, then group the reviews by film
score_columns = ['title', 'score', 'normal_score']
movie_group = reviews_df[score_columns].groupby(["title"])

In [9]:
# Per-film summary statistics of the two review scores, one frame per statistic.
def _suffix_stats(stat_df, suffix):
    """Rename ``score``/``normal_score`` to ``score_<suffix>``/``normal_<suffix>``.

    Gives every statistic frame distinct column names so the frames can be
    merged side by side without clashing.
    """
    return stat_df.rename(columns={"score": "score_" + suffix,
                                   "normal_score": "normal_" + suffix})


error_df = _suffix_stats(movie_group.sem(), "error")  # standard error of the mean
mean_df = _suffix_stats(movie_group.mean(), "mean")
median_df = _suffix_stats(movie_group.median(), "median")
# NOTE(review): skew shows up as NaN for at least one film in the merged
# output (The Incredible Hulk), presumably because it has too few reviews.
skew_df = _suffix_stats(movie_group.skew(), "skew")
std_df = _suffix_stats(movie_group.std(), "std")

df_list = [error_df, mean_df, median_df, skew_df, std_df]
# Fold the five frames into one wide table, joining each pair on the film title
stats_df = reduce(lambda left, right: pd.merge(left, right, on=['title'],
                                               how='outer'), df_list)

In [10]:
# error_df already carries the score_error / normal_error column names (they
# were applied when the frame was built), so renaming "score"/"normal_score"
# again matched nothing. Display the frame directly instead.
error_df

Unnamed: 0_level_0,score_error,normal_error
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Ant-Man,3.748683,0.007754
Ant-Man and the Wasp,0.625746,0.011193
Avengers: Age of Ultron,0.701124,0.005466
Avengers: Endgame,1.396746,0.02229
Avengers: Infinity War,0.701235,0.005648
Black Panther,1.160855,0.013753
Captain America: Civil War,8.213266,0.013563
Captain America: The First Avenger,1.944676,0.004234
Captain America: The Winter Soldier,2.561422,0.009368
Captain Marvel,0.860828,0.002694


In [11]:
# Attach the review statistics to the box-office data. The default inner join
# keeps only films whose title appears in both tables.
film_df = film_data.merge(stats_df, on="title")

In [12]:
# Preview the first five rows of the box-office + review-statistics table
film_df.head(n=5)

Unnamed: 0,title,Worldwide_gross,Movie Run Time,Budget,IMDB_Rating,score_error,normal_error,score_mean,normal_mean,score_median,normal_median,score_skew,normal_skew,score_std,normal_std
0,Iron Man 2,623933331,124,200000000,7.0,3.300982,0.009692,3.777613,0.006925,2.30969,0.00777,0.8648,-0.095602,6.601964,0.019384
1,Guardians of the Galaxy,774176600,121,170000000,8.1,1.135609,0.009959,3.560966,0.033561,3.128628,0.020933,0.42144,0.449212,3.406827,0.029876
2,Black Panther,1347071259,134,200000000,7.3,1.160855,0.013753,5.461749,0.047055,6.752482,0.033594,-0.116105,2.87696,4.185523,0.049587
3,Iron Man 3,1215439994,130,200000000,7.2,3.10673,0.006838,5.203083,0.013578,3.530358,0.006953,1.795376,0.61998,6.94686,0.01529
4,The Incredible Hulk,263427551,112,150000000,6.7,6.061771,0.000931,9.073092,0.032528,9.073092,0.032528,,,8.572639,0.001317


In [13]:
# Tabulate the movie metadata and rename "titles" to the shared "title" join key
movie_data_df = pd.DataFrame(movie_data)
movie_data_df = movie_data_df.rename(columns={"titles": "title"})

In [14]:
# Display the movie metadata table to check its columns before merging
movie_data_df

Unnamed: 0,movie_ids,title,dates,average_score,vote_count,popularity
0,1726,Iron Man,2008-04-30,7.6,17181,38.256
1,1724,The Incredible Hulk,2008-06-12,6.2,6991,22.082
2,10138,Iron Man 2,2010-04-28,6.8,13525,24.616
3,284053,Thor: Ragnarok,2017-10-25,7.5,12809,44.892
4,1771,Captain America: The First Avenger,2011-07-22,6.9,13986,20.576
5,24428,The Avengers,2012-04-25,7.7,21256,40.091
6,68721,Iron Man 3,2013-04-18,6.9,15288,25.265
7,76338,Thor: The Dark World,2013-10-29,6.6,11118,28.534
8,100402,Captain America: The Winter Soldier,2014-03-20,7.7,12093,15.957
9,118340,Guardians of the Galaxy,2014-07-30,7.9,19489,54.049


In [15]:
# Snake-case names for the final table. rename() ignores keys that match no column.
# NOTE(review): no "Domestic Opening" column appears in the frames displayed
# in this notebook - confirm whether that entry is still needed.
column_names = {"Domestic Opening" : "domestic_open",
                "Movie Run Time": "runtime",
                "average_score": "themoviedb_rating",
                "dates": "date",
                "movie_ids": "movie_id",
                "Budget": "budget",
                "Worldwide_gross": "worldwide_gross",
                "IMDB_Rating": "imdb_rating"}

# Add the movie metadata columns by title, then apply the renames.
# NOTE(review): this overwrites film_df with a merge of itself, so the cell is
# not idempotent - re-create film_df from film_data and stats_df before re-running.
film_df = pd.merge(film_df, movie_data_df, on="title")
film_df = film_df.rename(columns=column_names)

In [16]:
# Preview the first five rows of the fully merged, renamed table
film_df.head(n=5)

Unnamed: 0,title,worldwide_gross,runtime,budget,imdb_rating,score_error,normal_error,score_mean,normal_mean,score_median,normal_median,score_skew,normal_skew,score_std,normal_std,movie_id,date,themoviedb_rating,vote_count,popularity
0,Iron Man 2,623933331,124,200000000,7.0,3.300982,0.009692,3.777613,0.006925,2.30969,0.00777,0.8648,-0.095602,6.601964,0.019384,10138,2010-04-28,6.8,13525,24.616
1,Guardians of the Galaxy,774176600,121,170000000,8.1,1.135609,0.009959,3.560966,0.033561,3.128628,0.020933,0.42144,0.449212,3.406827,0.029876,118340,2014-07-30,7.9,19489,54.049
2,Black Panther,1347071259,134,200000000,7.3,1.160855,0.013753,5.461749,0.047055,6.752482,0.033594,-0.116105,2.87696,4.185523,0.049587,284054,2018-02-13,7.4,13969,36.369
3,Iron Man 3,1215439994,130,200000000,7.2,3.10673,0.006838,5.203083,0.013578,3.530358,0.006953,1.795376,0.61998,6.94686,0.01529,68721,2013-04-18,6.9,15288,25.265
4,The Incredible Hulk,263427551,112,150000000,6.7,6.061771,0.000931,9.073092,0.032528,9.073092,0.032528,,,8.572639,0.001317,1724,2008-06-12,6.2,6991,22.082


In [17]:
# Persist the combined table; index=False leaves the row index out of the file
film_df.to_csv(path_or_buf="./Resources/film_df.csv", index=False)