# Data Munging

In [1]:
#Dependencies
import pandas as pd
import numpy as py
import matplotlib.pyplot as plt
from pprint import pprint
import json

#Importing NLP model
from score import *
from functools import reduce

In [2]:
#Load the MCU film figures and expose the "Film" column under the name "title"
raw_film_data = pd.read_csv("./Resources/MCU_Film_Data.csv")
film_data = raw_film_data.rename(columns={"Film": "title"})
film_data

Unnamed: 0,title,Domestic Opening,Budget,Running Time,Worldwide_gross
0,The Avengers,207438708,220000000,143,1518812988
1,Avengers: Age of Ultron,191271109,250000000,141,1405403694
2,Captain America: Civil War,179139142,250000000,147,1153304495
3,Avengers: Infinity War,257698183,316000000,149,2048359754
4,Avengers: Endgame,357115007,356000000,181,2797800564


In [3]:
#Open and read movie data and reviews data
# Each with-block closes its file on exit, so no explicit close() call is
# needed; json.load parses directly from the open file handle.
with open("./Resources/movie_data.json", "r") as movie_file:
    movie_data = json.load(movie_file)

with open("./Resources/reviews.json", "r") as reviews_file:
    reviews = json.load(reviews_file)

In [4]:
#Flatten the per-movie review lists into one record per review
# `reviews` is keyed by the movie id as a string, so the ids are stringified.
movie_ids = [str(x) for x in movie_data["movie_ids"]]

# movie_data["movie_ids"] and movie_data["titles"] are parallel lists, so each
# id's title is resolved once here instead of re-scanning the id list for every
# single review. Iterating in reverse lets the first occurrence of an id win,
# matching what a list.index() lookup would return.
title_by_id = {
    str(raw_id): title
    for raw_id, title in reversed(
        list(zip(movie_data["movie_ids"], movie_data["titles"]))
    )
}

# One dict per review, in movie order then review order; key order fixes the
# column order of the DataFrame built from these records.
m_reviews = [
    {"movie_id": movie_id, "review": review, "title": title_by_id[movie_id]}
    for movie_id in movie_ids
    for review in reviews[movie_id]
]

In [5]:
#Build one DataFrame row per review record collected above
reviews_df = pd.DataFrame(data=m_reviews)

In [6]:
#Apply both scoring functions to every review text
# (score / normal_score are not defined in this notebook; presumably they come
# from the `from score import *` line -- confirm against score.py)
review_series = reviews_df["review"]
scores = review_series.map(score)
normal_scores = review_series.map(normal_score)

In [7]:
#Attach both score series to the reviews frame as new columns
reviews_df = reviews_df.assign(score=scores, normal_score=normal_scores)

In [11]:
#Group the two score columns by film title for the per-title statistics
grouping_columns = ['title', 'score', 'normal_score']
movie_group = reviews_df[grouping_columns].groupby(by=["title"])

In [12]:
#Per-title summary statistics of both score columns. Each statistic gets its
#own column names so the five frames can be combined into one wide table.
error_names = {"score": "score_error", "normal_score": "normal_error"}
mean_names = {"score": "score_mean", "normal_score": "normal_mean"}
median_names = {"score": "score_median", "normal_score": "normal_median"}
skew_names = {"score": "score_skew", "normal_score": "normal_skew"}
std_names = {"score": "score_std", "normal_score": "normal_std"}

error_df = movie_group.sem().rename(columns=error_names)
mean_df = movie_group.mean().rename(columns=mean_names)
median_df = movie_group.median().rename(columns=median_names)
skew_df = movie_group.skew().rename(columns=skew_names)
std_df = movie_group.std().rename(columns=std_names)


def _merge_on_title(left, right):
    """Outer-merge two per-title statistic frames on the title key."""
    return pd.merge(left, right, on=['title'], how='outer')


#Fold the frames together pairwise, left to right
df_list = [error_df, mean_df, median_df, skew_df, std_df]
stats_df = reduce(_merge_on_title, df_list)

In [13]:
#Display the per-title standard errors. error_df already carries the
#score_error / normal_error column names, so no further rename is needed.
error_df

Unnamed: 0_level_0,score_error,normal_error
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Avengers: Age of Ultron,0.701124,0.005466
Avengers: Endgame,1.396746,0.02229
Avengers: Infinity War,0.701235,0.005648
Captain America: Civil War,8.213266,0.013563
The Avengers,1.032284,0.006982


In [14]:
#Attach the per-title review statistics to the film figures; the inner join
#keeps only titles present in both frames
film_df = film_data.merge(stats_df, how="inner", on="title")

In [15]:
#Display the film figures merged with the per-title review statistics
film_df

Unnamed: 0,title,Domestic Opening,Budget,Running Time,Worldwide_gross,score_error,normal_error,score_mean,normal_mean,score_median,normal_median,score_skew,normal_skew,score_std,normal_std
0,The Avengers,207438708,220000000,143,1518812988,1.032284,0.006982,2.567501,0.029088,1.016494,0.0205,2.867453,0.96371,4.616514,0.031223
1,Avengers: Age of Ultron,191271109,250000000,141,1405403694,0.701124,0.005466,0.998049,0.014244,0.573129,0.017795,2.840256,-0.707376,3.135522,0.024447
2,Captain America: Civil War,179139142,250000000,147,1153304495,8.213266,0.013563,12.120093,0.03755,5.01295,0.04445,2.106036,-0.461761,21.73026,0.035885
3,Avengers: Infinity War,257698183,316000000,149,2048359754,0.701235,0.005648,1.055281,0.016934,0.259979,0.012863,3.123656,0.551084,2.52834,0.020364
4,Avengers: Endgame,357115007,356000000,181,2797800564,1.396746,0.02229,2.732638,0.035042,0.6628,0.016265,2.633917,4.110369,6.24644,0.099684


In [16]:
#Tabulate the loaded movie metadata and rename "titles" to "title"
movie_data_df = pd.DataFrame(movie_data)
movie_data_df = movie_data_df.rename(columns={"titles": "title"})

In [17]:
#Final table: film figures + review statistics + movie metadata, with the
#column names normalised to snake_case.
column_renames = {
    "Domestic Opening": "domestic_open",
    "Running Time": "runtime",
    "average_score": "user_score",
    "dates": "date",
    "movie_ids": "movie_id",
    "Budget": "budget",
    "Worldwide_gross": "worldwide_gross",
}

# Rebuild from film_data and stats_df instead of merging into the existing
# film_df: this cell assigns to film_df, so merging into film_df itself would
# add the metadata columns a second time whenever the cell is re-run.
film_df = (
    film_data
    .merge(stats_df, on="title")
    .merge(movie_data_df, on="title")
    .rename(columns=column_renames)
)

In [20]:
#Write the final table to disk without the row index
output_path = "./Resources/film_df.csv"
film_df.to_csv(output_path, index=False)