## [MCU Worldwide Box Office Collection 데이터 분석] 
 * [생활-영화] Marvel Cinematic Universe 전세계 박스 오피스 컬렉션 데이터 세트
 * 지역별 모든 박스 오피스 컬렉션 정보
 * 데이터 출처 : https://www.kaggle.com/datasets/mayureshkoli/mcu-worldwide-box-office-collection
 
 * 데이터 분석 코드
   * [github 코드](https://github.com/LDJWJ/dataAnalysis/blob/main/01_11_MCU_MOVIE_INFO.ipynb)
   * [HTML코드 - 시작](https://ldjwj.github.io/dataAnalysis/01_11_MCU_MOVIE_INFO.html)
   * [HTML코드 - 전처리 및 탐색](https://ldjwj.github.io/dataAnalysis/01_11_MCU_MOVIE_INFO_02.html)

### 데이터 셋 개요
 * 6개의 데이터 셋이 존재
 * 데이터 파일
   * movie_info.csv : 영화 정보
   * asia_pacific_box_office.csv : 아시아 지역
   * europe_box_office.csv : 유럽 지역
   * middle_east_and_africa_box_office.csv : 중동, 아프리카 지역
   * north_america_box_office.csv : 북미 지역
   * south_america_box_office.csv : 남미 지역

### 데이터 설명

* Input/output variables 
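<pre>
  Input  (X) : production_budget_in_million_(USD), worldwide_collection_in_million_(USD),
               tomatometer, tomato_audience_score, imdb, metascore
  Output (y) : meta_user_score
</pre>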

### 라이브러리 불러오기

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

### 데이터 불러오기

In [3]:
# Directory that holds the six CSV files; change this one constant if the data moves.
DATA_DIR = "./data/Marvel"

# Movie-level table: title, release date, season, phase, budget, worldwide gross, review scores.
mov_info = pd.read_csv(f"{DATA_DIR}/movie_info.csv")

# Regional box-office tables, one CSV per region.
asia_info = pd.read_csv(f"{DATA_DIR}/asia_pacific_box_office.csv")
europe_info = pd.read_csv(f"{DATA_DIR}/europe_box_office.csv")
middle_east_info = pd.read_csv(f"{DATA_DIR}/middle_east_and_africa_box_office.csv")
north_america_info = pd.read_csv(f"{DATA_DIR}/north_america_box_office.csv")
south_america_info = pd.read_csv(f"{DATA_DIR}/south_america_box_office.csv")

# Cell output: (rows, columns) of every table, in load order.
mov_info.shape, asia_info.shape, europe_info.shape, middle_east_info.shape, north_america_info.shape, south_america_info.shape

((27, 11), (27, 17), (27, 31), (27, 13), (27, 5), (27, 12))

In [4]:
# Preview the first rows of the movie metadata table.
mov_info.head()

Unnamed: 0,movie_title,release_date,season,phase,production_budget_in_million_(USD),worldwide_collection_in_million_(USD),tomatometer,tomato_audience_score,imdb,metascore,meta_user_score
0,Iron Man,"May 2, 2008",Spring,1,140,585.37,9.4,9.1,7.9,7.9,8.6
1,The Incredible Hulk,"June 13, 2008",Spring,1,150,264.77,6.7,7.0,6.6,6.1,7.0
2,Iron Man 2,"May 7, 2010",Spring,1,200,623.93,7.2,7.1,6.9,5.7,6.4
3,Thor,"May 6, 2011",Spring,1,150,449.33,7.7,7.6,7.0,5.7,7.1
4,Captain America: The First Avenger,"July 22, 2011",Summer,1,140,370.57,7.9,7.5,6.9,6.6,6.8


In [5]:
# Print a three-row plain-text preview of each regional table,
# leaving one blank line after every preview.
for region_table in (
    asia_info,
    europe_info,
    middle_east_info,
    north_america_info,
    south_america_info,
):
    print(region_table.head(3), end="\n\n")

           movie_title  South Korea  Russia/CIS  Japan  Thailand  Indonesia  \
0             Iron Man        25.17        9.49   8.66      2.45       2.15   
1  The Incredible Hulk         6.38        6.41   1.69      1.18       1.50   
2           Iron Man 2        27.10       14.76  12.83      4.62       4.49   

   India  Taiwan  Philippines  Singapore  Vietnam  Malaysia  Hong Kong  \
0   1.99    5.37         3.99       3.82      NaN      3.47       2.84   
1   3.14    1.94         2.07       1.84     0.16      2.28       1.60   
2   1.23    4.04         6.25       4.19      NaN      4.64       3.76   

   New Zealand  Australia  China  Other_Asia_Pacific_Countries  
0         2.73      19.09  15.27                          1.37  
1         0.88       4.55   9.34                          0.70  
2         2.70      22.42   7.92                          6.57  

           movie_title  United Kingdom  Spain  Italy  Germany  Denmark  \
0             Iron Man           34.28  12.03  10.8

In [6]:
# Rich-display preview of the Asia-Pacific table (note the NaN cells for some countries).
asia_info.head()

Unnamed: 0,movie_title,South Korea,Russia/CIS,Japan,Thailand,Indonesia,India,Taiwan,Philippines,Singapore,Vietnam,Malaysia,Hong Kong,New Zealand,Australia,China,Other_Asia_Pacific_Countries
0,Iron Man,25.17,9.49,8.66,2.45,2.15,1.99,5.37,3.99,3.82,,3.47,2.84,2.73,19.09,15.27,1.37
1,The Incredible Hulk,6.38,6.41,1.69,1.18,1.5,3.14,1.94,2.07,1.84,0.16,2.28,1.6,0.88,4.55,9.34,0.7
2,Iron Man 2,27.1,14.76,12.83,4.62,4.49,1.23,4.04,6.25,4.19,,4.64,3.76,2.7,22.42,7.92,6.57
3,Thor,14.79,16.54,5.74,2.32,0.27,1.0,5.83,4.03,4.25,,3.73,2.49,1.9,20.14,,5.29
4,Captain America: The First Avenger,3.81,8.64,3.43,2.48,2.05,0.12,6.32,3.58,3.56,0.6,3.0,2.5,1.47,11.11,,3.84


### movie info를 이용한 선형회귀 모델 구축

In [7]:
# Print column dtypes and non-null counts to check for missing values before modelling.
mov_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 11 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   movie_title                            27 non-null     object 
 1   release_date                           27 non-null     object 
 2   season                                 27 non-null     object 
 3   phase                                  27 non-null     int64  
 4   production_budget_in_million_(USD)     27 non-null     int64  
 5   worldwide_collection_in_million_(USD)  27 non-null     float64
 6   tomatometer                            27 non-null     float64
 7   tomato_audience_score                  27 non-null     float64
 8   imdb                                   27 non-null     float64
 9   metascore                              27 non-null     float64
 10  meta_user_score                        27 non-null     float64
dtypes: float

In [8]:
# NOTE(review): train_test_split is already imported in the first import cell;
# this repeat is harmless but the imports could be consolidated at the top.
from sklearn.model_selection import train_test_split
# Linear regression estimator used by the model cell further down.
from sklearn.linear_model import LinearRegression

In [9]:
# Re-display the first rows of the movie table (same call as the earlier preview cell)
# as a reference while choosing model columns.
mov_info.head()

Unnamed: 0,movie_title,release_date,season,phase,production_budget_in_million_(USD),worldwide_collection_in_million_(USD),tomatometer,tomato_audience_score,imdb,metascore,meta_user_score
0,Iron Man,"May 2, 2008",Spring,1,140,585.37,9.4,9.1,7.9,7.9,8.6
1,The Incredible Hulk,"June 13, 2008",Spring,1,150,264.77,6.7,7.0,6.6,6.1,7.0
2,Iron Man 2,"May 7, 2010",Spring,1,200,623.93,7.2,7.1,6.9,5.7,6.4
3,Thor,"May 6, 2011",Spring,1,150,449.33,7.7,7.6,7.0,5.7,7.1
4,Captain America: The First Avenger,"July 22, 2011",Summer,1,140,370.57,7.9,7.5,6.9,6.6,6.8


* meta_user_score(사용자 평점)를 예측하는 모델

In [10]:
# List the column names from which the features and the target are picked.
mov_info.columns

Index(['movie_title', 'release_date', 'season', 'phase',
       'production_budget_in_million_(USD)',
       'worldwide_collection_in_million_(USD)', 'tomatometer',
       'tomato_audience_score', 'imdb', 'metascore', 'meta_user_score'],
      dtype='object')

In [11]:
# Predictor columns: budget, worldwide gross, and four review-score columns.
sel = [
    'production_budget_in_million_(USD)',
    'worldwide_collection_in_million_(USD)',
    'tomatometer',
    'tomato_audience_score',
    'imdb',
    'metascore',
]

# Feature matrix and regression target.
X = mov_info.loc[:, sel]
y = mov_info.loc[:, 'meta_user_score']

# Hold out 10% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): with 27 rows this leaves only 3 test rows (see the shapes
# printed by this cell), so test metrics will be high-variance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

# Cell output: (rows, columns) of the train and test feature frames.
X_train.shape, X_test.shape

((24, 6), (3, 6))

In [16]:
# Fit a linear regression on the training rows (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows; the metric cells below reuse `pred`.
pred = model.predict(X_test)

# score() reports R^2 on the held-out rows.
r2_test = model.score(X_test, y_test)
print( r2_test )

-0.7185104879973476


In [13]:
### Compute MSE (mean squared error) of the test-set predictions
squared_errors = (pred - y_test) ** 2
np.mean(squared_errors)

0.5613800927457998

In [14]:
### Compute MAE (mean absolute error) of the test-set predictions
abs_errors = np.abs(pred - y_test)
np.mean(abs_errors)

0.47688356031118584