# 미니 프로젝트 : 문화(영화) 관객수 예측 모델링
### 5기 박혜진


<div class="alert-danger">
    
# 1. 라이브러리 및 데이터  
  
</div>

In [1]:
# -*- coding: utf-8 -*-
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('font', family='NanumBarunGothic') #matplotlib 한글 깨짐 방지

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import os
import sys

In [3]:
train = pd.read_csv("movies_train.csv", parse_dates=['release_time'])
test = pd.read_csv("movies_test.csv", parse_dates=['release_time'])

In [4]:
submission = pd.read_csv("submission.csv")

### 결측치 확인 / 데이터 전처리

In [5]:
print(train.isna().sum())
print(test.isna().sum())

title               0
distributor         0
genre               0
release_time        0
time                0
screening_rat       0
director            0
dir_prev_bfnum    330
dir_prev_num        0
num_staff           0
num_actor           0
box_off_num         0
dtype: int64
title               0
distributor         0
genre               0
release_time        0
time                0
screening_rat       0
director            0
dir_prev_bfnum    136
dir_prev_num        0
num_staff           0
num_actor           0
dtype: int64


> dir_prev_bfnum에 결측치 존재

In [6]:
train[train['dir_prev_bfnum'].isna()].head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
6,길위에서,백두대간,다큐멘터리,2013-05-23,104,전체 관람가,이창재,,0,32,5,53526
8,"1789, 바스티유의 연인들",유니버설픽쳐스인터내셔널코리아,뮤지컬,2014-09-18,129,전체 관람가,정성복,,0,3,5,4778
9,청춘그루브,(주)두타연,드라마,2012-03-15,94,15세 관람가,변성현,,0,138,3,868
10,AV 아이돌,(주) 케이알씨지,멜로/로맨스,2015-07-27,89,청소년 관람불가,조조 히데오,,0,0,4,745


In [7]:
test[test['dir_prev_bfnum'].isna()].head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
5,회오리 바람,모쿠슈라,드라마,2010-02-25,95,15세 관람가,장건재,,0,156,2
6,경계도시 2,시네마 달,다큐멘터리,2010-03-18,104,15세 관람가,홍형숙,,0,11,2
7,이웃집 남자,(주)루믹스미디어,드라마,2010-03-18,100,청소년 관람불가,장동홍,,0,117,5
8,아마존의 눈물 극장판,마운틴픽쳐스,다큐멘터리,2010-03-25,88,15세 관람가,김진만,,0,76,1
9,반가운 살인자,롯데쇼핑(주)롯데엔터테인먼트,코미디,2010-04-08,107,15세 관람가,김동욱,,0,255,3


In [8]:
print(train[train['dir_prev_bfnum'].isna()]['dir_prev_num'].sum())
print(test[test['dir_prev_bfnum'].isna()]['dir_prev_num'].sum())

0
0


> dir_prev_bnum의 결측치 데이터의 dir_prev_bfnum의 값이 모두 0이므로 마찬가지로 dir_prev_bfnum를 0으로 대체

In [9]:
train['dir_prev_bfnum'].fillna(0, inplace=True)
test['dir_prev_bfnum'].fillna(0, inplace=True)

In [10]:
train.isna().sum() #결측치 재확인

title             0
distributor       0
genre             0
release_time      0
time              0
screening_rat     0
director          0
dir_prev_bfnum    0
dir_prev_num      0
num_staff         0
num_actor         0
box_off_num       0
dtype: int64

In [11]:
test.isna().sum() #결측치 재확인

title             0
distributor       0
genre             0
release_time      0
time              0
screening_rat     0
director          0
dir_prev_bfnum    0
dir_prev_num      0
num_staff         0
num_actor         0
dtype: int64

<div class="alert-danger">
    
# 2. 모델링
  
</div>

In [12]:
# 장르 매핑
train['genre_rank'] = train.genre.map({'뮤지컬' : 1, '다큐멘터리' : 2, '서스펜스' : 3, '애니메이션' : 4, '멜로/로맨스' : 5,
                                      '미스터리' : 6, '공포' : 7, '드라마' : 8, '코미디' : 9, 'SF' : 10, '액션' : 11, '느와르' : 12})
test['genre_rank'] = test.genre.map({'뮤지컬' : 1, '다큐멘터리' : 2, '서스펜스' : 3, '애니메이션' : 4, '멜로/로맨스' : 5,
                                      '미스터리' : 6, '공포' : 7, '드라마' : 8, '코미디' : 9, 'SF' : 10, '액션' : 11, '느와르' : 12})

In [13]:
X = train[['time', 'num_staff', 'num_actor', 'genre_rank', 'screening_rat']]
y = np.log1p(train.box_off_num)

In [14]:
X = pd.get_dummies(columns = ['screening_rat'], data = X)

In [15]:
X['num_actor'] = np.log1p(X['num_actor'])

In [16]:
target = test[['time', 'num_staff', 'num_actor', 'genre_rank', 'screening_rat']]

In [17]:
target = pd.get_dummies(columns = ['screening_rat'], data = target)
target['num_actor'] = np.log1p(target['num_actor'])

In [18]:
import re

train['distributor'] = train.distributor.str.replace("(주)", '')
test['distributor'] = test.distributor.str.replace("(주)", '')
train['distributor'] = [re.sub(r'[^0-9a-zA-Z가-힣]', '', x) for x in train.distributor]
test['distributor'] = [re.sub(r'[^0-9a-zA-Z가-힣]', '', x) for x in test.distributor]

In [19]:
# 10fold 교차 검증
from sklearn.model_selection import KFold

kf = KFold(n_splits =  10, shuffle = True, random_state = 42)

### (1) GradientBoostingRegressor

In [20]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error

gbm = GradientBoostingRegressor(random_state = 42)

In [21]:
rmse_list = []
gb_pred = np.zeros((test.shape[0]))

for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    gbm.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in gbm.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in gbm.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    gb_pred += (sub_pred / 10)

In [22]:
#RMSE
np.mean(rmse_list)

1005828.7184632024

### (2) LGBMRegressor

In [23]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(random_state =42)

In [24]:
rmse_list = []
lgb_pred = np.zeros((test.shape[0]))

for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    lgbm.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in lgbm.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    lgb_pred += (sub_pred / 10)

In [25]:
np.mean(rmse_list)

1049474.568970855

### (3) XGBRegressor

In [26]:
from xgboost import XGBRegressor

xgb = XGBRegressor(random_state = 518)

In [27]:
rmse_list = []
xgb_pred = np.zeros((test.shape[0]))
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    xgb.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in xgb.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in xgb.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    xgb_pred += (sub_pred / 10)



In [28]:
np.mean(rmse_list)

967913.5037705603

### (4) RandomForestRegressor

In [29]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

rf = RandomForestRegressor(random_state = 518)

In [30]:
rmse_list = []
rf_pred = np.zeros((test.shape[0]))
for tr_idx, val_idx in kf.split(X, y) :
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    rf.fit(tr_x, tr_y)
    
    pred = np.expm1([0 if x < 0 else x for x in rf.predict(val_x)])
    sub_pred = np.expm1([0 if x < 0 else x for x in rf.predict(target)])
    rmse = np.sqrt(mean_squared_error(val_y, pred))
    
    rmse_list.append(rmse)
    
    rf_pred += (sub_pred / 10)

In [31]:
np.mean(rmse_list)

803300.2737720159

### 최종 예측 결과

In [32]:
submission['box_off_num'] = (xgb_pred + lgb_pred + rf_pred + gb_pred ) / 4

In [33]:
submission.sort_values(by = 'box_off_num')

Unnamed: 0,title,box_off_num
188,정사: 위험한 성적유희,3.217085e+02
238,해에게서 소년에게,3.935501e+02
72,잔혹한 앵글의 로망스,4.324827e+02
133,롤플레이 2 : 동침,4.454077e+02
164,카토 프로젝트,4.580037e+02
74,롤플레이,5.070051e+02
237,어떤이의 꿈,5.276130e+02
136,미녀전쟁,5.304295e+02
26,할,5.403726e+02
130,댄서김의 은밀한 교수법,5.619229e+02


In [34]:
submission.to_csv("mini2.csv", index = False, encoding='utf-8-sig')