In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error  , r2_score
from sklearn.svm import SVR

data = pd.read_csv('/content/(final)movie_data_set.csv')
data = data.replace(',','', regex=True)
data =data.astype({'movie_code':'str','screen_number':'float'})

# 어벤저스 / 뽀로로 / 걸캅스 / 악인전
sample = data[data['movie_code'].str.contains('136900|183136|174065|177967')]

sample=sample.append({'movie_code' : 9999 , 'movie_name' : '스파이더맨', 'screen_number' : 2000,
                      'distribution_company_월트디즈니컴퍼니코리아 유한책임회사':1,'national_미국':1,'genre_어드벤처':1,
                      'grade_12세이상관람가':1,'movie_type_일반영화':1} , ignore_index=True)

sample= sample.fillna(0)

col_x = data.columns[3:169-5].to_list() # 5개 리뷰데이터 제거
col_y = data.columns[169:].to_list()

x = data[col_x]
y = data[col_y]

sample_x = sample[col_x]
sample_y = sample[col_y]

In [2]:
#train / test set 분리
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2)

In [3]:
#sMAPE
def smape(A, F):
    return 100 / len(A) * np.sum(np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [4]:
# simple regression
result_regression = {}
for i in range(9):
    mlr = LinearRegression()
    mlr.fit(x_train, y_train[[y_train.columns[i]]])
    y_predict = mlr.predict(x_test)
    
    result_regression[y_test.columns[i]] = float(smape( y_test[[y_test.columns[i]]], y_predict ))

In [5]:
result_regression

{'cgv7_days': 76.8520979567419,
 'cgv_15_days': 78.95274866162472,
 'cgv_3_days': 78.85677851501488,
 'jcon_15_days': 93.40270511358449,
 'jcon_3_days': 75.19799135510343,
 'jcon_7_days': 73.16774761748736,
 'lotte_15_days': 79.98697092561926,
 'lotte_3_days': 79.11858831431832,
 'lotte_7_days': 75.24495915384232}

In [6]:
#SVR
# Create Support Vector Regression model
# kernel : 선형 커널
# C : 학습 오류에 대한 패널티, C 값이 클 수록 모델이 학습 데이터에 좀 더 최적화 됨, 너무 크면 오버피팅 발생
# Epsilon : 임계값, 예측한 값이 GT 범위 안에 있으면 패널티 부여 X

result_svr = {}

Cs = np.arange(1,2,0.5)
epsilons = np.arange(0.1,0.3,0.1)

for c in Cs:
    for epsilon in epsilons:

        print(c,epsilon)
        
        for i in range(9):

            sv_regressor = SVR(kernel='linear', C=c, epsilon=epsilon)
            sv_regressor.fit(x_train, y_train[[y_train.columns[i]]].values.ravel())
            y_predict = sv_regressor.predict(x_test)

            smape_value = float(smape( np.array(y_test[[y_test.columns[i]]].iloc[:,0]) , y_predict ))

            result_svr[y_test.columns[i]+':'+str(c)+':'+str(epsilon)] = [smape_value,sv_regressor.predict(sample_x)]

            print(result_svr[y_test.columns[i]+':'+str(c)+':'+str(epsilon)] )



1.0 0.1
[77.05104050985608, array([-0.65530386, -0.04914078, -1.20209332, -0.82497659, -2.3239381 ])]
[70.49096831157547, array([-0.17749033, -2.31035051, -1.83552362, -1.78494872, -0.52075991])]
[69.84852501230918, array([-2.32799429, -3.73191664, -1.18679198, -2.01413229, -2.65763478])]
[77.88670699710012, array([-0.63885403, -1.32844473, -1.91813236, -0.97901126, -1.10694016])]
[75.34068861629481, array([-0.43818107, -2.71251117, -1.23051368, -0.4445587 ,  1.28774052])]
[77.09677009621086, array([-2.29272471, -3.16024782,  0.26300452,  0.59232091,  1.10566423])]
[76.32455976376181, array([ 0.95132035, -0.2949794 , -2.40211704, -0.10277618, -0.86712727])]
[77.10172415805164, array([ 0.70673187, -0.8429645 , -0.27563444, -1.58068496,  0.79119889])]
[74.9534055664695, array([-0.64584152, -5.04691251, -0.20325494,  0.26999067,  0.16959714])]
1.0 0.2
[77.68744818284384, array([-0.60412477, -0.0798535 , -1.32174997, -0.83087363, -2.1788212 ])]
[71.21320642443466, array([-0.07741503, -2.21

In [7]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf


# MLP 모델 아키텍처 정의

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

def build_model(num_input=1):
    model = Sequential()
    model.add(Dense(166, activation='relu', input_dim=num_input))
    model.add(Dense(84, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
    return model

model = build_model(num_input=166-5)

result_nn = {}

for i in range(9):

  # 미니 배치 학습
  model.fit(x_train, y_train[[y_train.columns[i]]], epochs=200, batch_size=64, verbose=5)

  result_nn[y_test.columns[i]] = smape( np.array(y_test[[y_test.columns[i]]].iloc[:,0]) , model.predict(x_test))


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [8]:
result_regression
result_pd = pd.DataFrame.from_dict(result_regression, orient='index')
result_pd.columns = ['result_regression']
print(result_pd)

               result_regression
lotte_3_days           79.118588
lotte_7_days           75.244959
lotte_15_days          79.986971
cgv_3_days             78.856779
cgv7_days              76.852098
cgv_15_days            78.952749
jcon_3_days            75.197991
jcon_7_days            73.167748
jcon_15_days           93.402705


In [10]:
lst = list(result_svr.keys())
tmp_lst =[]
for i in range(len(lst)):
  tmp_lst.append(result_svr[lst[i]][0])
  """
  print(lst[i].split(':')[0])
  print(lst[i].split(':')[1]+':'+lst[i].split(':')[2])
  print(result_svr[lst[i]][0])
  """  
tmp_lst = np.array(tmp_lst)
tmp_pd = pd.DataFrame(tmp_lst.reshape(9,4))
tmp_pd.columns = ['c:1,e:0.1','c:1,e:0.2','c:1.5,e:0.1','c:1.5,e:0.2']
tmp_pd.index = result_pd.index

result_pd = pd.merge(result_pd, tmp_pd, left_index=True,right_index = True , how='left')

In [11]:
tmp_pd = pd.DataFrame.from_dict(result_nn, orient='index')
tmp_pd.columns = ['nn']

result_pd = pd.merge(result_pd, tmp_pd, left_index=True,right_index = True , how='left')

In [12]:
result_pd.to_csv('final_model_result.csv')