In [2]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error  , r2_score
from sklearn.svm import SVR

data = pd.read_csv('/content/(final)movie_data_set.csv')
data = data.replace(',','', regex=True)
data =data.astype({'movie_code':'str','screen_number':'float'})

sample = data[data['movie_code'].str.contains('177967|163788|137327|161967|170290|164125')]


col_x = data.columns[3:169].to_list()
col_y = data.columns[169:].to_list()

x = data[col_x]
y = data[col_y]

sample_x = sample[col_x]
sample_y = sample[col_y]

In [3]:
#train / test set 분리
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, test_size=0.2)

In [4]:
#sMAPE
def smape(A, F):
    return 100 / len(A) * np.sum(np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [5]:
# simple regression
result_regression = {}
for i in range(9):
    mlr = LinearRegression()
    mlr.fit(x_train, y_train[[y_train.columns[i]]])
    y_predict = mlr.predict(x_test)
    
    result_regression[y_test.columns[i]] = float(smape( y_test[[y_test.columns[i]]], y_predict ))

In [6]:
result_regression

{'cgv7_days': 78.23402933216789,
 'cgv_15_days': 73.91405790342606,
 'cgv_3_days': 78.83670630285553,
 'jcon_15_days': 90.9565604557603,
 'jcon_3_days': 72.81673744370492,
 'jcon_7_days': 70.81318515494029,
 'lotte_15_days': 77.34062213634722,
 'lotte_3_days': 82.6893276439129,
 'lotte_7_days': 81.87415894917535}

In [7]:
#SVR
# Create Support Vector Regression model
# kernel : 선형 커널
# C : 학습 오류에 대한 패널티, C 값이 클 수록 모델이 학습 데이터에 좀 더 최적화 됨, 너무 크면 오버피팅 발생
# Epsilon : 임계값, 예측한 값이 GT 범위 안에 있으면 패널티 부여 X

result_svr = {}

Cs = np.arange(1,2,0.5)
epsilons = np.arange(0.1,0.3,0.1)

for c in Cs:
    for epsilon in epsilons:

        print(c,epsilon)
        
        for i in range(9):

            sv_regressor = SVR(kernel='linear', C=c, epsilon=epsilon)
            sv_regressor.fit(x_train, y_train[[y_train.columns[i]]].values.ravel())
            y_predict = sv_regressor.predict(x_test)

            smape_value = float(smape( np.array(y_test[[y_test.columns[i]]].iloc[:,0]) , y_predict ))

            result_svr[y_test.columns[i]+':'+str(c)+':'+str(epsilon)] = [smape_value,sv_regressor.predict(sample_x)]

            print(result_svr[y_test.columns[i]+':'+str(c)+':'+str(epsilon)] )



1.0 0.1
[81.43675863446192, array([-0.44863361,  0.12245213, -0.84277063, -0.37561134,  1.78768468,
       -1.03615827])]
[77.5011442701018, array([-1.07790237, -0.29373627, -2.08952614, -0.2547144 ,  0.14289229,
       -1.03037501])]
[70.55108995583065, array([-2.59375043, -2.45709612, -3.81269391, -0.96170489, -2.00889073,
       -2.16088518])]
[72.05647678022473, array([ 0.52825715, -1.95132572, -3.66424484,  0.25591993, -0.48322782,
       -1.77340638])]
[73.92147024199328, array([-0.03884487, -0.01902621, -3.3906757 ,  0.79455995, -0.20996098,
        0.36512502])]
[77.78956546462055, array([ 1.80115815,  0.54241879, -1.70434606,  2.60221505,  0.15644866,
        0.49518055])]
[75.27085847779097, array([-1.98131762, -0.8726806 , -0.54088057,  1.70385228,  0.75961048,
       -0.58504324])]
[76.88189584198192, array([-1.45614864, -1.78581658, -1.93549579,  0.78353327, -0.66276996,
        0.6122151 ])]
[78.36514517886836, array([ 0.57838612, -0.50207   , -4.51127307,  1.25177615,  0

In [None]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf


# MLP 모델 아키텍처 정의

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

def build_model(num_input=1):
    model = Sequential()
    model.add(Dense(166, activation='relu', input_dim=num_input))
    model.add(Dense(84, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))
    
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    
    return model

model = build_model(num_input=166)

result_nn = {}

for i in range(9):

  # 미니 배치 학습
  model.fit(x_train, y_train[[y_train.columns[i]]], epochs=200, batch_size=64, verbose=5)

  result_nn[y_test.columns[i]] = smape( np.array(y_test[[y_test.columns[i]]].iloc[:,0]) , model.predict(x_test))


In [57]:
result_regression
result_pd = pd.DataFrame.from_dict(result_regression, orient='index')
result_pd.columns = ['result_regression']
print(result_pd)

               result_regression
lotte_3_days           82.689328
lotte_7_days           81.874159
lotte_15_days          77.340622
cgv_3_days             78.836706
cgv7_days              78.234029
cgv_15_days            73.914058
jcon_3_days            72.816737
jcon_7_days            70.813185
jcon_15_days           90.956560


In [67]:
lst = list(result_svr.keys())
tmp_lst =[]

for i in range(len(lst)):
  tmp_lst.append(result_svr[lst[i]][0])
  """
  print(lst[i].split(':')[0])
  print(lst[i].split(':')[1]+':'+lst[i].split(':')[2])
  print(result_svr[lst[i]][0])
  """  
tmp_lst = np.array(tmp_lst)
tmp_pd = pd.DataFrame(tmp_lst.reshape(9,4))
tmp_pd.columns = ['c:1,e:0.1','c:1,e:0.2','c:1.5,e:0.1','c:1.5,e:0.2']
tmp_pd.index = result_pd.index

result_pd = pd.merge(result_pd, tmp_pd, left_index=True,right_index = True , how='left')

               result_regression  c:1,e:0.1  ...  c:1.5,e:0.1  c:1.5,e:0.2
lotte_3_days           82.689328  81.436759  ...    70.551090    72.056477
lotte_7_days           81.874159  73.921470  ...    75.270858    76.881896
lotte_15_days          77.340622  78.365145  ...    77.691502    71.112318
cgv_3_days             78.836706  71.353446  ...    77.779261    76.072924
cgv7_days              78.234029  76.755469  ...    80.649253    77.310938
cgv_15_days            73.914058  68.924432  ...    72.925475    76.609435
jcon_3_days            72.816737  74.533324  ...    76.813528    79.869373
jcon_7_days            70.813185  77.362139  ...    70.302672    74.026223
jcon_15_days           90.956560  76.596625  ...    75.550943    77.077363

[9 rows x 5 columns]


In [69]:
tmp_pd = pd.DataFrame.from_dict(result_nn, orient='index')
tmp_pd.columns = ['nn']

result_pd = pd.merge(result_pd, tmp_pd, left_index=True,right_index = True , how='left')

In [71]:
result_pd.to_csv('final_model_result.csv')