In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier , StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

In [3]:
train_df = pd.read_csv('./datasets/train_ss.csv', index_col=0)
test_df = pd.read_csv('./datasets/test_ss.csv', index_col=0)

In [4]:
X_train = train_df[['전국스크린수','전국매출액','개봉일관객수','개봉일매출비율','배우가중치','감독가중치','등급_15','등급_청불','국내배급사','주요배급사']]
y_train = train_df[['Label']]
X_test = test_df[['전국스크린수','전국매출액','개봉일관객수','개봉일매출비율','배우가중치','감독가중치','등급_15','등급_청불','국내배급사','주요배급사']]
y_test = test_df[['Label']]

In [5]:
model = RandomForestClassifier(random_state=0, criterion = 'entropy', max_depth = 8, max_features = 'auto', min_samples_leaf = 4,
                                min_samples_split = 7, n_estimators = 20)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [10]:
def eval(test,pred):
    acc = accuracy_score(test,pred)
    precision = precision_score(test,pred)
    recall = recall_score(test,pred)
    f1 = f1_score(test,pred)
    print('##############\n',confusion_matrix(test,pred),
    "\n############\n",f'acc_score: {acc} \n precision: {precision} \n recall: {recall} \n f1_score: {f1}')

## 임계값 =[] 리스트생성 후 for 구문
def get_eval_by_threshold(y_test , pred_proba_c1, thresholds):
    # thresholds list객체내의 값을 차례로 iteration하면서 Evaluation 수행.
    for custom_threshold in thresholds:
        binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_c1) 
        custom_predict = binarizer.transform(pred_proba_c1)
        print('임곗값:',custom_threshold)
        eval(y_test , custom_predict)


def precision_recall_curve_plot(y_test , pred_proba_c1):
    # threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출. 
    precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)
    
    # X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
    plt.figure(figsize=(8,6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')
    
    # threshold 값 X 축의 Scale을 0.1 단위로 변경
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1),2))
    
    # x축, y축 label과 legend, 그리고 grid 설정
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()
    

## roc curve_plot    
def roc_curve_plot(y_test,pred_proba):
    fprs, tprs, thresholds = roc_curve(y_test,pred_proba) ## 입력시 1레이블 컬럼만 추출
    
    # Roc curve를 plot 곡선으로 기름
    plt.plot(fprs,tprs,label='ROC')
    ## 가운데 대각선 직선을 그림
    plt.plot([0,1],[0,1],"k--",label="ramdom")
    
    ## fpr x축을 scale을 0.1단위로 변경 x,y축 명 설정
    start,end = plt.xlim()
    plt.xticks(np.round(np.arange(start,end,0.1),2))
    plt.xlim(0,1)
    plt.ylim(0,1)
    plt.xlabel('FPR(1-sensitivity)')
    plt.ylabel('TPR(recall)')
    plt.legend()
    plt.show()

In [11]:
eval(y_test, y_pred)

##############
 [[54 75]
 [59 88]] 
############
 acc_score: 0.5144927536231884 
 precision: 0.5398773006134969 
 recall: 0.5986394557823129 
 f1_score: 0.567741935483871


In [13]:
a = y_test
a["예측값"] = y_pred
a.reset_index(inplace=True)
df_2 = pd.read_csv('./datasets/1. 전처리완료.csv', index_col=0)
df_3 = df_2[['영화명',"전국관객수", "개봉일"]]
df_3.reset_index(inplace=True)
df_4 = pd.merge(a,df_3,on='index',how='left')
df_5 = df_4[df_4["전국관객수"]>10000000]
df_5

Unnamed: 0,level_0,index,Label,예측값,영화명,전국관객수,개봉일
0,0,0,1,1,명량,17613682,2014-07-30
1,1,1,0,1,극한직업,16264944,2019-01-23
2,2,2,1,1,신과함께-죄와 벌,14410754,2017-12-20
3,3,3,0,1,국제시장,14245998,2014-12-17
4,4,4,1,0,어벤져스: 엔드게임,13934592,2019-04-24
5,5,5,0,1,베테랑,13395400,2015-08-05
6,6,6,1,0,겨울왕국 2,13369064,2019-11-21
7,7,7,0,0,도둑들,12983330,2012-07-25
8,8,8,1,1,7번방의 선물,12811206,2013-01-23
9,9,9,1,0,암살,12705700,2015-07-22


In [15]:
e = df_5["예측값"]
g = df_5["Label"]
print(accuracy_score(e,g))

0.42857142857142855


In [16]:
df_4

Unnamed: 0,level_0,index,Label,예측값,영화명,전국관객수,개봉일
0,0,0,1,1,명량,17613682,2014-07-30
1,1,1,0,1,극한직업,16264944,2019-01-23
2,2,2,1,1,신과함께-죄와 벌,14410754,2017-12-20
3,3,3,0,1,국제시장,14245998,2014-12-17
4,4,4,1,0,어벤져스: 엔드게임,13934592,2019-04-24
...,...,...,...,...,...,...,...
271,271,271,0,1,초능력자,2164805,2010-11-10
272,272,272,0,1,신의 한 수: 귀수편,2146764,2019-11-07
273,273,273,1,0,마이 웨이,2142622,2011-12-21
274,274,274,1,1,겟 아웃,2138146,2017-05-17


In [18]:
df = pd.read_csv('./datasets/영화업종지수.csv', index_col=0)
df

Unnamed: 0,일자,수정주가지수
0,2019/12/30,284.909520
1,2019/12/27,281.475255
2,2019/12/26,277.548066
3,2019/12/24,275.803342
4,2019/12/23,279.196411
...,...,...
4931,2000/01/10,80.606965
4932,2000/01/07,80.101663
4933,2000/01/06,89.717078
4934,2000/01/05,91.902233


In [19]:
df['일자'] = pd.to_datetime(df['일자'])
df['수정주가지수12'] = df['수정주가지수'].shift(11)
df.columns = ['개봉일', '수정주가지수', '수정주가지수12']
df_4['개봉일'] = pd.to_datetime(df_4['개봉일'])
df_4 = df_4.reset_index(drop='index')
df2 = pd.merge(df_4, df, how='left', on='개봉일')
df2['수정주가지수'] = df2['수정주가지수'].interpolate()
df2['수정주가지수12'] = df2['수정주가지수12'].interpolate()

In [20]:
df2['영화12일수익률'] = (df2['수정주가지수12'] - df2['수정주가지수']) / df2['수정주가지수']

In [21]:
df2

Unnamed: 0,level_0,index,Label,예측값,영화명,전국관객수,개봉일,수정주가지수,수정주가지수12,영화12일수익률
0,0,0,1,1,명량,17613682,2014-07-30,289.105978,276.149475,-0.044816
1,1,1,0,1,극한직업,16264944,2019-01-23,352.413384,359.825668,0.021033
2,2,2,1,1,신과함께-죄와 벌,14410754,2017-12-20,294.796475,282.880336,-0.040422
3,3,3,0,1,국제시장,14245998,2014-12-17,257.122593,230.748811,-0.102573
4,4,4,1,0,어벤져스: 엔드게임,13934592,2019-04-24,389.588333,342.474973,-0.120931
...,...,...,...,...,...,...,...,...,...,...
271,271,271,0,1,초능력자,2164805,2010-11-10,169.868477,164.139688,-0.033725
272,272,272,0,1,신의 한 수: 귀수편,2146764,2019-11-07,290.871752,280.458164,-0.035801
273,273,273,1,0,마이 웨이,2142622,2011-12-21,199.993422,201.401922,0.007043
274,274,274,1,1,겟 아웃,2138146,2017-05-17,275.405645,276.803764,0.005077


In [22]:
kosdaq = pd.read_csv('./datasets/kosdaq.csv', index_col=0)
kosdaq.reset_index(inplace=True)

In [23]:
kosdaq

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2009-10-01,508.940002,509.589996,495.779999,502.549988,502.549988,564200.0
1,2009-10-05,498.970001,502.420013,496.910004,497.829987,497.829987,578200.0
2,2009-10-06,502.529999,503.470001,492.459991,494.420013,494.420013,593100.0
3,2009-10-07,499.500000,500.130005,491.359985,491.369995,491.369995,613300.0
4,2009-10-08,494.279999,496.959991,491.619995,496.959991,496.959991,586600.0
...,...,...,...,...,...,...,...
2523,2019-12-23,651.770020,652.809998,646.679993,647.619995,647.619995,700.0
2524,2019-12-24,648.710022,648.710022,638.739990,638.909973,638.909973,800.0
2525,2019-12-26,640.770020,652.090027,640.200012,652.070007,652.070007,900.0
2526,2019-12-27,652.599976,661.590027,652.169983,661.239990,661.239990,800.0


In [24]:
kosdaq['Date'] = pd.to_datetime(kosdaq['Date'])
kosdaq['Adj Close12'] = kosdaq['Adj Close'].shift(11)
kosdaq = kosdaq[['Date', 'Adj Close', 'Adj Close12']]
kosdaq.columns = ['개봉일', '코스닥수정주가지수', '코스닥수정주가지수12']

In [25]:
kosdaq

Unnamed: 0,개봉일,코스닥수정주가지수,코스닥수정주가지수12
0,2009-10-01,502.549988,
1,2009-10-05,497.829987,
2,2009-10-06,494.420013,
3,2009-10-07,491.369995,
4,2009-10-08,496.959991,
...,...,...,...
2523,2019-12-23,647.619995,628.099976
2524,2019-12-24,638.909973,627.859985
2525,2019-12-26,652.070007,627.109985
2526,2019-12-27,661.239990,629.130005


In [26]:
# df1['개봉일'] = pd.to_datetime(df1['개봉일'])
# df1 = df1.reset_index(drop='index')
df2 = pd.merge(df2, kosdaq, how='left', on='개봉일')
df2['코스닥수정주가지수'] = df2['코스닥수정주가지수'].interpolate()
df2['코스닥수정주가지수12'] = df2['코스닥수정주가지수12'].interpolate()

In [27]:
movie_df = df2[df2['예측값'] == 0]

In [28]:
movie_df = movie_df[movie_df['전국관객수'] > 10000000]

In [29]:
movie_df['코스닥12일수익률'] = (movie_df['코스닥수정주가지수12'] - movie_df['코스닥수정주가지수']) / movie_df['코스닥수정주가지수']

In [30]:
movie_df['영화누적수익률'] = (1 + movie_df['영화12일수익률']).cumprod() -1
movie_df['코스닥누적수익률'] = (1 + movie_df['코스닥12일수익률']).cumprod() -1

In [31]:
movie_df['초과수익여부'] = movie_df['코스닥12일수익률'] < movie_df['영화12일수익률']

In [33]:
movie_df = movie_df[['영화명','Label', '예측값' ,'영화누적수익률', '코스닥누적수익률', '초과수익여부']].reset_index(drop=True)

In [34]:
movie_df

Unnamed: 0,영화명,Label,예측값,영화누적수익률,코스닥누적수익률,초과수익여부
0,어벤져스: 엔드게임,1,0,-0.120931,-0.001333,False
1,겨울왕국 2,1,0,-0.135594,0.051569,False
2,도둑들,0,0,-0.119713,0.138568,False
3,암살,1,0,-0.17559,0.069761,False
4,알라딘,0,0,-0.143302,0.144181,False
5,부산행,1,0,-0.207496,0.124219,False
6,변호인,1,0,-0.19266,0.176327,False
7,어벤져스: 인피니티 워,1,0,-0.210636,0.186293,False
8,기생충,1,0,-0.201104,0.255595,False
