# 2. Second Version : 데이터 불균형이 심한 컬럼들 Log 변환
- 인원수(_cnt): 독립변수, 종속변수에 들어갈 컬럼 모두
- 시즌 내, 시즌 간 gap_days
- 연령제한(age_rating)

## 01. import, log 변환 후 분포도

In [151]:
# 모듈 불러오기
import pandas as pd
import numpy as np
import re
import ast

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


from sklearn.model_selection import learning_curve

In [152]:
# 데이터 불러오기
df = pd.read_excel('../data/file/ml_all.xlsx', index_col=0)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 601 entries, 0 to 600
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   korean_title                  601 non-null    object 
 1   english_title                 601 non-null    object 
 2   year                          601 non-null    int64  
 3   total_season_num              601 non-null    int64  
 4   runtime                       601 non-null    int64  
 5   genre                         601 non-null    object 
 6   age_rating                    601 non-null    int64  
 7   production_country            601 non-null    object 
 8   PCA                           601 non-null    float64
 9   genre_len                     601 non-null    int64  
 10  production_country_freq_code  601 non-null    int64  
 11  production_country_code       601 non-null    int64  
 12  seson_n                       601 non-null    int64  
 13  i_s1_rate 

In [153]:
# 장르 문자열을 분리하여 개별 항목으로 변환
df['genre'] = df['genre'].str.split(', ')

# 각 영화에 대한 모든 고유한 장르를 수집
all_genres = set(genre for sublist in df['genre'] for genre in sublist)

# Multi-Hot Encoding을 수행할 데이터프레임 초기화
for genre in all_genres:
    df[genre] = 0

# 각 영화에 대해 해당하는 장르에 1 할당
for index, row in df.iterrows():
    for genre in row['genre']:
        df.at[index, genre] = 1

# 장르 열을 삭제
df = df.drop(columns=['genre'])

print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 601 entries, 0 to 600
Data columns (total 42 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   korean_title                  601 non-null    object 
 1   english_title                 601 non-null    object 
 2   year                          601 non-null    int64  
 3   total_season_num              601 non-null    int64  
 4   runtime                       601 non-null    int64  
 5   age_rating                    601 non-null    int64  
 6   production_country            601 non-null    object 
 7   PCA                           601 non-null    float64
 8   genre_len                     601 non-null    int64  
 9   production_country_freq_code  601 non-null    int64  
 10  production_country_code       601 non-null    int64  
 11  seson_n                       601 non-null    int64  
 12  i_s1_rate                     601 non-null    float64
 13  i_s1_rate_

In [154]:
# 제작국가 인코딩한 컬럼으로 대체
df['production_country'] = df['production_country_code']

In [155]:
df.head()

Unnamed: 0,korean_title,english_title,year,total_season_num,runtime,age_rating,production_country,PCA,genre_len,production_country_freq_code,...,범죄,SF,전쟁,역사,코미디,서부,Made in Europe,스릴러,공포,드라마
0,스위트 투스: 사슴뿔을 가진 소년,Sweet Tooth,2021,2,52,15,221,2.59594,4,221,...,0,1,0,0,0,0,0,0,0,1
1,스위트홈,Sweet Home,2020,2,60,19,176,0.898617,6,176,...,1,1,0,0,0,0,0,1,1,1
2,굿 닥터,The Good Doctor,2017,6,43,15,221,2.471506,1,221,...,0,0,0,0,0,0,0,0,0,1
3,워킹 데드,The Walking Dead,2010,11,46,19,221,4.999856,5,221,...,0,1,0,0,0,0,0,1,1,1
4,"비르기트: 왕국, 권력, 영광",Borgen - Power & Glory,2022,4,58,15,11,1.375249,3,11,...,0,0,1,0,0,0,1,0,0,1


In [156]:
# 로그 변환 적용 

df['i_s2_rate_cnt'] = np.log1p(df['i_s2_rate_cnt'])  # log(1+x) 변환 사용
df['w_s2_rate_cnt'] = np.log1p(df['w_s2_rate_cnt'])
df['s2_rate_cnt_retention'] = np.log1p(df['s2_rate_cnt_retention'])

df['i_s1_rate_cnt'] = np.log1p(df['i_s1_rate_cnt'])
df['w_s1_rate_cnt'] = np.log1p(df['w_s1_rate_cnt'])
df['s1_rate_cnt_retention'] = np.log1p(df['s1_rate_cnt_retention'])

df['s1_gap_days'] = np.log1p(df['s1_gap_days'])
df['season_gaps_days'] = np.log1p(df['season_gaps_days'])
df['age_rating'] = np.log1p(df['age_rating'])

In [157]:
# 분포도

# 첫 번째 그래프: 6개의 속성 히스토그램
fig1 = make_subplots(rows=2, cols=3, subplot_titles=['i_s2_rate', 'i_s2_rate_cnt', 'w_s2_rate', 'w_s2_rate_cnt', 's2_rate_retention', 's2_rate_cnt_retention'])
columns1 = ['i_s2_rate', 'i_s2_rate_cnt', 'w_s2_rate', 'w_s2_rate_cnt', 's2_rate_retention', 's2_rate_cnt_retention']

for i, col in enumerate(columns1):
    fig1.add_trace(go.Histogram(x=df[col], name=col), row=(i // 3) + 1, col=(i % 3) + 1)

fig1.update_layout(title_text='y값', showlegend=False)

# 두 번째 그래프: 8개의 속성 히스토그램
fig2 = make_subplots(rows=2, cols=4, subplot_titles=['runtime', 'age_rating', 'production_country', 'seson_n', 'PCA', 'genre_len', 's1_gap_days', 'season_gaps_days'])
columns2 = ['runtime', 'age_rating', 'production_country', 'seson_n', 'PCA', 'genre_len', 's1_gap_days', 'season_gaps_days']

for i, col in enumerate(columns2):
    fig2.add_trace(go.Histogram(x=df[col], name=col), row=(i // 4) + 1, col=(i % 4) + 1)

fig2.update_layout(title_text='x값-특징', showlegend=False)

# 세 번째 그래프: 6개의 속성 히스토그램
fig3 = make_subplots(rows=2, cols=3, subplot_titles=['i_s1_rate', 'i_s1_rate_cnt', 'w_s1_rate', 'w_s1_rate_cnt', 's1_rate_retention', 's1_rate_cnt_retention'])
columns3 = ['i_s1_rate', 'i_s1_rate_cnt', 'w_s1_rate', 'w_s1_rate_cnt', 's1_rate_retention', 's1_rate_cnt_retention']

for i, col in enumerate(columns3):
    fig3.add_trace(go.Histogram(x=df[col], name=col), row=(i // 3) + 1, col=(i % 3) + 1)

fig3.update_layout(title_text='x값-인기도', showlegend=False)

# 그래프 출력
fig1.show()
fig2.show()
fig3.show()

---

## 02-1. Modeling

In [159]:
# Score(y값)

# 각 요소 계산
# 점수 부여: 인원 수에만 로그를 취함
rate = (0.8 * df['i_s2_rate']) + (0.2 * df['w_s2_rate'])
cnt = (0.8 * df['i_s2_rate_cnt']) + (0.2 * df['w_s2_rate_cnt'])
ret = (0.4 * df['s2_rate_retention']) + (0.6 * df['s2_rate_cnt_retention']) *10      # 유지도(ret는 범위가 너무 작기에 균형을 위해 -> *10 )

# 총 점수 계산
total = (0.3 * rate + 0.5 * cnt + 0.2 * ret) *10

# 점수를 데이터프레임에 추가
df['score'] = total

In [160]:
df = df[['runtime','age_rating','production_country','PCA','genre_len','seson_n','i_s1_rate','i_s1_rate_cnt','s1_rate_retention','s1_rate_cnt_retention','w_s1_rate','w_s1_rate_cnt','s1_gap_days','season_gaps_days','score',
         'Made in Europe', 'SF', '가족', '공포', '드라마', '로맨스', '범죄', '서부', '스릴러', '스포츠', '액션', '역사', '음악', '전쟁', '코미디' ,'판타지']]

In [161]:
X = df.drop(['score'], axis=1)  # 'score' 열을 제외한 모든 열 선택
y = df['score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [162]:
# DecisionTreeRegressor 모델 초기화
dt_model = DecisionTreeRegressor(random_state=13)

# 군집에 속한 데이터로 모델 학습
dt_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
predictions = dt_model.predict(X_test)

# 회귀 모델의 성능 평가
dt_mse = mean_squared_error(y_test, predictions)
dt_mae = mean_absolute_error(y_test, predictions)
dt_r2 = r2_score(y_test, predictions)

model_result = pd.DataFrame({'MSE': [dt_mse], 'MAE': [dt_mae], 'R^2': [dt_r2]}, index=['DecisionTreeRegressor'])

In [163]:
# RandomForestRegressor 모델 초기화
rf_model = RandomForestRegressor(random_state=13)

# 군집에 속한 데이터로 모델 학습
rf_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측
predictions = rf_model.predict(X_test)

# 회귀 모델의 성능 평가
rf_mse = mean_squared_error(y_test, predictions)
rf_mae = mean_absolute_error(y_test, predictions)
rf_r2 = r2_score(y_test, predictions)

model_result = pd.DataFrame({'MSE': [rf_mse], 'MAE': [rf_mae], 'R2': [rf_r2]}, index=['RandomForestRegressor'])

In [164]:
# KNeighborsRegressor 모델 초기화
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)

# 회귀 모델의 성능 평가
knn_mse = mean_squared_error(y_test, knn_predictions)
knn_mae = mean_absolute_error(y_test, knn_predictions)
knn_r2 = r2_score(y_test, knn_predictions)

model_result = pd.DataFrame({'MSE': [knn_mse], 'MAE': [knn_mae], 'R2': [knn_r2]}, index=['KNeighborsRegressor'])
print(model_result)

                           MSE       MAE        R2
KNeighborsRegressor  28.119716  4.106433  0.730882


In [165]:
# XGBRegressor 모델 초기화
xgb_model = XGBRegressor(random_state=13)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

# 회귀 모델의 성능 평가
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)

model_result = pd.DataFrame({'MSE': [xgb_mse], 'MAE': [xgb_mae], 'R2': [xgb_r2]}, index=['XGBRegressor'])
print(model_result)

                    MSE       MAE        R2
XGBRegressor  12.912952  2.692871  0.876418


In [166]:
# AdaBoostRegressor 모델 초기화
ada_model = AdaBoostRegressor(random_state=13)
ada_model.fit(X_train, y_train)
ada_predictions = ada_model.predict(X_test)

# 회귀 모델의 성능 평가
ada_mse = mean_squared_error(y_test, ada_predictions)
ada_mae = mean_absolute_error(y_test, ada_predictions)
ada_r2 = r2_score(y_test, ada_predictions)

model_result = pd.DataFrame({'MSE': [ada_mse], 'MAE': [ada_mae], 'R2': [ada_r2]}, index=['AdaBoostRegressor'])
print(model_result)

                         MSE       MAE        R2
AdaBoostRegressor  13.601636  2.819139  0.869827


In [None]:
# LGBMRegressor 모델 초기화
lgb_model = LGBMRegressor(random_state=13)
lgb_model.fit(X_train, y_train)
lgb_predictions = lgb_model.predict(X_test)

# 회귀 모델의 성능 평가
lgb_mse = mean_squared_error(y_test, lgb_predictions)
lgb_mae = mean_absolute_error(y_test, lgb_predictions)
lgb_r2 = r2_score(y_test, lgb_predictions)

model_result = pd.DataFrame({'MSE': [lgb_mse], 'MAE': [lgb_mae], 'R2': [lgb_r2]}, index=['LGBMRegressor'])
print(model_result)

In [168]:
# GradientBoostingRegressor 모델 초기화
gbr_model = GradientBoostingRegressor(random_state=13)
gbr_model.fit(X_train, y_train)
gbr_predictions = gbr_model.predict(X_test)

# 회귀 모델의 성능 평가
gbr_mse = mean_squared_error(y_test, gbr_predictions)
gbr_mae = mean_absolute_error(y_test, gbr_predictions)
gbr_r2 = r2_score(y_test, gbr_predictions)

model_result = pd.DataFrame({'MSE': [gbr_mse], 'MAE': [gbr_mae], 'R2': [gbr_r2]}, index=['GradientBoostingRegressor'])
print(model_result)

                                MSE       MAE        R2
GradientBoostingRegressor  9.929124  2.430978  0.904974


In [169]:
# LinearRegression 모델 초기화
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

# LinearRegression 모델의 성능 평가
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)

model_result = pd.DataFrame({'MSE': [lr_mse], 'MAE': [lr_mae], 'R2': [lr_r2]}, index=['LinearRegression'])

---

## 02-2. 결과

In [170]:
model_result = pd.DataFrame({
    'MSE': [dt_mse, rf_mse, knn_mse, xgb_mse, ada_mse, lgb_mse, gbr_mse, lr_mse],
    'MAE': [dt_mae, rf_mae, knn_mae, xgb_mae, ada_mae, lgb_mae, gbr_mae, lr_mae],
    'R2': [dt_r2, rf_r2, knn_r2, xgb_r2, ada_r2, lgb_r2, gbr_r2, lr_r2]
}, index=['DecisionTreeRegressor', 'RandomForestRegressor', 'KNeighborsRegressor', 'XGBoostRegressor', 'AdaBoostRegressor', 'LightGBMRegressor', 'GradientBoostingRegressor', 'LinearRegression'])
print(model_result)

                                 MSE       MAE        R2
DecisionTreeRegressor      26.162113  3.782698  0.749617
RandomForestRegressor      11.832913  2.509390  0.886754
KNeighborsRegressor        28.119716  4.106433  0.730882
XGBoostRegressor           12.912952  2.692871  0.876418
AdaBoostRegressor          13.601636  2.819139  0.869827
LightGBMRegressor          10.213172  2.445319  0.902256
GradientBoostingRegressor   9.929124  2.430978  0.904974
LinearRegression            9.787807  2.418579  0.906326


In [171]:
# prompt: 제일 좋은 성능 두가지
print("MSE 기준")
print(model_result.sort_values(by=['MSE'], ascending=True).head(3))
print(" ")

print("MAE 기준")
print(model_result.sort_values(by=['MAE'], ascending=True).head(3))
print(" ")

print("R2 기준")
print(model_result.sort_values(by=['R2'], ascending=False).head(3))

MSE 기준
                                 MSE       MAE        R2
LinearRegression            9.787807  2.418579  0.906326
GradientBoostingRegressor   9.929124  2.430978  0.904974
LightGBMRegressor          10.213172  2.445319  0.902256
 
MAE 기준
                                 MSE       MAE        R2
LinearRegression            9.787807  2.418579  0.906326
GradientBoostingRegressor   9.929124  2.430978  0.904974
LightGBMRegressor          10.213172  2.445319  0.902256
 
R2 기준
                                 MSE       MAE        R2
LinearRegression            9.787807  2.418579  0.906326
GradientBoostingRegressor   9.929124  2.430978  0.904974
LightGBMRegressor          10.213172  2.445319  0.902256


---