<a href="https://colab.research.google.com/github/kimhyeongmin-khu/MLMovieProj/blob/main/Lasso_a_value.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
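# Find the Lasso alpha value for the week-1 feature set.
# (The original note said "including w1_slope"; the feature list below contains w1_df_rank, not w1_slope.)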
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score

# Load the dataset, pick the target, and build the model matrix:
# zero-replacement -> log1p on count-like features -> standardize numeric
# features -> append the categorical columns unscaled.

# Read the CSV file
file_path = 'preprosessing_ver15.csv'  # set the file path here
df = pd.read_csv(file_path)

# Target variable
target_variable = 'w2_ac_au'

# Select numeric-dtype columns only
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Columns excluded from both the zero-replacement and the log transform below
exclude_cols = ['w1_slope', 'w1_df_rank', 'w1_mean_diff', 'w2_slope', 'w2_df_rank', 'w2_mean_diff']

# Columns treated as categorical (user-specified); they are neither
# log-transformed nor standardized below
categorical_cols = ['Is_High_Season', 'Over_12', 'Over_15', 'General_Audience', 'No_Youth', 'USA', 'Korea', 'Others']

# Every numeric column except the excluded ones
cols_to_transform = [col for col in numeric_cols if col not in exclude_cols]

# Replace every value that is not > 0 with a small positive number (1e-6).
# NaN also fails the `x > 0` comparison, so missing values become 1e-6 too.
# NOTE(review): categorical_cols are not filtered out here, so if they are
# stored as numeric 0/1 their zeros also become 1e-6 — confirm this is intended.
# NOTE(review): presumably the target column is numeric and is affected as well.
df[cols_to_transform] = df[cols_to_transform].apply(lambda x: np.where(x > 0, x, 1e-6))

# Features fed to the model (week-1 signals, history aggregates, dummies)
user_selected_features = [
   'w1_au', 'w1_rank', 'w1_av_sc', 'w1_av_sales', 'w1_df_rank',
   'di_ca_au_y3', 'Distributors_mv_au_y3', 'actor_mv_au_y3', 'Week1_Avg',
   'Over_12', 'Over_15', 'General_Audience', 'No_Youth', 'Is_High_Season',
   'Others', 'USA', 'Korea'
]

# Work on a copy restricted to the selected features
X_transformed = df[user_selected_features].copy()

# log1p transform of the selected features that are neither categorical nor
# in exclude_cols (so 'w1_df_rank' keeps its raw values); the target is
# log1p-transformed as well
numeric_features_to_log = [col for col in user_selected_features if col not in categorical_cols + exclude_cols]
X_transformed[numeric_features_to_log] = np.log1p(X_transformed[numeric_features_to_log])
y_transformed = np.log1p(df[target_variable])

# Standardize only the non-categorical features (this includes 'w1_df_rank').
# NOTE(review): the scaler is fit on all rows, so the K-fold evaluation later
# in this cell uses scaling statistics that include the held-out rows.
numeric_features = [col for col in user_selected_features if col not in categorical_cols]
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_transformed[numeric_features])

# Recombine: scaled numeric columns first, then the unscaled categorical columns
X_combined = np.concatenate([X_numeric_scaled, X_transformed[categorical_cols].values], axis=1)

# Feature names in the same column order as X_combined
combined_feature_names = numeric_features + categorical_cols

# Pairwise correlations (pandas default method) between the selected features
# and the target, taken from df as it stands at this point in the cell.
correlation_matrix = df.loc[:, user_selected_features + [target_variable]].corr()
print("Correlation Matrix", correlation_matrix, sep="\n")

# Heatmap of the matrix, each cell annotated to two decimals
plt.figure(figsize=(12, 10))
sns.heatmap(data=correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# K-Fold setup: 10 shuffled folds with a fixed seed
k_fold = KFold(n_splits=10, shuffle=True, random_state=20)

# Lasso training and evaluation over a grid of alphas.
# Each accumulator holds, per alpha, the mean over the folds:
#   lasso_rmse_test[i] - RMSE on the original (expm1) scale
#   lasso_mape_test[i] - MAPE in percent on the original scale
#   lasso_coefs[i]     - coefficient vector (columns ordered as X_combined)
alphas = np.arange(0.0001, 0.1, 0.0001)
lasso_rmse_test = np.zeros(len(alphas))
lasso_mape_test = np.zeros(len(alphas))
lasso_coefs = np.zeros((len(alphas), X_combined.shape[1]))

for train_idx, test_idx in k_fold.split(X_combined):
    X_train, X_test = X_combined[train_idx], X_combined[test_idx]
    y_train, y_test = y_transformed.iloc[train_idx], y_transformed.iloc[test_idx]

    # The held-out truth on the original scale does not depend on alpha,
    # so undo the log1p transform once per fold rather than once per alpha.
    y_test_original = np.expm1(y_test)

    for i, alpha in enumerate(alphas):
        lasso_reg = Lasso(alpha=alpha)
        lasso_reg.fit(X_train, y_train)
        # Dividing by n_splits while accumulating yields the fold average
        lasso_coefs[i] += lasso_reg.coef_ / k_fold.n_splits

        # Predict on the held-out fold (log1p scale)
        y_test_pred = lasso_reg.predict(X_test)

        # Convert predictions back to the original scale
        y_test_pred_original = np.expm1(y_test_pred)

        # RMSE: take the square root explicitly instead of passing
        # `squared=False`, which newer scikit-learn releases no longer accept.
        fold_rmse = np.sqrt(mean_squared_error(y_test_original, y_test_pred_original))
        lasso_rmse_test[i] += fold_rmse / k_fold.n_splits

        # MAPE (converted to percent)
        lasso_mape_test[i] += mean_absolute_percentage_error(y_test_original, y_test_pred_original) * 100 / k_fold.n_splits

# Index of the alpha with the lowest fold-averaged error, per metric
min_rmse_index = lasso_rmse_test.argmin()
min_mape_index = lasso_mape_test.argmin()

# Best alpha and its score for each metric
best_alpha_rmse, best_rmse = alphas[min_rmse_index], lasso_rmse_test[min_rmse_index]
best_alpha_mape, best_mape = alphas[min_mape_index], lasso_mape_test[min_mape_index]

# Features whose fold-averaged coefficient is non-zero at each best alpha
selected_features_rmse = np.asarray(combined_feature_names)[lasso_coefs[min_rmse_index] != 0]
selected_features_mape = np.asarray(combined_feature_names)[lasso_coefs[min_mape_index] != 0]

# Report: RMSE-optimal alpha first, then MAPE-optimal alpha
print(f"Best alpha for RMSE: {best_alpha_rmse} with RMSE: {best_rmse}")
print(f"Selected features for RMSE: {selected_features_rmse}")
print(f"Best alpha for MAPE: {best_alpha_mape} with MAPE: {best_mape}")
print(f"Selected features for MAPE: {selected_features_mape}")

# Side-by-side plots of the fold-averaged test error versus alpha, with a
# dashed vertical line marking the best alpha for each metric.
# The x-labels are raw strings: "\l" is not a valid Python escape sequence,
# so the non-raw form emits an invalid-escape warning on recent Python
# versions. The text handed to matplotlib is unchanged.

# Left panel: RMSE
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(alphas, lasso_rmse_test, 'ro-')
plt.title("Lasso Test Set RMSE", fontsize=16)
plt.xlabel(r"Model Simplicity (alpha)$\longrightarrow$")
plt.ylabel("RMSE")
plt.axvline(x=best_alpha_rmse, color='g', linestyle='--', label=f'Best alpha for RMSE: {best_alpha_rmse}')
plt.legend()

# Right panel: MAPE
plt.subplot(1, 2, 2)
plt.plot(alphas, lasso_mape_test, 'bo-')
plt.title("Lasso Test Set MAPE", fontsize=16)
plt.xlabel(r"Model Simplicity (alpha)$\longrightarrow$")
plt.ylabel("MAPE (%)")
plt.axvline(x=best_alpha_mape, color='g', linestyle='--', label=f'Best alpha for MAPE: {best_alpha_mape}')
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
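# Find the Lasso alpha value for the week-2 feature set.
# (The original note said "including w2_slope"; the feature list below contains w2_df_rank, not w2_slope.)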
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Load the dataset, pick the target, and build the model matrix:
# zero-replacement -> log1p on count-like features -> standardize numeric
# features -> append the categorical columns unscaled.

# Read the CSV file
file_path = 'preprosessing_ver15.csv'  # set the file path here
df = pd.read_csv(file_path)

# Target variable
target_variable = 'w3_ac_au'

# Select numeric-dtype columns only
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Columns excluded from both the zero-replacement and the log transform below
exclude_cols = ['w1_slope', 'w1_df_rank', 'w1_mean_diff', 'w2_slope', 'w2_df_rank', 'w2_mean_diff']

# Columns treated as categorical (user-specified); they are neither
# log-transformed nor standardized below
categorical_cols = ['Is_High_Season', 'Over_12', 'Over_15', 'General_Audience', 'No_Youth', 'USA', 'Korea', 'Others']

# Every numeric column except the excluded ones
cols_to_transform = [col for col in numeric_cols if col not in exclude_cols]

# Replace every value that is not > 0 with a small positive number (1e-6).
# NaN also fails the `x > 0` comparison, so missing values become 1e-6 too.
# NOTE(review): categorical_cols are not filtered out here, so if they are
# stored as numeric 0/1 their zeros also become 1e-6 — confirm this is intended.
# NOTE(review): presumably the target column is numeric and is affected as well.
df[cols_to_transform] = df[cols_to_transform].apply(lambda x: np.where(x > 0, x, 1e-6))

# Features fed to the model (week-2 signals, history aggregates, dummies)
user_selected_features = [
   'w2_ac_au', 'w2_rank', 'w2_av_sc', 'w2_av_sales', 'w2_df_rank',
   'di_ca_au_y3', 'Distributors_mv_au_y3', 'actor_mv_au_y3', 'Week2_Avg',
   'Over_12', 'Over_15', 'General_Audience', 'No_Youth', 'Is_High_Season',
   'Others', 'USA', 'Korea'
]

# Work on a copy restricted to the selected features
X_transformed = df[user_selected_features].copy()

# log1p transform of the selected features that are neither categorical nor
# in exclude_cols (so 'w2_df_rank' keeps its raw values); the target is
# log1p-transformed as well
numeric_features_to_log = [col for col in user_selected_features if col not in categorical_cols + exclude_cols]
X_transformed[numeric_features_to_log] = np.log1p(X_transformed[numeric_features_to_log])
y_transformed = np.log1p(df[target_variable])

# Standardize only the non-categorical features (this includes 'w2_df_rank').
# NOTE(review): the scaler is fit on all rows, so the K-fold evaluation later
# in this cell uses scaling statistics that include the held-out rows.
numeric_features = [col for col in user_selected_features if col not in categorical_cols]
scaler = StandardScaler()
X_numeric_scaled = scaler.fit_transform(X_transformed[numeric_features])

# Recombine: scaled numeric columns first, then the unscaled categorical columns
X_combined = np.concatenate([X_numeric_scaled, X_transformed[categorical_cols].values], axis=1)

# Feature names in the same column order as X_combined
combined_feature_names = numeric_features + categorical_cols

# Pairwise correlations (pandas default method) between the selected features
# and the target, taken from df as it stands at this point in the cell.
correlation_matrix = df.loc[:, user_selected_features + [target_variable]].corr()
print("Correlation Matrix", correlation_matrix, sep="\n")

# Heatmap of the matrix, each cell annotated to two decimals
plt.figure(figsize=(12, 10))
sns.heatmap(data=correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# K-Fold setup: 10 shuffled folds with a fixed seed
k_fold = KFold(n_splits=10, shuffle=True, random_state=20)

# Lasso training and evaluation over a grid of alphas.
# Each accumulator holds, per alpha, the mean over the folds:
#   lasso_rmse_test[i] - RMSE on the original (expm1) scale
#   lasso_mape_test[i] - MAPE in percent on the original scale
#   lasso_coefs[i]     - coefficient vector (columns ordered as X_combined)
alphas = np.arange(0.0001, 0.1, 0.0001)
lasso_rmse_test = np.zeros(len(alphas))
lasso_mape_test = np.zeros(len(alphas))
lasso_coefs = np.zeros((len(alphas), X_combined.shape[1]))

for train_idx, test_idx in k_fold.split(X_combined):
    X_train, X_test = X_combined[train_idx], X_combined[test_idx]
    y_train, y_test = y_transformed.iloc[train_idx], y_transformed.iloc[test_idx]

    # The held-out truth on the original scale does not depend on alpha,
    # so undo the log1p transform once per fold rather than once per alpha.
    y_test_original = np.expm1(y_test)

    for i, alpha in enumerate(alphas):
        lasso_reg = Lasso(alpha=alpha)
        lasso_reg.fit(X_train, y_train)
        # Dividing by n_splits while accumulating yields the fold average
        lasso_coefs[i] += lasso_reg.coef_ / k_fold.n_splits

        # Predict on the held-out fold (log1p scale)
        y_test_pred = lasso_reg.predict(X_test)

        # Convert predictions back to the original scale
        y_test_pred_original = np.expm1(y_test_pred)

        # RMSE: take the square root explicitly instead of passing
        # `squared=False`, which newer scikit-learn releases no longer accept.
        fold_rmse = np.sqrt(mean_squared_error(y_test_original, y_test_pred_original))
        lasso_rmse_test[i] += fold_rmse / k_fold.n_splits

        # MAPE (converted to percent)
        lasso_mape_test[i] += mean_absolute_percentage_error(y_test_original, y_test_pred_original) * 100 / k_fold.n_splits

# Index of the alpha with the lowest fold-averaged error, per metric
min_rmse_index = lasso_rmse_test.argmin()
min_mape_index = lasso_mape_test.argmin()

# Best alpha and its score for each metric
best_alpha_rmse, best_rmse = alphas[min_rmse_index], lasso_rmse_test[min_rmse_index]
best_alpha_mape, best_mape = alphas[min_mape_index], lasso_mape_test[min_mape_index]

# Features whose fold-averaged coefficient is non-zero at each best alpha
selected_features_rmse = np.asarray(combined_feature_names)[lasso_coefs[min_rmse_index] != 0]
selected_features_mape = np.asarray(combined_feature_names)[lasso_coefs[min_mape_index] != 0]

# Report: RMSE-optimal alpha first, then MAPE-optimal alpha
print(f"Best alpha for RMSE: {best_alpha_rmse} with RMSE: {best_rmse}")
print(f"Selected features for RMSE: {selected_features_rmse}")
print(f"Best alpha for MAPE: {best_alpha_mape} with MAPE: {best_mape}")
print(f"Selected features for MAPE: {selected_features_mape}")

# Side-by-side plots of the fold-averaged test error versus alpha, with a
# dashed vertical line marking the best alpha for each metric.
# The x-labels are raw strings: "\l" is not a valid Python escape sequence,
# so the non-raw form emits an invalid-escape warning on recent Python
# versions. The text handed to matplotlib is unchanged.

# Left panel: RMSE
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(alphas, lasso_rmse_test, 'ro-')
plt.title("Lasso Test Set RMSE", fontsize=16)
plt.xlabel(r"Model Simplicity (alpha)$\longrightarrow$")
plt.ylabel("RMSE")
plt.axvline(x=best_alpha_rmse, color='g', linestyle='--', label=f'Best alpha for RMSE: {best_alpha_rmse}')
plt.legend()

# Right panel: MAPE
plt.subplot(1, 2, 2)
plt.plot(alphas, lasso_mape_test, 'bo-')
plt.title("Lasso Test Set MAPE", fontsize=16)
plt.xlabel(r"Model Simplicity (alpha)$\longrightarrow$")
plt.ylabel("MAPE (%)")
plt.axvline(x=best_alpha_mape, color='g', linestyle='--', label=f'Best alpha for MAPE: {best_alpha_mape}')
plt.legend()

plt.tight_layout()
plt.show()
