# Google Colab 빅쿼리 연동

In [1]:
from google.colab import auth
auth.authenticate_user()

## 데이터 가져오기
- 공식문서 참조 : https://pandas.pydata.org/docs/reference/api/pandas.read_gbq.html

- gender : 성별
- SeniorCitizen : 노인인지의 여부
- Dependents : 자녀의 유무
- tenure : 고객의 가입 기간 (개월 수)
- PhoneService : 휴대폰 서비스를 가입 했는지의 여부
- MultipleLines : 여러 개의 통신선을 서비스 받고 있는지의 여부
- InternetService : 인터넷 서비스 제공자 (DSL, Fiber optic, No)
- OnlineSecurity : 온라인 보안 서비스를 가입 했는지의 여부
- OnlineBackup : 온라인 백업 서비스를 가입 했는지의 여부
- DeviceProtection 기기 보호 서비스를 가입 했는지의 여부
- TechSupport : 기술 서포트 서비스를 가입 했는지의 여부
- StreamingTV : TV 스트리밍 서비스를 가입 했는지의 여부
- StreamingMovies : 영화 스트리밍 서비스를 가입 했는지의 여부
- Contract : 계약 유형 (Month-to-month, One year, Two year)
- PaperlessBilling : 전자 고지서 여부
- PaymentMethod : 요금 지불 방법 (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
- MonthlyCharges : 매달 고객에게 청구되는 금액
- TotalCharges : 고객에게 청구된 총 금액
- Churn : 지난 한 달 내에 떠난 고객인지의 여부
- churn_rate : 0 (유지), 1 (이탈)로만 이루어진 이탈율
- CLTV : 고객 생애 가치
- Churn Reason : 고객 이탈의 구체적인 이유
- Country : 국가
- State : 주
- City : 도시
- Zip Code : 우편번호
- Latitude : 위도
- Longitude : 경도
- Age : 현재 나이
- Under 30 : 나이가 30살 미만 여부
- Married : 결혼 여부
- Referred a Friend : 친구 추천 여부
- Number of Referrals : 고객이 현재까지 추천한 횟수
- Offer : 고객이 마지막으로 수락한 마케팅 제안
- Avg Monthly Long Distance Charges : 월 평균 장거리 전화 요금
- Avg Monthly GB Download : 월 평균 다운로드한 용량(단위 GB)
- Streaming Music : 타사의 음악 스트리밍 여부
- Premium Tech Support : 대기 시간 시간을 단축하느 프리미엄 추가 기술 지원 플랜 가입 여부
- Unlimited Data : 무제한 데이터 다운로드/업로드를 위해 월별 추가 요금을 지불했는지 여부
- Total Refunds : 분기말까지의 고객의 총 환불 금액
- Total Extra Data Charges : 분기말까지의 데이터 다운로드 한도를 초과한 것에 대한 고객의 총요금
- Total Long Distance Charges : 분기말까지의 장거리 통화 한도를 초과한 것에 대한 고객의 총요금
- Total Revenue : 총 수익
- Satisfaction Score : 회사에 대한 고객의 만족도 점수 (5점 척도)
- Churn Category : 이탈 이유에 대한 카테고리 (태도, 경쟁사, 불만족, 가격, 기타)
- Age_Group : Age를 기반으로 만든 연령대

In [2]:
import pandas as pd

project_id = 'multi-telecom'
sql = '''
SELECT * FROM `multi-telecom.churn_rate.telecom_preprocessed` LIMIT 7100
'''

df = pd.read_gbq(sql, project_id = project_id, dialect = 'standard')
df

Unnamed: 0,gender,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,payment_method_credit_card_auto,paymentmethod_electronic_check,paymentmethod_mailed_check,ChurnCategory_0,ChurnCategory_Attitude,ChurnCategory_Competitor,ChurnCategory_Dissatisfaction,ChurnCategory_Other,ChurnCategory_Price,Service_Num
0,1,0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,1
1,1,0,11,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1
2,1,0,27,0,0,1,1,0,0,0,...,1,0,0,0,0,1,0,0,0,2
3,0,0,1,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,2
4,0,0,1,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,0,0,72,1,1,1,0,1,1,1,...,1,0,0,1,0,0,0,0,0,11
1128,1,0,72,0,0,1,1,1,1,1,...,1,0,0,1,0,0,0,0,0,11
1129,1,0,52,1,1,1,0,1,1,1,...,0,1,0,1,0,0,0,0,0,12
1130,0,0,72,1,1,1,0,1,1,1,...,0,0,0,1,0,0,0,0,0,12


### 데이터 컬럼명 변경 주의
빅쿼리에 저장하기 위해서는 아래의 2개 컬럼명을 사용할 수 없어서 변경함.

'PaymentMethod_Bank transfer (automatic)': 'payment_method_bank_transfer_auto',

'PaymentMethod_Credit card (automatic)': 'payment_method_credit_card_auto'

In [3]:
df.CLTV.value_counts()

5242    4
5546    4
4368    4
4011    3
5444    3
       ..
2671    1
2099    1
2146    1
5345    1
4474    1
Name: CLTV, Length: 974, dtype: Int64

# 머신러닝 (CLV)

## 머신러닝 준비
- CLV를 판단하기에 적합해 보이는 컬럼들을 추출해 예측을 돌리고, 평가지표를 사용해 원데이터 CLTV와 비교해봄.

In [4]:
selected_columns = ['tenure', 'Contract_1', 'Contract_12', 'Contract_24', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn', 'Age', 'NumberOfReferrals', 'SatisfactionScore']
X = df[selected_columns]
y = df['CLTV']

In [5]:
X

Unnamed: 0,tenure,Contract_1,Contract_12,Contract_24,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,Age,NumberOfReferrals,SatisfactionScore
0,1,1,0,0,0,20.05,20.05,1,69,0,1
1,11,1,0,0,0,19.95,214.75,1,79,1,1
2,27,1,0,0,0,30.75,805.10,1,71,0,1
3,1,1,0,0,1,25.80,25.80,1,76,0,1
4,1,1,0,0,0,25.05,25.05,1,69,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1127,72,0,0,1,1,110.80,7882.25,0,69,8,5
1128,72,0,0,1,1,64.45,4641.10,0,76,9,5
1129,52,1,0,0,1,110.75,5832.00,0,75,0,5
1130,72,0,0,1,1,108.10,7774.05,0,65,0,5


## 랜덤포레스트 회귀

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 0.88
Mean Absolute Error (MAE): 328.99
Mean Squared Error (MSE): 158331.18
Root Mean Squared Error (RMSE): 397.91


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3787.72
1,5888,5407.92
2,2927,3402.19
3,3884,4167.94
4,4842,4503.20
...,...,...
1127,5933,5612.91
1128,5295,5298.24
1129,4993,4871.41
1130,4474,4770.00


- 하이퍼 파라미터 튜닝 진행 (StandardScaler)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300, 400, 500],
    'regressor__max_depth': [None, 10, 20, 30, 50, 70, 100],
    'regressor__min_samples_split': [5, 10, 20, 30, 50, 70, 100],
    'regressor__min_samples_leaf': [1, 2, 3, 4, 5, 7]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Model:", best_model)
print("Best Parameters:", best_params)

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['tenure', 'MonthlyCharges',
                                                   'TotalCharges', 'Age',
                                                   'NumberOfReferrals',
                                                   'SatisfactionScore'])])),
                ('regressor',
                 RandomForestRegressor(max_depth=30, min_samples_leaf=5,
                                       min_samples_split=100))])
Best Parameters: {'regressor__max_depth': 30, 'regressor__min_samples_leaf': 5, 'regressor__min_samples_split': 100, 'regressor__n_estimators': 100}
R-squared (R^2): 0.88
Mean Absolute Error (MAE): 334.04
Mean Squared Error (MSE): 164019.30
Root Mean Squared Error (RMSE): 404.99


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3726.44
1,5888,5375.25
2,2927,3253.64
3,3884,4149.82
4,4842,4407.29
...,...,...
1127,5933,5586.78
1128,5295,5292.92
1129,4993,4948.23
1130,4474,4849.25


=> 하이퍼 파라미터 튜닝을 해도 결과가 크게 다르지 않으며 오히려 진행 전이 조금 더 예측성이 좋음.

결론적으로 일반 스케일링만을 사용하는 것이 더 좋은 예측을 보여줌.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 0.88
Mean Absolute Error (MAE): 330.90
Mean Squared Error (MSE): 162308.75
Root Mean Squared Error (RMSE): 402.88


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3837.37
1,5888,5602.81
2,2927,3161.09
3,3884,4225.67
4,4842,4386.87
...,...,...
1127,5933,5610.87
1128,5295,5330.06
1129,4993,4861.14
1130,4474,4828.18


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 0.89
Mean Absolute Error (MAE): 328.69
Mean Squared Error (MSE): 156810.48
Root Mean Squared Error (RMSE): 395.99


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3800.45
1,5888,5474.17
2,2927,3393.75
3,3884,4243.28
4,4842,4406.36
...,...,...
1127,5933,5590.00
1128,5295,5302.66
1129,4993,4845.04
1130,4474,4837.11


=> RobustScaler가 모든 평가지표를 통틀었을 때, 그나마 좋은 결과를 내포함.

그러나 3가지 스케일링 모두 큰 차이는 없으며, 하이퍼 파라미터 튜닝은 하지 않는 것이 바람직해 보임.

추가로 다른 모델을 사용해서 더 좋은 예측이 되도록 실행해 볼 필요가 있음.

## 선형 회귀

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_features)
#     ])

model = Pipeline([
    # ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 0.17
Mean Absolute Error (MAE): 894.91
Mean Squared Error (MSE): 1133284.55
Root Mean Squared Error (RMSE): 1064.56


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3953.054543
1,5888,4248.071316
2,2927,4316.450305
3,3884,4096.817940
4,4842,3925.778446
...,...,...
1127,5933,5336.581249
1128,5295,5376.569015
1129,4993,4790.385133
1130,4474,5260.717410


## Ridge, Lasso 회귀

- Ridge

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_features)
#     ])

model = Pipeline([
    # ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 0.17
Mean Absolute Error (MAE): 894.91
Mean Squared Error (MSE): 1133285.17
Root Mean Squared Error (RMSE): 1064.56


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3953.195712
1,5888,4248.262332
2,2927,4316.756975
3,3884,4096.669634
4,4842,3925.941122
...,...,...
1127,5933,5334.552610
1128,5295,5374.533432
1129,4993,4790.317908
1130,4474,5259.049055


- Lasso

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_features)
#     ])

model = Pipeline([
    # ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=1.0))
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 0.17
Mean Absolute Error (MAE): 894.88
Mean Squared Error (MSE): 1133327.59
Root Mean Squared Error (RMSE): 1064.58


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3956.871702
1,5888,4251.990541
2,2927,4322.160195
3,3884,4094.657237
4,4842,3929.829305
...,...,...
1127,5933,5317.967338
1128,5295,5357.206056
1129,4993,4788.692692
1130,4474,5245.529549


## LightGBM

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor())
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 618
[LightGBM] [Info] Number of data points in the train set: 1132, number of used features: 6
[LightGBM] [Info] Start training from score 4395.080389
R-squared (R^2): 0.74
Mean Absolute Error (MAE): 493.46
Mean Squared Error (MSE): 361853.83
Root Mean Squared Error (RMSE): 601.54


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,4011.855902
1,5888,4981.430618
2,2927,3460.188057
3,3884,4419.348877
4,4842,4412.614902
...,...,...
1127,5933,5158.263143
1128,5295,5268.569652
1129,4993,4585.766584
1130,4474,5103.474973


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor())
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 614
[LightGBM] [Info] Number of data points in the train set: 1132, number of used features: 6
[LightGBM] [Info] Start training from score 4395.080389
R-squared (R^2): 0.73
Mean Absolute Error (MAE): 497.78
Mean Squared Error (MSE): 367972.47
Root Mean Squared Error (RMSE): 606.61


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,4052.368958
1,5888,5177.569821
2,2927,3436.677320
3,3884,4412.359509
4,4842,4319.250207
...,...,...
1127,5933,5195.609650
1128,5295,5194.769750
1129,4993,4614.432902
1130,4474,5058.530458


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor())
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 614
[LightGBM] [Info] Number of data points in the train set: 1132, number of used features: 6
[LightGBM] [Info] Start training from score 4395.080389
R-squared (R^2): 0.74
Mean Absolute Error (MAE): 493.97
Mean Squared Error (MSE): 361566.09
Root Mean Squared Error (RMSE): 601.30


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3969.191678
1,5888,4852.963473
2,2927,3773.259715
3,3884,4463.270860
4,4842,4496.367575
...,...,...
1127,5933,5267.577020
1128,5295,5321.852771
1129,4993,4532.804164
1130,4474,5039.573911


## XGBoost

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Age', 'NumberOfReferrals', 'SatisfactionScore']

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_features)
#     ])

model = Pipeline([
    # ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor())
])

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 0.98
Mean Absolute Error (MAE): 106.79
Mean Squared Error (MSE): 21164.80
Root Mean Squared Error (RMSE): 145.48


Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3629.142578
1,5888,5905.281738
2,2927,3016.923340
3,3884,4059.321289
4,4842,4710.632812
...,...,...
1127,5933,5777.892578
1128,5295,5278.585449
1129,4993,4879.223145
1130,4474,4562.919922


- 하이퍼 파라미터 튜닝 진행

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = Pipeline([
    ('regressor', xgb.XGBRegressor())
])

param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300, 500],
    'regressor__max_depth': [None, 10, 20, 30, 50],
    'regressor__min_samples_split': [5, 10, 20, 30, 50],
    'regressor__min_samples_leaf': [1, 2, 3, 4, 5]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Model:", best_model)
print("Best Parameters:", best_params)

model.fit(X, y)

y_pred = model.predict(X)

r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y, 'Predicted CLTV': y_pred})
comparison_df

Best Model: Pipeline(steps=[('regressor',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=10, max_leaves=None,
                              min_child_weight=None, min_samples_leaf=1,
                              min_samples_split=5, missing=nan,
                              monotone_constraints=No

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.



Unnamed: 0,Actual CLTV,Predicted CLTV
0,3491,3629.142578
1,5888,5905.281738
2,2927,3016.923340
3,3884,4059.321289
4,4842,4710.632812
...,...,...
1127,5933,5777.892578
1128,5295,5278.585449
1129,4993,4879.223145
1130,4474,4562.919922


=> 하이퍼 파라미터 튜닝을 진행하지 않아도 결과가 동일함.

## 머신러닝 최종 결론

- 가장 예측성이 좋은 것은 XGBoost 모델로, XGBoost 모델을 선정함.