# Google Colab 빅쿼리 연동

In [1]:
from google.colab import auth
auth.authenticate_user()

## 데이터 가져오기
- 공식문서 참조 : https://pandas.pydata.org/docs/reference/api/pandas.read_gbq.html

- gender : 성별
- SeniorCitizen : 노인인지의 여부
- Dependents : 자녀의 유무
- tenure : 고객의 가입 기간 (개월 수)
- PhoneService : 휴대폰 서비스를 가입 했는지의 여부
- MultipleLines : 여러 개의 통신선을 서비스 받고 있는지의 여부
- InternetService : 인터넷 서비스 제공자 (DSL, Fiber optic, No)
- OnlineSecurity : 온라인 보안 서비스를 가입 했는지의 여부
- OnlineBackup : 온라인 백업 서비스를 가입 했는지의 여부
- DeviceProtection 기기 보호 서비스를 가입 했는지의 여부
- TechSupport : 기술 서포트 서비스를 가입 했는지의 여부
- StreamingTV : TV 스트리밍 서비스를 가입 했는지의 여부
- StreamingMovies : 영화 스트리밍 서비스를 가입 했는지의 여부
- Contract : 계약 유형 (Month-to-month, One year, Two year)
- PaperlessBilling : 전자 고지서 여부
- PaymentMethod : 요금 지불 방법 (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
- MonthlyCharges : 매달 고객에게 청구되는 금액
- TotalCharges : 고객에게 청구된 총 금액
- Churn : 지난 한 달 내에 떠난 고객인지의 여부
- churn_rate : 0 (유지), 1 (이탈)로만 이루어진 이탈율
- CLTV : 고객 생애 가치
- Churn Reason : 고객 이탈의 구체적인 이유
- Country : 국가
- State : 주
- City : 도시
- Zip Code : 우편번호
- Latitude : 위도
- Longitude : 경도
- Age : 현재 나이
- Under 30 : 나이가 30살 미만 여부
- Married : 결혼 여부
- Referred a Friend : 친구 추천 여부
- Number of Referrals : 고객이 현재까지 추천한 횟수
- Offer : 고객이 마지막으로 수락한 마케팅 제안
- Avg Monthly Long Distance Charges : 월 평균 장거리 전화 요금
- Avg Monthly GB Download : 월 평균 다운로드한 용량(단위 GB)
- Streaming Music : 타사의 음악 스트리밍 여부
- Premium Tech Support : 대기 시간 시간을 단축하느 프리미엄 추가 기술 지원 플랜 가입 여부
- Unlimited Data : 무제한 데이터 다운로드/업로드를 위해 월별 추가 요금을 지불했는지 여부
- Total Refunds : 분기말까지의 고객의 총 환불 금액
- Total Extra Data Charges : 분기말까지의 데이터 다운로드 한도를 초과한 것에 대한 고객의 총요금
- Total Long Distance Charges : 분기말까지의 장거리 통화 한도를 초과한 것에 대한 고객의 총요금
- Total Revenue : 총 수익
- Satisfaction Score : 회사에 대한 고객의 만족도 점수 (5점 척도)
- Churn Category : 이탈 이유에 대한 카테고리 (태도, 경쟁사, 불만족, 가격, 기타)

In [1]:
import pandas as pd

project_id = 'multi-telecom'
sql = '''
SELECT * FROM `multi-telecom.churn_rate.telecom_ML` LIMIT 1200
'''

df = pd.read_gbq(sql, project_id = project_id, dialect = 'standard')
df

Unnamed: 0,gender,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,...,payment_method_credit_card_auto,PaymentMethod_Electronic check,PaymentMethod_Mailed check,ChurnCategory_0,ChurnCategory_Attitude,ChurnCategory_Competitor,ChurnCategory_Dissatisfaction,ChurnCategory_Other,ChurnCategory_Price,Service_Num
0,1,0,1,1,0,0,0,0,0,0,...,0,,,0,0,0,0,0,1,1
1,1,0,11,1,0,0,0,0,0,0,...,0,,,0,0,1,0,0,0,1
2,1,0,27,0,0,1,1,0,0,0,...,1,,,0,0,1,0,0,0,2
3,0,0,1,0,0,1,0,0,0,0,...,0,,,0,0,1,0,0,0,2
4,0,0,1,0,0,1,0,0,0,0,...,0,,,0,0,1,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,0,0,72,1,1,1,0,1,1,1,...,1,,,1,0,0,0,0,0,11
1128,1,0,72,0,0,1,1,1,1,1,...,1,,,1,0,0,0,0,0,11
1129,1,0,52,1,1,1,0,1,1,1,...,0,,,1,0,0,0,0,0,12
1130,0,0,72,1,1,1,0,1,1,1,...,0,,,1,0,0,0,0,0,12


# 머신러닝 (TotalRevenue)
- 통신사 매출예측

## 머신러닝 준비
- 매출예측을 판단하기에 적합해 보이는 컬럼들을 추출해 예측을 돌리고, 평가지표를 사용해 원데이터 TotalRevenue와 비교해봄.

In [2]:
df.TotalExtraDataCharges.value_counts()

0      988
10      20
20      16
30      14
40      14
140     12
100     12
80      10
110      9
90       7
120      7
150      7
50       6
60       4
130      3
70       3
Name: TotalExtraDataCharges, dtype: Int64

=> 제로값이 많으므로 다른 값들이 이상치로 측정될 가능성이 높아 해당 컬럼은 제외함.

In [3]:
selected_columns = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges', 'PhoneService', 'MultipleLines', 'UnlimitedData']
X = df[selected_columns]
y = df['TotalRevenue']

In [6]:
X

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,AvgMonthlyLongDistanceCharges,AvgMonthlyGBDownload,TotalRefunds,TotalLongDistanceCharges,PhoneService,MultipleLines,UnlimitedData
0,1,20.05,20.05,19.87,0,0.00,19.87,1,0,0
1,11,19.95,214.75,29.54,0,0.00,324.94,1,0,0
2,27,30.75,805.10,0.00,13,0.00,0.00,0,0,0
3,1,25.80,25.80,0.00,26,0.00,0.00,0,0,0
4,1,25.05,25.05,0.00,21,0.00,0.00,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1127,72,110.80,7882.25,15.85,2,0.00,1141.20,1,1,1
1128,72,64.45,4641.10,0.00,9,0.00,0.00,0,0,1
1129,52,110.75,5832.00,23.04,25,6.26,1198.08,1,1,1
1130,72,108.10,7774.05,26.87,20,0.00,1934.64,1,1,1


## 랜덤포레스트 회귀

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 74.80
Mean Squared Error (MSE): 12720.39
Root Mean Squared Error (RMSE): 112.78


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,221.1080
898,9111.80,9203.8507
413,1540.25,1507.7425
467,2639.44,2582.6754
745,9339.05,9343.9073
...,...,...
307,1185.05,1184.5458
334,622.45,582.5692
714,9195.82,9063.1855
832,672.70,697.9728


- 범주형 변수들이 포함된 상태인지 확인.

In [8]:
X_train

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,AvgMonthlyLongDistanceCharges,AvgMonthlyGBDownload,TotalRefunds,TotalLongDistanceCharges,PhoneService,MultipleLines,UnlimitedData
513,26,85.70,2067.00,46.24,4,0.00,1202.24,1,0,1
909,61,85.55,5251.75,46.65,23,0.00,2845.65,1,1,1
798,67,75.70,5060.85,39.89,19,0.00,2672.63,1,1,1
486,55,44.85,2479.05,0.00,30,0.00,0.00,0,0,1
1033,11,79.15,827.70,26.84,15,0.00,295.24,1,0,0
...,...,...,...,...,...,...,...,...,...,...
121,17,89.15,1496.90,30.37,28,0.00,516.29,1,0,1
1044,23,79.35,1835.30,8.72,18,15.41,200.56,1,1,1
1095,68,107.15,7379.80,32.11,12,0.00,2183.48,1,1,0
860,21,95.40,2025.10,37.40,16,0.00,785.40,1,1,1


In [9]:
X_test

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,AvgMonthlyLongDistanceCharges,AvgMonthlyGBDownload,TotalRefunds,TotalLongDistanceCharges,PhoneService,MultipleLines,UnlimitedData
783,2,69.30,153.80,32.62,2,0.0,65.24,1,0,1
898,60,101.40,6176.60,48.92,11,0.0,2935.20,1,1,1
413,43,33.45,1500.25,0.00,18,0.0,0.00,0,0,0
467,32,79.30,2570.00,2.17,11,0.0,69.44,1,1,1
745,72,116.75,8277.05,14.75,16,0.0,1062.00,1,1,1
...,...,...,...,...,...,...,...,...,...,...
307,13,82.00,1127.20,4.45,25,0.0,57.85,1,1,1
334,4,94.30,424.45,49.50,23,0.0,198.00,1,1,1
714,72,109.55,7920.70,17.71,4,0.0,1275.12,1,1,1
832,14,46.35,672.70,0.00,5,0.0,0.00,0,0,1


- 하이퍼 파라미터 튜닝 진행 (StandardScaler)

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300, 400, 500],
    'regressor__max_depth': [None, 10, 20, 30, 50, 70, 100],
    'regressor__min_samples_split': [5, 10, 20, 30, 50, 70, 100],
    'regressor__min_samples_leaf': [1, 2, 3, 4, 5, 7]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Model:", best_model)
print("Best Parameters:", best_params)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['tenure', 'MonthlyCharges',
                                                   'TotalCharges',
                                                   'AvgMonthlyLongDistanceCharges',
                                                   'AvgMonthlyGBDownload',
                                                   'TotalRefunds',
                                                   'TotalLongDistanceCharges'])])),
                ('regressor',
                 RandomForestRegressor(max_depth=50, min_samples_split=5,
                                       n_estimators=50))])
Best Parameters: {'regressor__max_depth': 50, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 50}
R-squared (R^2): 1.00
Mean Absolute Error (MAE): 71.81
Mean Squared Error (MSE): 11805.42
Root Mean Squared Error (

Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,222.4131
898,9111.80,9234.9139
413,1540.25,1516.7043
467,2639.44,2570.6282
745,9339.05,9266.5795
...,...,...
307,1185.05,1206.3272
334,622.45,580.5051
714,9195.82,9068.2253
832,672.70,719.3910


=> 하이퍼 파라미터 튜닝을 하면 평가지표의 결과가 조금 더 좋아지긴 했지만, 하지 않더라고 충분히 모델의 예측성을 잘 설명해주고 있음.

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 74.27
Mean Squared Error (MSE): 12552.89
Root Mean Squared Error (RMSE): 112.04


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,221.2581
898,9111.80,9220.5020
413,1540.25,1514.1282
467,2639.44,2569.2549
745,9339.05,9304.3829
...,...,...
307,1185.05,1228.8888
334,622.45,581.6091
714,9195.82,9010.0423
832,672.70,697.9582


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 75.51
Mean Squared Error (MSE): 12713.52
Root Mean Squared Error (RMSE): 112.75


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,223.9133
898,9111.80,9222.9014
413,1540.25,1520.1248
467,2639.44,2569.2110
745,9339.05,9251.3897
...,...,...
307,1185.05,1201.6748
334,622.45,574.5420
714,9195.82,9083.0943
832,672.70,707.3983


=> 하이퍼 파라미터 조정을 한 StandardScaler가 모든 평가지표를 통틀었을 때, 그나마 좋은 결과를 내포함.

그러나 3가지 스케일링 모두 큰 차이는 없고, 하이퍼 파라미터 튜닝 역시 큰 차이를 보이진 않아서 필요성을 느끼지 못함.

추가로 다른 모델을 사용해서 현재보다 평가지표값이 작은 모델을 선정해볼 필요 있음.

## 선형 회귀

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 15.26
Mean Squared Error (MSE): 917.15
Root Mean Squared Error (RMSE): 30.28


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,224.631025
898,9111.80,9125.924977
413,1540.25,1503.592150
467,2639.44,2645.908242
745,9339.05,9352.709727
...,...,...
307,1185.05,1191.395686
334,622.45,631.827868
714,9195.82,9208.608482
832,672.70,675.368517


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 15.26
Mean Squared Error (MSE): 917.15
Root Mean Squared Error (RMSE): 30.28


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,224.631025
898,9111.80,9125.924977
413,1540.25,1503.592150
467,2639.44,2645.908242
745,9339.05,9352.709727
...,...,...
307,1185.05,1191.395686
334,622.45,631.827868
714,9195.82,9208.608482
832,672.70,675.368517


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 15.26
Mean Squared Error (MSE): 917.15
Root Mean Squared Error (RMSE): 30.28


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,224.631025
898,9111.80,9125.924977
413,1540.25,1503.592150
467,2639.44,2645.908242
745,9339.05,9352.709727
...,...,...
307,1185.05,1191.395686
334,622.45,631.827868
714,9195.82,9208.608482
832,672.70,675.368517


## Ridge, Lasso 회귀

- Ridge

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 19.48
Mean Squared Error (MSE): 1090.31
Root Mean Squared Error (RMSE): 33.02


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,218.348407
898,9111.80,9115.900307
413,1540.25,1512.490505
467,2639.44,2649.503810
745,9339.05,9330.804360
...,...,...
307,1185.05,1198.120198
334,622.45,641.966049
714,9195.82,9187.455769
832,672.70,664.586010


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 87.08
Mean Squared Error (MSE): 13977.55
Root Mean Squared Error (RMSE): 118.23


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,184.476796
898,9111.80,9008.671196
413,1540.25,1595.096924
467,2639.44,2676.027757
745,9339.05,9158.079418
...,...,...
307,1185.05,1239.598857
334,622.45,753.207391
714,9195.82,9017.716056
832,672.70,568.604702


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 33.31
Mean Squared Error (MSE): 2168.12
Root Mean Squared Error (RMSE): 46.56


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,205.193848
898,9111.80,9102.578360
413,1540.25,1526.326997
467,2639.44,2656.240067
745,9339.05,9288.616710
...,...,...
307,1185.05,1213.290929
334,622.45,654.869262
714,9195.82,9147.488116
832,672.70,648.534401


- Lasso

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=1.0))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 15.49
Mean Squared Error (MSE): 921.48
Root Mean Squared Error (RMSE): 30.36


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,225.431332
898,9111.80,9123.662381
413,1540.25,1505.039389
467,2639.44,2646.976095
745,9339.05,9350.789540
...,...,...
307,1185.05,1192.353221
334,622.45,630.333906
714,9195.82,9207.255784
832,672.70,677.960334


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=1.0))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 16.38
Mean Squared Error (MSE): 995.65
Root Mean Squared Error (RMSE): 31.55


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,226.768953
898,9111.80,9114.545137
413,1540.25,1509.079293
467,2639.44,2647.978480
745,9339.05,9346.586294
...,...,...
307,1185.05,1193.104248
334,622.45,629.933879
714,9195.82,9202.828554
832,672.70,680.869448


In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=1.0))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 15.48
Mean Squared Error (MSE): 922.97
Root Mean Squared Error (RMSE): 30.38


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,226.584277
898,9111.80,9123.345035
413,1540.25,1504.958296
467,2639.44,2647.133023
745,9339.05,9349.122275
...,...,...
307,1185.05,1193.239748
334,622.45,631.796047
714,9195.82,9205.711667
832,672.70,678.480059


## LightGBM

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1153
[LightGBM] [Info] Number of data points in the train set: 792, number of used features: 7
[LightGBM] [Info] Start training from score 3720.993147
R-squared (R^2): 1.00
Mean Absolute Error (MAE): 79.98
Mean Squared Error (MSE): 18843.63
Root Mean Squared Error (RMSE): 137.27


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,217.614343
898,9111.80,9544.535711
413,1540.25,1425.447039
467,2639.44,2590.091542
745,9339.05,9222.047988
...,...,...
307,1185.05,1181.612245
334,622.45,617.995592
714,9195.82,9196.528124
832,672.70,694.723367


In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1121
[LightGBM] [Info] Number of data points in the train set: 792, number of used features: 7
[LightGBM] [Info] Start training from score 3720.993147
R-squared (R^2): 1.00
Mean Absolute Error (MAE): 82.69
Mean Squared Error (MSE): 20831.82
Root Mean Squared Error (RMSE): 144.33


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,203.982691
898,9111.80,9594.169741
413,1540.25,1434.278047
467,2639.44,2612.911153
745,9339.05,9314.288805
...,...,...
307,1185.05,1167.852021
334,622.45,643.795728
714,9195.82,9166.395060
832,672.70,682.818982


In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
import lightgbm as lgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1150
[LightGBM] [Info] Number of data points in the train set: 792, number of used features: 7
[LightGBM] [Info] Start training from score 3720.993147
R-squared (R^2): 1.00
Mean Absolute Error (MAE): 81.41
Mean Squared Error (MSE): 19815.38
Root Mean Squared Error (RMSE): 140.77


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,212.535700
898,9111.80,9620.309540
413,1540.25,1414.447797
467,2639.44,2608.791394
745,9339.05,9170.430250
...,...,...
307,1185.05,1183.614262
334,622.45,600.438685
714,9195.82,9230.807141
832,672.70,698.698569


## XGBoost

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 82.70
Mean Squared Error (MSE): 16776.73
Root Mean Squared Error (RMSE): 129.53


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,213.990372
898,9111.80,9154.418945
413,1540.25,1456.504150
467,2639.44,2599.292969
745,9339.05,9506.609375
...,...,...
307,1185.05,1111.692017
334,622.45,633.575073
714,9195.82,9060.208008
832,672.70,810.153503


- 하이퍼 파라미터 튜닝 진행

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

model = Pipeline([
    ('regressor', xgb.XGBRegressor())
])

param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300, 500],
    'regressor__max_depth': [None, 10, 20, 30, 50],
    'regressor__min_samples_split': [5, 10, 20, 30, 50],
    'regressor__min_samples_leaf': [1, 2, 3, 4, 5]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Model:", best_model)
print("Best Parameters:", best_params)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.



Best Model: Pipeline(steps=[('regressor',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, device=None,
                              early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, grow_policy=None,
                              importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=None, max_leaves=None,
                              min_child_weight=None, min_samples_leaf=1,
                              min_samples_split=5, missing=nan,
                              monotone_constraints=

Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,209.441986
898,9111.80,9122.119141
413,1540.25,1452.112427
467,2639.44,2591.463623
745,9339.05,9491.247070
...,...,...
307,1185.05,1137.558716
334,622.45,611.442688
714,9195.82,9055.526367
832,672.70,806.853088


=> 하이퍼 파라미터 튜닝을 진행하지 않아도 결과가 별 차이를 보이지 않음.

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 82.70
Mean Squared Error (MSE): 16776.73
Root Mean Squared Error (RMSE): 129.53


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,213.990372
898,9111.80,9154.418945
413,1540.25,1456.504150
467,2639.44,2599.292969
745,9339.05,9506.609375
...,...,...
307,1185.05,1111.692017
334,622.45,633.575073
714,9195.82,9060.208008
832,672.70,810.153503


In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgMonthlyLongDistanceCharges', 'AvgMonthlyGBDownload', 'TotalRefunds', 'TotalLongDistanceCharges']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_features)
    ])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor())
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared (R^2): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

comparison_df = pd.DataFrame({'Actual CLTV': y_test, 'Predicted CLTV': y_pred})
comparison_df

R-squared (R^2): 1.00
Mean Absolute Error (MAE): 82.70
Mean Squared Error (MSE): 16776.73
Root Mean Squared Error (RMSE): 129.53


Unnamed: 0,Actual CLTV,Predicted CLTV
783,219.04,213.990372
898,9111.80,9154.418945
413,1540.25,1456.504150
467,2639.44,2599.292969
745,9339.05,9506.609375
...,...,...
307,1185.05,1111.692017
334,622.45,633.575073
714,9195.82,9060.208008
832,672.70,810.153503


## 머신러닝 최종 결론
- 가장 예측성이 좋은 것은 선형 회귀모델이었음.