### 在使用Open source node建立模型時，用以下方式區分訓練集與驗證集

In [None]:
# Split the data frame supplied by Model Studio into training and validation
# partitions. Both partitions are selected through the platform-provided
# partition values instead of hard-coded literals, so the split keeps working
# when the project uses non-default partition codes.
train = dm_inputdf[dm_inputdf[dm_partitionvar] == dm_partition_train_val]
X_train = train.loc[:, dm_input]
y_train = train[dm_dec_target]

valid = dm_inputdf[dm_inputdf[dm_partitionvar] == dm_partition_validate_val]
X_valid = valid.loc[:, dm_input]
y_valid = valid[dm_dec_target]

# Alternative for model fitting: dm_traindf is used here as the training data
# frame supplied by the node.
X = dm_traindf.loc[:, dm_input]
y = dm_traindf[dm_dec_target]

### 計算輪廓係數的Code

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Keep only the rows flagged as the training partition.
train = dm_inputdf.loc[dm_inputdf[dm_partitionvar] == dm_partition_train_val]
X_train = train[dm_input]
y_train = train[dm_dec_target]

# Column groups fed to the preprocessing step. cluster_cols lists every
# clustering input; it is kept for reference and is not used below.
cluster_cols = ['MOBILE_P', 'REP_DATA', 'REP_MINUTES', 'REP_REP_AGE', 'TECH_PROBLEM',
                'TOTAL_TECH_PROBLEM', 'BMRP', 'CP', 'GENDER', 'P_TYPE', 'UT', 'UV']
num_cols = ['MOBILE_P', 'REP_DATA', 'REP_MINUTES', 'REP_REP_AGE',
            'TECH_PROBLEM', 'TOTAL_TECH_PROBLEM', 'CP']
cat_cols = ['BMRP', 'GENDER', 'P_TYPE']

# Numeric columns: mean imputation followed by standardisation.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# UV / UT: imputed with the most frequent value, otherwise passed as-is.
ut_uv_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('col', cat_transformer, cat_cols),
    ('uv_ut', ut_uv_transformer, ['UV', 'UT']),
])

# Score the existing cluster assignment (_CLUSTER_ID_) on the transformed
# training features.
X_transformed = preprocessor.fit_transform(X_train)
cluster_labels = train['_CLUSTER_ID_'].to_numpy()
silhouette_avg = silhouette_score(X_transformed, cluster_labels)

print("輪廓係數：", round(silhouette_avg, 2))


### 特徵創建

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pickle

# Work on a private copy so the node's input frame is never modified.
df = dm_inputdf.copy()

# Columns set aside for now; they are not used as feature-engineering inputs.
dm_idx_cols = ['CHURN', 'IU', '_dmIndex_', '_PartInd_']
dm_idx_data = df.loc[:, dm_idx_cols].copy()

# Input columns used for feature engineering.
df_use_data = df.loc[:, dm_input].copy()

# 定義特徵工程函式
def service_counts(df, UV, UT):
    """Return the row-wise sum of the two service columns.

    Args:
        df: Source data frame; it is not modified.
        UV: Name of the first service column.
        UT: Name of the second service column.

    Returns:
        A Series named ``Service_Count`` aligned with ``df``'s index.
    """
    total = df[UV] + df[UT]
    return total.rename('Service_Count')

def service_group(df, UV, UT):
    """Label each row by how many of the two service flags equal 1.

    Args:
        df: Source data frame; it is not modified.
        UV: Name of the first service flag column.
        UT: Name of the second service flag column.

    Returns:
        A Series named ``Service_Group`` holding ``'no_service'``,
        ``'one_service'`` or ``'two_service'``.
    """
    has_uv = df[UV].eq(1)
    has_ut = df[UT].eq(1)

    group = pd.Series('no_service', index=df.index, name='Service_Group')
    group = group.mask(has_uv | has_ut, 'one_service')
    # Applied last so rows with both flags override the 'one_service' label.
    group = group.mask(has_uv & has_ut, 'two_service')
    return group

def total_tech_average(df, total_tech_problem):
    """Return the given column divided by 12, rounded to two decimals.

    Args:
        df: Source data frame; it is not modified.
        total_tech_problem: Name of the column to average.

    Returns:
        A Series named ``Total_Tech_Average``.
    """
    per_period = (df[total_tech_problem] / 12).round(2)
    return per_period.rename('Total_Tech_Average')

def age_group(df, age_col, bins=(18, 25, 45, 60, 80), labels=('青年', '青壯年', '中年', '老年')):
    """Bin an age column into labelled groups with ``pd.cut``.

    The defaults are tuples rather than lists so they cannot be mutated
    between calls; they are converted back to lists before reaching
    ``pd.cut``, so the result is the same as before.

    Args:
        df: Source data frame; it is not modified.
        age_col: Name of the age column.
        bins: Bin edges (or any other value ``pd.cut`` accepts for ``bins``).
        labels: One label per bin (or any other value ``pd.cut`` accepts).

    Returns:
        A categorical Series aligned with ``df``. The lowest edge is
        inclusive; values outside the bins become NaN.
    """
    # Only tuples are converted, so explicit lists, None or False still
    # reach pd.cut untouched.
    if isinstance(bins, tuple):
        bins = list(bins)
    if isinstance(labels, tuple):
        labels = list(labels)

    return pd.cut(df[age_col], bins, labels=labels, include_lowest=True)

def tech_spike(df, tech_problem, total_tech_average):
    """Return ``tech_problem`` minus ``total_tech_average``, row by row.

    Args:
        df: Source data frame; it is not modified.
        tech_problem: Name of the column to subtract from.
        total_tech_average: Name of the column to subtract.

    Returns:
        A Series named ``Tech_Spike``.
    """
    delta = df[tech_problem] - df[total_tech_average]
    return delta.rename('Tech_Spike')

def value_per_month(df, mobile_p, bmrp):
    """Return ``mobile_p`` divided by ``bmrp``, rounded to two decimals.

    Args:
        df: Source data frame; it is not modified.
        mobile_p: Name of the numerator column.
        bmrp: Name of the denominator column. Zero values are not guarded
            against.

    Returns:
        A Series named ``Value_Per_Month``.
    """
    ratio = (df[mobile_p] / df[bmrp]).round(2)
    return ratio.rename('Value_Per_Month')
    
def dissat_index(df, cp, tech_problem, total_tech_average):
    """Sum the standardised values of three columns into one index.

    A fresh ``StandardScaler`` is fitted on the three columns of ``df`` on
    every call.

    Args:
        df: Source data frame; it is not modified.
        cp: Name of the first column to standardise.
        tech_problem: Name of the second column to standardise.
        total_tech_average: Name of the third column to standardise.

    Returns:
        A Series named ``Dissat_Index`` aligned with ``df``'s index.
    """
    columns = [cp, tech_problem, total_tech_average]
    standardised = pd.DataFrame(
        StandardScaler().fit_transform(df[columns]),
        index=df.index,
        columns=columns,
    )
    # Row sum is done in pandas so NaN handling matches a DataFrame sum.
    return standardised.sum(axis=1).rename('Dissat_Index')

def customer_group(n_cluster, df, cp, tech_problem, minutes, random_state=None):
    """Cluster rows with KMeans on three standardised columns.

    Args:
        n_cluster: Number of clusters.
        df: Source data frame; it is not modified.
        cp: Name of the first clustering column.
        tech_problem: Name of the second clustering column.
        minutes: Name of the third clustering column.
        random_state: Optional seed forwarded to ``KMeans``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to
            make the cluster ids reproducible across runs.

    Returns:
        A Series named ``customer_group`` with one cluster id per row.
    """
    features = df[[cp, tech_problem, minutes]]

    # The scaler and the model are fitted here and then discarded; callers
    # that need them for scoring must fit and keep their own.
    X_scaled = StandardScaler().fit_transform(features)
    model = KMeans(n_clusters=n_cluster, init='k-means++', n_init='auto',
                   random_state=random_state)
    cluster_ids = model.fit_predict(X_scaled)

    return pd.Series(cluster_ids, index=features.index, name='customer_group')

# Build the engineered features.
df_use_data['Service_Count'] = service_counts(df_use_data, 'IMP_UV', 'IMP_UT')
df_use_data['Service_Group'] = service_group(df_use_data, 'IMP_UV', 'IMP_UT')
df_use_data['Total_Tech_Average'] = total_tech_average(df_use_data, 'IMP_TOTAL_TECH_PROBLEM')
df_use_data['Age_Group'] = age_group(df_use_data, 'IMP_REP_AGE')
df_use_data['Tech_Spike'] = tech_spike(df_use_data, 'IMP_TECH_PROBLEM', 'Total_Tech_Average')
df_use_data['Value_Per_Month'] = value_per_month(df_use_data, 'IMP_MOBILE_P', 'BMRP')
df_use_data['Dissat_Index'] = dissat_index(df_use_data, 'IMP_CP', 'IMP_TECH_PROBLEM', 'Total_Tech_Average')

# Scaler used by the scoring code for Dissat_Index. It is fitted on the same
# three columns that dissat_index() standardises.
feature_to_scale = ['IMP_CP', 'IMP_TECH_PROBLEM', 'Total_Tech_Average']
scaler_dissat = StandardScaler()
scaler_dissat.fit(df_use_data[feature_to_scale])

# Customer_Group: fit the scaler and KMeans exactly once and reuse that same
# model for the training column and for the pickle. KMeans is unseeded, so two
# independent fits can number the clusters differently; the scoring code would
# then emit ids that do not match the ones stored in the training data.
feature_to_scale = ['IMP_CP', 'IMP_TECH_PROBLEM', 'IMP_REP_MINUTES']
scaler_customer = StandardScaler()
X_scaled = scaler_customer.fit_transform(df_use_data[feature_to_scale])
cluster_customer = KMeans(n_clusters = 3, init = 'k-means++', n_init = 'auto').fit(X_scaled)
df_use_data['Customer_Group'] = cluster_customer.labels_
print(df_use_data['Customer_Group'])

# Re-attach the columns that were set aside before feature engineering.
dm_scoreddf = pd.concat([df_use_data, dm_idx_data], axis = 1)

df = df_use_data.copy()

# Objects are dumped one after another; the scoring code must load them back
# in this exact order.
with open(dm_pklpath, 'wb') as f:
    pickle.dump(scaler_dissat, f)     # scaler for Dissat_Index
    pickle.dump(scaler_customer, f)   # scaler for Customer_Group
    pickle.dump(cluster_customer, f)  # KMeans model for Customer_Group

print(dm_scoreddf.columns)
print(dm_scoreddf.head())


import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Restore the objects serialized by the training step, reading them back in
# the order they were written: Dissat_Index scaler, Customer_Group scaler,
# Customer_Group KMeans model.
# NOTE: pickle.load can execute arbitrary code; only load a trusted file.
with open(settings.pickle_path + dm_pklname, 'rb') as f:
    scaler_dissat, scaler_customer, cluster_customer = [
        pickle.load(f) for _ in range(3)
    ]

def score_method(BMRP, IMP_CP, IMP_GENDER, IMP_P_TYPE, IMP_UT, IMP_UV, IMP_MOBILE_P, IMP_REP_AGE, IMP_REP_DATA, IMP_REP_MINUTES, IMP_TECH_PROBLEM, IMP_TOTAL_TECH_PROBLEM):
    "Output: Service_Count, Service_Group, Total_Tech_Average, Age_Group, Tech_Spike, Value_Per_Month, Dissat_Index, Customer_Group, BMRP, IMP_CP, IMP_GENDER, IMP_P_TYPE, IMP_UT, IMP_UV, IMP_MOBILE_P, IMP_REP_AGE, IMP_REP_DATA, IMP_REP_MINUTES, IMP_TECH_PROBLEM, IMP_TOTAL_TECH_PROBLEM"

    # Scores a single record: derives the eight engineered features and
    # returns them followed by the twelve inputs, in the order listed in the
    # "Output:" string above. Uses the module-level scaler_dissat,
    # scaler_customer and cluster_customer objects loaded from the pickle.

    # One-row frame so the same pandas operations used in training apply here.
    record = pd.DataFrame({
        'IMP_UV': [IMP_UV], 'IMP_UT': [IMP_UT], 'IMP_TOTAL_TECH_PROBLEM': [IMP_TOTAL_TECH_PROBLEM],
        'IMP_REP_AGE': [IMP_REP_AGE], 'IMP_TECH_PROBLEM': [IMP_TECH_PROBLEM], 'IMP_MOBILE_P': [IMP_MOBILE_P],
        'BMRP': [BMRP], 'IMP_CP': [IMP_CP], 'IMP_REP_MINUTES': [IMP_REP_MINUTES], 'IMP_GENDER': [IMP_GENDER],
        'IMP_P_TYPE': [IMP_P_TYPE], 'IMP_REP_DATA': [IMP_REP_DATA]
    })

    uv_on = record['IMP_UV'] == 1
    ut_on = record['IMP_UT'] == 1

    # Service_Count: sum of the two service columns.
    service_count = (record['IMP_UV'] + record['IMP_UT']).iloc[0]

    # Service_Group: the both-flags case is checked last so it wins.
    service_group_val = 'no_service'
    if (uv_on | ut_on).iloc[0]:
        service_group_val = 'one_service'
    if (uv_on & ut_on).iloc[0]:
        service_group_val = 'two_service'

    # Total_Tech_Average: total divided by 12, two decimals.
    total_tech_avg = (record['IMP_TOTAL_TECH_PROBLEM'] / 12).round(2).iloc[0]

    # Age_Group: same bins and labels as the training code.
    age_bins = [18, 25, 45, 60, 80]
    age_labels = ['青年', '青壯年', '中年', '老年']
    age_group_val = pd.cut(record['IMP_REP_AGE'], age_bins, labels=age_labels,
                           include_lowest=True).iloc[0]

    # Tech_Spike: subtracts the scalar average computed above.
    tech_spike_val = (record['IMP_TECH_PROBLEM'] - total_tech_avg).iloc[0]

    # Value_Per_Month: ratio rounded to two decimals.
    value_per_month_val = (record['IMP_MOBILE_P'] / record['BMRP']).round(2).iloc[0]

    # Dissat_Index: standardise with the pickled scaler, then sum the row.
    tech_avg_series = pd.Series([float(total_tech_avg)], index=record.index,
                                name='Total_Tech_Average')
    dissat_inputs = pd.concat(
        [record[['IMP_CP', 'IMP_TECH_PROBLEM']], tech_avg_series], axis=1)
    dissat_scaled = scaler_dissat.transform(dissat_inputs)
    dissat_index_val = pd.Series(dissat_scaled.sum(axis=1), index=record.index).iloc[0]

    # Customer_Group: pickled scaler followed by the pickled KMeans model.
    customer_inputs = record[['IMP_CP', 'IMP_TECH_PROBLEM', 'IMP_REP_MINUTES']]
    customer_scaled = scaler_customer.transform(customer_inputs)
    customer_group_val = int(cluster_customer.predict(customer_scaled)[0])

    # Inputs echoed back after the engineered features, in output order.
    passthrough_cols = (
        'BMRP', 'IMP_CP', 'IMP_GENDER', 'IMP_P_TYPE', 'IMP_UT', 'IMP_UV',
        'IMP_MOBILE_P', 'IMP_REP_AGE', 'IMP_REP_DATA', 'IMP_REP_MINUTES',
        'IMP_TECH_PROBLEM', 'IMP_TOTAL_TECH_PROBLEM',
    )
    engineered = (
        float(service_count), service_group_val, float(total_tech_avg), age_group_val,
        float(tech_spike_val), float(value_per_month_val), float(dissat_index_val),
        int(customer_group_val),
    )
    return engineered + tuple(record[col].iloc[0] for col in passthrough_cols)

#BMRP, IMP_CP, IMP_GENDER, IMP_P_TYPE, IMP_UT, IMP_UV, IMP_MOBILE_P, IMP_REP_AGE, IMP_REP_DATA, IMP_REP_MINUTES, IMP_TECH_PROBLEM, IMP_TOTAL_TECH_PROBLEM

### 網格搜尋

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Training features and target; dm_traindf is used as the training data frame.
X = dm_traindf.loc[:, dm_input]
y = dm_traindf[dm_dec_target]

# --- Random forest -----------------------------------------------------------
rf_model = RandomForestClassifier(random_state=42)
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2']
}

rf_grid_search = GridSearchCV(estimator = rf_model, param_grid = rf_param_grid, cv = 5, n_jobs = -1, scoring='roc_auc')
rf_grid_search.fit(X, y)
print("隨機森林最佳參數組合：", rf_grid_search.best_params_)
print("最佳 ROC-AUC 分數：", rf_grid_search.best_score_)

# --- XGBoost -----------------------------------------------------------------
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1]
}

# Built from the directly imported XGBClassifier: the module alias `xgb` is
# never imported in this cell, so `xgb.XGBClassifier` raised NameError.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases and
# may only produce a warning there; confirm against the installed version.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

# Step 3: create the GridSearchCV instance.
# 'cv' sets the number of cross-validation folds.
# 'scoring' sets the evaluation metric, here 'roc_auc'.
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,  # use every available core
    verbose=1   # show progress
)

# Step 4: run the grid search on the training data X and y.
xgb_grid_search.fit(X, y)

# Step 5: report the best parameters and the best cross-validated score.
print("最佳參數組合：", xgb_grid_search.best_params_)
print("最佳 ROC-AUC 分數：", xgb_grid_search.best_score_)

### SMOTENC

In [None]:
#from imblearn.over_sampling import SMOTENC
import pandas as pd
import numpy as np

# Imported here because the cell-level import is commented out, which left
# SMOTENC undefined when the cell ran.
from imblearn.over_sampling import SMOTENC

df = dm_inputdf.copy()
df.drop(['_dmIndex_', 'IU'], axis = 1, inplace = True)
# Columns kept in the output frame (everything except the target).
total_cols = df.drop(['CHURN'], axis = 1).columns
dm_idx_cols = ['CHURN', 'IU', '_dmIndex_', '_PartInd_']

# Split the data frame supplied by Model Studio into training and validation
# partitions using the platform-provided partition values.
train = df[dm_inputdf[dm_partitionvar] == dm_partition_train_val]
X_train = train.loc[:,dm_input]
y_train = train[dm_dec_target]

valid = dm_inputdf[dm_inputdf[dm_partitionvar] == dm_partition_validate_val]
X_valid = valid.drop(['CHURN'], axis = 1)
y_valid = valid[dm_dec_target]

# Treat low-cardinality inputs (fewer than 5 distinct values) as categorical,
# except Service_Count.
feature_names = X_train.columns
cat_cols = [col for col in feature_names if train[col].nunique() < 5 and (col != 'Service_Count')]
cat_col_indices = [X_train.columns.get_loc(col) for col in cat_cols]
print(cat_cols)

# Oversample the training partition only; validation rows are left untouched.
smote_nc = SMOTENC(categorical_features=cat_col_indices, random_state=42)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)

# Reindex both partitions to the same column layout. Both frames get a fresh
# 0..n-1 index so the targets assigned below line up row by row; previously
# valid_df kept the original row labels and its CHURN values were misaligned.
train_df = pd.DataFrame(X_resampled, columns = total_cols).reset_index(drop=True)
valid_df = pd.DataFrame(X_valid, columns = total_cols).reset_index(drop=True)

# Resampled rows carry only the dm_input columns, so their partition flag is
# missing after reindexing. Mark them explicitly as training rows; the earlier
# `df.fillna(1)` discarded its result and left the flag as NaN.
train_df[dm_partitionvar] = dm_partition_train_val

train_df['CHURN'] = y_resampled.reset_index(drop=True)
valid_df['CHURN'] = y_valid.reset_index(drop=True)

df = pd.concat([train_df, valid_df], axis=0, ignore_index=True)

dm_scoreddf = df