In [1]:
# 기본 라이브러리
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
# 정확도, 정밀도, 재현율 구하는 함수
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}'.format(accuracy, precision, recall))

In [3]:
# 데이터 불러오기
data1= pd.read_csv('F:/superpack/ppv2305.csv', encoding='UTF-8')
data2= pd.read_csv('F:/superpack/ppv2306.csv', encoding='UTF-8')

In [None]:
data2

In [4]:
X1 = data1.drop(['yn', 'sa_id'], axis=1)
y1 = data1[['yn']]

In [5]:
X2 = data2.drop(['yn', 'sa_id'], axis=1)
y2 = data2[['yn']]

In [6]:
print(data1.shape)
print(data2.shape)
print(y1.sum())
print(y2.sum())

(315877, 58)
(344352, 58)
yn    162932
dtype: int64
yn    210533
dtype: int64


In [None]:
# 데이터 분리 (비율 유지)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3, random_state=777, stratify=y1)

In [None]:
import statsmodels.api as sm
model = sm.OLS(y_train, x_train).fit()
model.summary()

In [7]:
# xgboost 사용
from xgboost import XGBClassifier
xgb=XGBClassifier(n_estimators=1000, 
                 max_depth=30)
xgb.fit(X1,y1, early_stopping_rounds = 100,
                eval_metric = 'rmse',
                eval_set = [(X2,y2)])
y_xgb=xgb.predict(X2)

score_xgb=get_clf_eval(y2, y_xgb.reshape(-1,1))
score_xgb



[0]	validation_0-rmse:0.48268
[1]	validation_0-rmse:0.47482
[2]	validation_0-rmse:0.47157
[3]	validation_0-rmse:0.47026
[4]	validation_0-rmse:0.46972
[5]	validation_0-rmse:0.46951
[6]	validation_0-rmse:0.46959
[7]	validation_0-rmse:0.46967
[8]	validation_0-rmse:0.46988
[9]	validation_0-rmse:0.47006
[10]	validation_0-rmse:0.47029
[11]	validation_0-rmse:0.47042
[12]	validation_0-rmse:0.47060
[13]	validation_0-rmse:0.47069
[14]	validation_0-rmse:0.47072
[15]	validation_0-rmse:0.47077
[16]	validation_0-rmse:0.47088
[17]	validation_0-rmse:0.47097
[18]	validation_0-rmse:0.47106
[19]	validation_0-rmse:0.47115
[20]	validation_0-rmse:0.47118
[21]	validation_0-rmse:0.47124
[22]	validation_0-rmse:0.47135
[23]	validation_0-rmse:0.47141
[24]	validation_0-rmse:0.47143
[25]	validation_0-rmse:0.47145
[26]	validation_0-rmse:0.47151
[27]	validation_0-rmse:0.47154
[28]	validation_0-rmse:0.47159
[29]	validation_0-rmse:0.47171
[30]	validation_0-rmse:0.47174
[31]	validation_0-rmse:0.47179
[32]	validation_0-

In [None]:
# 최적 파라미터 찾기 
# (random_state = 42, max_depth=30, n_estimators=300, learning_rate=0.1)
# np.arange(0.05, 0.6, 0.05)
# [10,20,40,80,100,300,500]
from sklearn.model_selection import GridSearchCV
xgb2 = XGBClassifier()

parameters =  {
              'max_depth' : [10,15,20,25,30],
                'n_estimators' : [200,400,600,800,1000]
              }

xgb_grid = GridSearchCV(xgb2,
                        parameters,
                        cv = 3,
                        n_jobs = -1,
                        verbose=True)

xgb_grid.fit(x_train,y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

In [None]:
# 변수 중요도
importances = pd.Series(xgb.feature_importances_, X1.columns)
import matplotlib.pyplot as plt

n = 20
plt.figure(figsize=(10,n/2))

plt.title(f'Top {n} features')
importances.sort_values()[-n:].plot.barh();

In [9]:
ydata = data2[['sa_id','yn']]

Unnamed: 0,sa_id,yn
0,10999048980,1
1,11001361310,0
2,11001513150,1
3,11001680370,0
4,11001780380,1
...,...,...
344347,69168663343,0
344348,69168686594,0
344349,69168705311,0
344350,69168710662,0


In [10]:
# 익월 예측 값 엑셀로 추출 x_train,y_train
pred_proba = xgb.predict_proba(X2)[:,1]
pred_proba_1 = pred_proba.reshape(-1,1)
ddf = pd.DataFrame(pred_proba_1)
sample = pd.concat([ydata, ddf], axis =1)
sample.to_excel('C:/Users/20229069/Desktop/ttt.xlsx') #결과값 엑설로 저장 코드

# 한달만

In [None]:
# 데이터 불러오기
data= pd.read_csv('F:/superpack/ppv04.csv', encoding='UTF-8')

X = data.drop(['yn', 'sa_id'], axis=1)
y = data[['yn']]
# 데이터 분리 (비율 유지)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=777, stratify=y)

In [None]:
print(data.shape)
print(y.sum())

In [None]:
# xgboost 사용
from xgboost import XGBClassifier
xgb=XGBClassifier(n_estimators=1000, 
                 max_depth=20)
xgb.fit(x_train,y_train, early_stopping_rounds = 100,
                eval_metric = 'rmse',
                eval_set = [(x_test,y_test)])
y_xgb=xgb.predict(x_test)

score_xgb=get_clf_eval(y_test, y_xgb.reshape(-1,1))
score_xgb

In [None]:
# 최적 파라미터 찾기 
# (random_state = 42, max_depth=30, n_estimators=300, learning_rate=0.1)
# np.arange(0.05, 0.6, 0.05)
# [10,20,40,80,100,300,500]
from sklearn.model_selection import GridSearchCV
xgb2 = XGBClassifier()

parameters =  {
              'max_depth' : [10,15,20,25,30]
              }

xgb_grid = GridSearchCV(xgb2,
                        parameters,
                        cv = 3,
                        n_jobs = -1,
                        verbose=True)

xgb_grid.fit(x_train,y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

In [None]:
# 변수 중요도
importances = pd.Series(xgb.feature_importances_, X.columns)
import matplotlib.pyplot as plt

n = 20
plt.figure(figsize=(10,n/2))

plt.title(f'Top {n} features')
importances.sort_values()[-n:].plot.barh();

In [None]:
# Regressor 모델
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import AdaBoostRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

In [None]:
from sklearn.metrics import make_scorer

# 실제값과 예측값의 평균 오차를 성능 측정
def NMAE(true, pred):
    mae = np.mean(np.abs(true - pred))
    return mae

nmae_score = make_scorer(NMAE, greater_is_better=False)

etr=ExtraTreesRegressor()
rg=Ridge()
rf=RandomForestRegressor(random_state = 42)
gbr=GradientBoostingRegressor(random_state = 42)
xgb=XGBRegressor(random_state = 42)
lgb=LGBMRegressor(random_state = 42)
ada=AdaBoostRegressor(random_state = 42)
hgb=HistGradientBoostingRegressor(random_state=42)
cat_reg=CatBoostRegressor(random_state = 42)

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X1, y1,test_size=0.2, random_state = 42)
# 학습
etr.fit(X_train,y_train)
rg.fit(X_train,y_train)
rf.fit(X_train,y_train)
gbr.fit(X_train,y_train)
xgb.fit(X_train,y_train)
lgb.fit(X_train,y_train)
ada.fit(X_train,y_train)
hgb.fit(X_train,y_train)
cat_reg.fit(X_train,y_train)
# 예측
y_etr=etr.predict(X_test)
y_rg=rg.predict(X_test)
y_rf=rf.predict(X_test)
y_gbr=gbr.predict(X_test)
y_xgb=xgb.predict(X_test)
y_lgb=lgb.predict(X_test)
y_ada=ada.predict(X_test)
y_hgb=hgb.predict(X_test)
y_cat=cat_reg.predict(X_test)

# 성능 측정
score_etr=NMAE(y_test, y_etr.reshape(-1,1))
score_rg=NMAE(y_test, y_rg.reshape(-1,1))
score_rf=NMAE(y_test, y_rf.reshape(-1,1))
score_gbr=NMAE(y_test, y_gbr.reshape(-1,1))
score_xgb=NMAE(y_test, y_xgb.reshape(-1,1))
score_lgb=NMAE(y_test, y_lgb.reshape(-1,1))
score_ada=NMAE(y_test, y_ada.reshape(-1,1))
score_hgb=NMAE(y_test, y_hgb.reshape(-1,1))
score_cat=NMAE(y_test, y_cat.reshape(-1,1))

print('score_etr_score : ', score_etr)
print('score_rg_score : ',score_rg)
print('score_rf_score : ',score_rf)
print('score_gbr_score : ',score_gbr)
print('score_xgb_score : ',score_xgb)
print('score_lgb_score : ',score_lgb)
print('score_ada_score : ',score_ada)
print('score_hgb_score : ',score_hgb)
print('score_cat_score : ',score_cat)

In [None]:
y_pred_proba = xgb.predict_proba(x_test)

## ROC Curve, AUC
import sklearn.metrics as metrics

fpr, tpr, threshold = metrics.roc_curve(
    y_test, 
    y_pred_proba[:, 0], 
    pos_label=0) # positive label

AUC = metrics.auc(fpr, tpr)
# plotting ROC Curve
import matplotlib.pyplot as plt

plt.figure(figsize = (8, 8))
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % AUC)
plt.title(('ROC Curve of Logistic Regression'), fontsize=18)
plt.legend(loc = 'lower right')

plt.plot([0, 1], [0, 1],'r--') # random guess
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate', 
                fontsize=14)
plt.xlabel('False Positive Rate', 
                fontsize=14)

plt.show()

In [None]:
# 임계값에 따른 값
from sklearn.preprocessing import Binarizer
custom_threshold = 0.6 # 임계값 설정
pred_proba = xgb.predict_proba(X2)[:,1]
pred_proba_1 = pred_proba.reshape(-1,1)
custom_predict = Binarizer(threshold=custom_threshold).fit_transform(pred_proba_1)

# 결과값이 달라짐
get_clf_eval(y2, custom_predict) # 성능 측정

In [None]:
# 변수 중요도
importances = pd.Series(xgb.feature_importances_, X1.columns)
import matplotlib.pyplot as plt

n = 20
plt.figure(figsize=(10,n/2))

plt.title(f'Top {n} features')
importances.sort_values()[-n:].plot.barh();

In [None]:
ddf = pd.DataFrame(pred_proba_1)

ddf.to_excel('C:/Users/20229069/Desktop/predict.xlsx')

In [None]:
#교차 검증
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)
cv_accuracy = []
n_iter=0
for train_index, test_index in skf.split(X1, y1):  # feautres 데이터를 위에서 지정한 kfold 숫자로 분할
    x_train, x_test = X1.iloc[train_index,:], X1.iloc[test_index,:]
    y_train, y_test = y1.iloc[train_index], y1.iloc[test_index]
    
    xgb.fit(x_train, y_train)
    pred = xgb.predict(x_test)
    n_iter += 1
    
    accuracy = get_clf_eval(y_test, pred) # 소수점 4자리 반올림

    
    print('\n#{0} 교차 검증 정확도 : {1}'
          .format(n_iter, accuracy)) 

In [None]:
# 원핫인코딩
X1_data = pd.get_dummies(X1)
#X2_data = pd.get_dummies(X2)

In [None]:
# 데이터 표준화 - MinMaxScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
#scaler = StandardScaler()
scaler = MinMaxScaler()

# MinMaxScaler 로 데이터 셋 변환 .fit() 과 .transform() 호출.  
scaler.fit(X1)
X_scaled = scaler.transform(X1)

# transform( )시 scale 변환된 데이터 셋이 numpy ndarry로 반환되어 이를 DataFrame으로 변환
df1_scaled = pd.DataFrame(data=X_scaled, columns=X1.columns)

# MinMaxScaler 로 데이터 셋 변환 .fit() 과 .transform() 호출.  
#scaler.fit(X2_data)
#X2_data_scaled = scaler.transform(X2_data)

# transform( )시 scale 변환된 데이터 셋이 numpy ndarry로 반환되어 이를 DataFrame으로 변환
#df2_scaled = pd.DataFrame(data=X2_data_scaled, columns=X2_data.columns)

In [None]:
df1_scaled

# 여러달

In [None]:
# xgboost 사용
from xgboost import XGBClassifier
xgb=XGBClassifier(random_state = 42, max_depth=25, n_estimators=250, learning_rate=0.1)
xgb.fit(df1_scaled,y1)
y_xgb=xgb.predict(df2_scaled)

score_xgb=get_clf_eval(y2, y_xgb.reshape(-1,1))

In [None]:
# 임계값에 따른 값
from sklearn.preprocessing import Binarizer
custom_threshold = 0.01 # 임계값 설정
pred_proba = xgb.predict_proba(df2_scaled)[:,1]
pred_proba_1 = pred_proba.reshape(-1,1)
custom_predict = Binarizer(threshold=custom_threshold).fit_transform(pred_proba_1)

# 결과값이 달라짐
get_clf_eval(y2, custom_predict) # 성능 측정

In [None]:
# 변수 중요도
importances = pd.Series(xgb.feature_importances_, df1_scaled.columns)
import matplotlib.pyplot as plt

n = 20
plt.figure(figsize=(10,n/2))

plt.title(f'Top {n} features')
importances.sort_values()[-n:].plot.barh();