# In [None]:
import pandas as pd
import numpy as np
import math

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder
from sklearn.model_selection import cross_val_score,train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,recall_score,precision_score



# Load the training features and tag every row with index = 1; the test
# features get index = 0. The flag is what later lets the two sets be told
# apart once they have been stacked into a single frame.
X_train = pd.read_csv('./data/X_train.csv').assign(index=1)
print(X_train.describe())

X_test = pd.read_csv('./data/X_test.csv').assign(index=0)
# Keep the test customer ids aside before the column is dropped from the
# feature matrix.
cust_id = X_test['cust_id']

# Training labels (still carrying their cust_id column at this point).
y_train = pd.read_csv('./data/y_train.csv')

# Stack train and test rows so that every transformation below is applied
# identically to both; the 'index' flag column (1 = train, 0 = test) added
# when the frames were loaded is what separates them again afterwards.
X = pd.concat([X_train, X_test], axis=0)
# Missing values are treated as zero.
X = X.fillna(0)
X.drop('최대구매액', axis='columns', inplace=True)
# One-hot encode the remaining non-numeric columns.
X = pd.get_dummies(X)

# Negative amounts are clipped to zero before the log transform. log1p
# already computes log(1 + x), so no extra "+ 1" is added: a zero amount maps
# to exactly 0.0, which the refund flag below relies on.
X['총구매액'] = np.log1p(X['총구매액'].clip(lower=0))
X['환불금액'] = np.log1p(X['환불금액'].clip(lower=0))

# Refund indicator: 0 when the (clipped, log-transformed) refund amount is
# zero, 1 otherwise. With the previous log1p(x + 1) transform the value could
# never be 0, so this flag was 1 for every row.
X['환불여부'] = np.where(X['환불금액'] == 0, 0, 1)

# Rescale the weekend-visit ratio by a factor of 100.
X['주말방문비율'] = X['주말방문비율'] * 100
# NOTE(review): both operands are log-transformed here, so this is the log of
# a ratio rather than the log of a net amount — confirm that is intended.
X['실제구매액'] = X['총구매액'] - X['환불금액']
X.drop(['cust_id', '환불금액'], axis='columns', inplace=True)

# scalar = StandardScaler()
# scalar.fit(np.array(X))

# Split the combined frame back into train and test rows using the 'index'
# flag column (1 = train, 0 = test), then discard the flag.
# A boolean mask on the column is used instead of query('index==...') so the
# name cannot be confused with the DataFrame's own index, and drop() returns
# new frames rather than mutating the filtered selections in place (which can
# raise SettingWithCopyWarning).
X_train = X[X['index'] == 1].drop(columns='index')
X_test = X[X['index'] == 0].drop(columns='index')

# Remove the customer id from the label frame; y_train stays a DataFrame
# because later code flattens it with `.values.ravel()`.
y_train = y_train.drop(columns='cust_id')

# Baseline candidates, all built with their default hyperparameters. The
# label list and the estimator list are kept in matching order.
clf_labels = ['LR', 'SGD', 'RF', 'GB', 'KNN', 'NB']
clf1, clf2, clf3 = LogisticRegression(), SGDClassifier(), RandomForestClassifier()
clf4, clf5, clf6 = GradientBoostingClassifier(), KNeighborsClassifier(), GaussianNB()
clfs = [clf1, clf2, clf3, clf4, clf5, clf6]

# Cross-validated regularised logistic regressions; they are only constructed
# here and are not fitted anywhere in this section.
lassocv = LogisticRegressionCV(
    cv=10,
    penalty='l1',
    l1_ratios=[0.1, 0.5, 0.9],
    solver='saga',
)
elasticcv = LogisticRegressionCV(
    cv=10,
    penalty='elasticnet',
    l1_ratios=[0.1, 0.5, 0.9],
    solver='saga',
)

# Report the mean 3-fold cross-validated accuracy of each baseline.
y_flat = y_train.values.ravel()  # 1-D label vector, identical for every model
for label, clf in zip(clf_labels, clfs):
    scores = cross_val_score(clf, X_train, y_flat, cv=3, scoring='accuracy')
    print(f"Accuracy: {scores.mean():.4f}  \t [{label}]")

# Hold-out evaluation of a default logistic regression on a 70/30 split of the
# training data.
xtr, xts, ytr, yts = train_test_split(X_train, y_train, test_size=0.3)
model = LogisticRegression()

# The labels arrive as a single-column DataFrame; flatten them to 1-D, which
# is the shape scikit-learn estimators and metrics expect.
model.fit(xtr, ytr.values.ravel())
y_prob = model.predict_proba(xts)
y_pred = model.predict(xts)

y_true = yts.values.ravel()
# Column 1 of predict_proba is the probability of the greater class label,
# which is the score roc_auc_score needs for a binary problem.
y_score = y_prob[:, 1]

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps recall and precision, and ROC AUC must be computed from scores rather
# than from hard predictions.
print(
    f'accuracy : {accuracy_score(y_true, y_pred):.4f},'
    f'f1 : {f1_score(y_true, y_pred):.4f},'
    f'rocauc : {roc_auc_score(y_true, y_score):.4f},'
    f'recall : {recall_score(y_true, y_pred):.4f},'
    f'precision : {precision_score(y_true, y_pred):.4f}'
)

# model = LogisticRegression()
# model.fit(X_train,y_train.values.ravel())
# y_pred = model.predict_proba(X_test)
# y_pred_men = [i[1] for i in y_pred]
# result = pd.DataFrame({'custid':cust_id,
# 						           'gender':y_pred_men})
# print(result)
# result.to_csv('./data/y_test.csv',header=True,index=False)