### 🔨01.데이터 핸들링
---

#### 01-01.Data 형태 확인

In [None]:
import pandas as pd
# Dataset dimensions as (rows, columns). Printed explicitly because a notebook
# cell only echoes its final expression, so a bare `df.shape` here shows nothing.
print(df.shape)

# Per-column dtype summary (info() prints its report directly).
df.info()

# Number of null values in each column.
df.isnull().sum()

#### 01-02.Unique한 Value별 카운팅

In [None]:
# Number of distinct values in the column. Printed explicitly because a
# notebook cell only echoes its final expression.
print(len(df['col'].unique()))

# The distinct values themselves.
df['col'].unique()

#### 01-03.DataFrame 특정값 치환

In [None]:
import numpy as np
import pandas as pd
# Replace every -200 with NaN. np.nan is used because the np.NaN alias was
# removed in NumPy 2.0. replace() returns a new DataFrame; assign the result
# if the change should be kept.
df.replace(-200, np.nan)

#### 01-04.Null 값 이전 값으로 채워넣기

In [None]:
import numpy as np
# Fill each null with the previous non-null value in the same column.
# ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated.
# Returns a new DataFrame; assign the result if the change should be kept.
df.ffill()

#### 01-05.DataFrame 특정 col만 가져오기

In [None]:
import pandas as pd
# Keep only the listed columns (all rows).
wanted_columns = ['col1', 'col2']
df = df[wanted_columns]

#### 01-06.조건에 맞는 DataFrame 출력

In [None]:
import pandas as pd
# Rows whose 'T' value lies in the closed interval [25, 27].
at_least_25 = df['T'] >= 25
at_most_27 = df['T'] <= 27
df[at_least_25 & at_most_27]

#### 01-07.오름차순, 내림차순 정렬

In [None]:
import pandas as pd
# ascending=False sorts in descending order; the default is ascending.
column_values = df['col']
column_values.sort_values(ascending=False)

#### 01-08.특정값이 포함된 Data 찾기

In [None]:
# Cast to str first so non-string values can be searched as well, then keep
# the rows whose text contains 'text'.
as_text = df['col'].astype(str)
has_text = as_text.str.contains('text')
df[has_text]

#### 01-09.특정 조건 만족하는 값, 변경하기

In [None]:
import numpy as np
# 1 where the value is <= 5, otherwise 0 (returned as a NumPy array).
is_at_most_five = df['col'] <= 5
np.where(is_at_most_five, 1, 0)

#### 01-10.groupby 활용 카운팅

In [None]:
import numpy as np
import pandas as pd
# Count how often each 'y' value occurs within every 'job' group.
y_by_job = df.groupby('job')['y']
y_by_job.value_counts()

#### 01-11.pivot table 활용 데이터 처리

In [None]:
import pandas as pd
# Reshape df_job into a pivot table.
df_job = df_job.pivot_table(
    index='index',     # column whose values become the rows
    columns='col',     # column whose values become the columns
    values='value',    # column that supplies the cell values
)

#### 01-12.inf(무한대) 데이터 null 처리

In [None]:
import numpy as np
import pandas as pd

# Turn positive and negative infinity into NaN.
infinite_values = [np.inf, -np.inf]
df['col'].replace(infinite_values, np.nan)

#### 01-13.lag 데이터 생성

In [None]:
import numpy as np
import pandas as pd
# Positive periods shift values forward (lag); negative periods shift backward (lead).
lag_periods = 1
df['col'].shift(lag_periods)

#### 01-14.중복 데이터 처리

In [None]:
import numpy as np
import pandas as pd

# Drop rows that repeat a 'col' value, keeping the first occurrence.
# inplace=True modifies df directly and returns None.
df.drop_duplicates(subset=['col'], keep='first', inplace=True)

#### 01-15.문자열 데이터 앞 공백 제거

In [None]:
import numpy as np
import pandas as pd

# Remove leading whitespace from every string in the column.
text_column = df['col']
text_column.str.lstrip()

#### 01-15.날짜 데이터 형식 변경

In [None]:
import datetime
# Render each date as a 'YYYY-MM' string.
# NOTE(review): the .dt accessor requires a datetime-like column — confirm "Date" has been parsed.
year_month_format = "%Y-%m"
df["Date"].dt.strftime(year_month_format)

#### 01-16.list 중복 없애기

In [None]:
import pandas as pd
# Concatenate both columns into one list, then de-duplicate.
# Note: the result is a set, so it carries no ordering.
all_list = [*df['start'], *df['end']]
unique_list = set(all_list)

### 📈02.데이터 시각화
---

#### 02-01.Numeric(연속형) 변수 분포 확인

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use(['dark_background'])

# Distribution plot of the column
column_values = df['col']
sns.displot(column_values);

# Report the mean alongside the plot
print("col :", column_values.mean())

#### 02-02.Plot size 조절

In [None]:
import matplotlib.pyplot as plt
# Resize the current figure: width 20 inches, height 5 inches.
current_figure = plt.gcf()
current_figure.set_size_inches(20, 5)

#### 02-03.산점도(Scatter plot) 그리기

In [None]:
import seaborn as sns
# x: horizontal axis, y: vertical axis, hue: column used to color the points.
sns.scatterplot(data=df, x='x', y='y', hue='hue')

#### 02-04.기본 line plot 그리기

In [None]:
import matplotlib.pyplot as plt
# Basic line plot of y against x with a legend label.
x_values, y_values = df['x'], df['y']
plt.plot(x_values, y_values, label='label')

#### 02-05.for문 활용 distplot 다중 출력

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns  # was missing: the loop below calls sns

# Draw one distribution per column on a 3x4 grid. Columns are taken by
# position 1..12, so the column at position 0 is skipped.
# NOTE(review): assumes df has at least 13 columns and that columns 1..12 are numeric — confirm.
for i in range(1, 13):
    plt.subplot(3, 4, i)
    plt.grid(False)
    # distplot is deprecated in seaborn; a density-scaled histplot with a KDE
    # overlay is the documented replacement.
    sns.histplot(df.iloc[:, i], kde=True, stat='density', kde_kws={'cut': 3})

plt.gcf().set_size_inches(20, 10)
plt.tight_layout()
plt.show()

#### 02-06.이중 축 그래프 그리기

In [None]:
import matplotlib.pyplot as plt

# Left-hand y axis
fig, ax1 = plt.subplots()
ax1.plot(df['x'], df['y'], label='label1', color='green')

# Right-hand y axis sharing the same x axis
# NOTE(review): both axes plot df['y']; presumably a second column is meant for ax2 — confirm.
ax2 = ax1.twinx()
ax2.plot(df['x'], df['y'], label='label2', color='deeppink')

# One figure-level legend that collects the labels from both axes
fig.legend()
fig.set_size_inches(25, 5)
plt.show()

#### 02-07.pairplot 상관관계 분석

In [None]:
import matplotlib.pyplot as plt  # required for plt.show() below
import seaborn as sns

# Scatter plots for every pairwise combination of the selected columns
df_pair = df[['col1', 'col2', 'col3', 'col4']]
sns.pairplot(df_pair)
plt.show()

#### 02-08.Heat map 상관관계 분석

In [None]:
import seaborn as sns
# Correlation coefficient for every pair of the selected columns,
# drawn on a fixed -1..+1 color scale with the values annotated.
pair_columns = ['col1', 'col2', 'col3', 'col4']
df_pair = df[pair_columns]
correlation_matrix = df_pair.corr()
sns.heatmap(correlation_matrix, vmin=-1, vmax=+1, annot=True, cmap='coolwarm');

#### 02-09.그래프에 수직, 수평선 추가 및 길이 조절

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.scatterplot(data=df, x='x', y='y', s=50, linewidth=0);

# Shared look for the four red boundary segments
line_style = dict(color='r', linewidth=2)

# Vertical segments at x = -2 and x = 2, spanning y from -2 to 2
for x_position in (-2, 2):
    plt.vlines(x_position, ymin=-2, ymax=2, **line_style)

# Horizontal segments at y = -2 and y = 2, spanning x from -2 to 2
for y_position in (-2, 2):
    plt.hlines(y_position, xmin=-2, xmax=2, **line_style)

#### 02-10.catplot 그리기

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use(['dark_background'])

# Count plot of "x", split by "y" through the hue channel
count_plot_options = dict(kind="count", palette="pastel", edgecolor=".6")
sns.catplot(data=df, x="x", hue="y", **count_plot_options);
plt.gcf().set_size_inches(25, 3)

#### 02-11.그래프 특정 값에 색상 입히기

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# One color per row: red where Volume_issue equals 1, gray elsewhere.
has_volume_issue = df['Volume_issue'] == 1
df['vol_color'] = np.where(has_volume_issue, 'red', 'gray')
colors = df['vol_color'].tolist()
print(colors)

plt.figure(figsize=(10, 8))

# Upper panel: closing price as a thin line with small markers
price_ax = plt.subplot(2, 1, 1)
price_ax.plot(df['Date'], df['Close'], 'o-', ms=1, lw=0.5, label='Close')
price_ax.legend()


# Lower panel: volume bars, each drawn in its row's color
volume_ax = plt.subplot(2, 1, 2)
volume_ax.bar(df['Date'], df['Volume'], label='volume', color=colors)
volume_ax.legend()

#### 02-12.bar plot 그리기

In [None]:
# barplot, order 옵션을 활용하여 가독성 Up
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use(['dark_background'])

# Explicit category order so the x axis reads January through December
month_order = [
    '01.January', '02.February', '03.March', '04.April',
    '05.May', '06.June', '07.July', '08.August',
    '09.September', '10.October', '11.November', '12.December',
]
sns.barplot(data=df_reservation, x='arrival_date_month', y='hotel',
            hue='arrival_date_year', order=month_order);
plt.gcf().set_size_inches(20, 5);

### 💻03.데이터 분석 및 모델링
---

#### 03-01.Train/Test set 분할

In [None]:
# 모델링을 학습하기 위한 Feature(X)와 Y데이터를 구분하는 단계
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Separate the feature matrix from the target column.
X = df_merge.drop(columns=['y'])
Y = df_merge['y']

# Hold out 30% of the rows for testing, stratified on Y.
split_parts = train_test_split(X, Y, test_size=0.3, stratify=Y)
x_train, x_test, y_train, y_test = split_parts

# Shapes of the train features/target, then the test features/target.
for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

#### 03-02.모델 학습 및 예측

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train the model (fit() returns the fitted estimator itself)
rfc = RandomForestClassifier(random_state=123456).fit(x_train, y_train)

# Predict on both the training data and the test data so the two scores can
# be compared (used to judge overfitting).
y_pred_train, y_pred_test = (rfc.predict(features) for features in (x_train, x_test))


#### 03-03.이진분류 모델 성능 확인

In [None]:
from sklearn.metrics import classification_report
# Report for the training split first, then for the test split.
for actual, predicted in ((y_train, y_pred_train), (y_test, y_pred_test)):
    print(classification_report(actual, predicted))

#### 03-04.하이퍼 파라미터 튜닝

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; every combination is evaluated.
params = {
    'n_estimators': [400, 500],
    'max_depth': [6, 8, 10, 12],
}

# Build the RandomForestClassifier, then run a 3-fold grid search scored on recall.
rf_clf = RandomForestClassifier(random_state=123456, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=3, n_jobs=-1, scoring='recall')
grid_cv.fit(x_train, y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric, which
# is recall here, so the label must not call it accuracy.
print('최고 Recall(CV 평균): {:.4f}'.format(grid_cv.best_score_))

#### 03-05.중요 변수 파악(Feature Importance)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use(['dark_background'])

import pandas as pd  # required for pd.Series below; this cell did not import it

# Replace `rfc` with the name of your own fitted model.
ftr_importances_values = rfc.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index=x_train.columns)
# Keep the 20 largest importances (fewer if the model has fewer features).
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

plt.figure(figsize=(8, 6))
plt.title('Feature Importances')
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.show()

#### 03-06.모델 Save & Read

In [None]:
import pickle
# Save the model: serialize it to an in-memory bytes object (nothing is written to disk here).
# NOTE(review): `model` is not defined in this snippet — presumably a fitted estimator such as `rfc`; confirm.
saved_model = pickle.dumps(model)

# Read the model back from the pickled bytes.
# NOTE: only unpickle data from a trusted source — pickle.loads can execute arbitrary code.
model_from_pickle = pickle.loads(saved_model)

#### 03-07.상관계수 값 출력

In [None]:
import scipy.stats as stats
# Pearson correlation between the two columns; the result holds the
# coefficient and its p-value.
x_values, y_values = df['x'], df['y']
stats.pearsonr(x_values, y_values)

#### 03-08.Regressor(회귀) 모델 학습 및 평가

In [None]:
# 모델링을 학습하기 위한 Fearue(X)와 Y데이터를 구분하는 단계
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

# Separate the features (X) from the target column (Y).
X = df.drop(['y'], axis=1)
Y = df['y']

# Hold out 30% of the rows as a test set.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

# Train a RandomForestRegressor with default hyperparameters.
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)

# Predict on both the training data and the test data so the two sets of
# scores can be compared (used to judge overfitting).
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred_train = rfr.predict(x_train)
y_pred_test = rfr.predict(x_test)

# Evaluation. MSE must come from mean_squared_error: the previous code took
# the square root of the mean ABSOLUTE error and reported it as RMSE.
mse_train = mean_squared_error(y_train, y_pred_train)
print('mse_train(mse): ', mse_train)
rmse_train = np.sqrt(mse_train)
print('rmse_train(rmse): ', rmse_train)
r2_train = r2_score(y_train, y_pred_train)
print('r2_train(r2): ', r2_train)
print('')
mse_test = mean_squared_error(y_test, y_pred_test)
print('mse_test(mse): ', mse_test)
rmse_test = np.sqrt(mse_test)
print('rmse_test(rmse): ', rmse_test)
r2_test = r2_score(y_test, y_pred_test)
print('r2_test(r2): ', r2_test)

#### 03-09.표준화 및 PCA 차원축소

In [None]:
import pandas as pd  # required for pd.DataFrame below; this cell did not import it
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features before running PCA.
# NOTE(review): `x` is not defined in this snippet — presumably the numeric feature matrix; confirm.
x = StandardScaler().fit_transform(x)

# Project onto the first two principal components and label the columns.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

#### 03-10.선형회귀활용 모델링

In [None]:
# 모델링을 학습하기 위한 Fearue(X)와 Y데이터를 구분하는 단계
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Separate the features (X) from the target column (Y).
X = df.drop(['y'], axis=1)
Y = df['y']

# Hold out 30% of the rows as a test set.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

# Fit a linear-regression (LR) model.
from sklearn.linear_model import LinearRegression
mlr = LinearRegression()
mlr.fit(x_train, y_train)

# Predict on both the training data and the test data so the two sets of
# scores can be compared (used to judge overfitting).
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred_train = mlr.predict(x_train)
y_pred_test = mlr.predict(x_test)

# Evaluation. MSE must come from mean_squared_error: the previous code took
# the square root of the mean ABSOLUTE error and reported it as RMSE.
mse_train = mean_squared_error(y_train, y_pred_train)
print('mse_train(mse): ', mse_train)
rmse_train = np.sqrt(mse_train)
print('rmse_train(rmse): ', rmse_train)
r2_train = r2_score(y_train, y_pred_train)
print('r2_train(r2): ', r2_train)
print('')
mse_test = mean_squared_error(y_test, y_pred_test)
print('mse_test(mse): ', mse_test)
rmse_test = np.sqrt(mse_test)
print('rmse_test(rmse): ', rmse_test)
r2_test = r2_score(y_test, y_pred_test)
print('r2_test(r2): ', r2_test)

#### 03-11.선형회귀 회귀계수 확인

In [None]:
import pandas as pd  # required for pd.DataFrame below; this cell did not import it

# One row per feature with its fitted linear-regression coefficient.
# A DataFrame built this way already has a default 0..n-1 index, so no
# reset_index() is needed.
df_coef = pd.DataFrame({'col': X.columns, 'coef': mlr.coef_})
df_coef

#### 03-12.light gbm 활용 모델링

In [None]:
# ▶ 모델링을 학습하기 위한 Fearue(X)와 Y데이터를 구분하는 단계
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split

# Separate the features (X) from the target column (Y).
X = df.drop(['y'], axis=1)
Y = df['y']

# train/test split (30% held out for testing)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

# Wrap the training data in LightGBM's Dataset format.
d_train = lgb.Dataset(x_train, label=y_train)

# Training parameters
params = {
    'learning_rate': 0.02,
    'boosting_type': 'gbdt',      # gradient boosting decision tree
    'objective': 'binary',
    'metric': 'binary_logloss',   # metric for the binary objective
    'max_depth': 5,
    'num_leaves': 32,
    'seed': 23456,
}

# Train for 1000 boosting rounds.
clf = lgb.train(params, d_train, 1000)

from sklearn.metrics import classification_report

# predict() returns one score per row; scores >= 0.5 become class 1, the rest
# class 0. The vectorized comparison replaces the element-by-element loops
# and yields integer labels instead of 0.0 / 1.0 floats.
threshold = 0.5
y_pred_train = (clf.predict(x_train) >= threshold).astype(int)
y_pred_test = (clf.predict(x_test) >= threshold).astype(int)

print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))

#### 03-13.연속형, 범주형 변수 list 나누기

In [None]:
import numpy as np
import pandas as pd
# Split the column names by dtype: object ('O') columns are treated as
# categorical, every other dtype as numeric. Column order is preserved.
categoical_list = [name for name in df.columns if df[name].dtypes == 'O']
numeric_list = [name for name in df.columns if not (df[name].dtypes == 'O')]

#### 03-14.AUROC score 출력하기

In [None]:
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is taken as the score for each row.
train_probabilities = rfc.predict_proba(x_train)
test_probabilities = rfc.predict_proba(x_test)
y_pred_train_proba = train_probabilities[:, 1]
y_pred_test_proba = test_probabilities[:, 1]

# Area under the ROC curve for each split
roc_score_train = roc_auc_score(y_train, y_pred_train_proba)
roc_score_test = roc_auc_score(y_test, y_pred_test_proba)

print("roc_score_train :", roc_score_train)
print("roc_score_test :", roc_score_test)

#### 03-15.Label encoder 활용 범주형 데이터 처리

In [None]:
from sklearn.preprocessing import LabelEncoder

for col in categoical_list:
    print(col)
    # Fit on the train and test values together so that both splits share
    # one mapping and every test value has been seen by the encoder.
    combined_values = list(x_train[col].values) + list(x_test[col].values)
    le = LabelEncoder().fit(combined_values)
    x_train[col] = le.transform(x_train[col])
    x_test[col] = le.transform(x_test[col])

#### 03-16.ROC 커브 그리기

In [None]:
import matplotlib.pyplot as plt  # used by roc_curve_plot; this cell did not import it
import numpy as np               # used by roc_curve_plot; this cell did not import it
from sklearn.metrics import roc_curve


def roc_curve_plot(y_test, pred_proba_c1):
    """Plot the ROC curve with a diagonal reference line.

    Args:
        y_test: True binary labels.
        pred_proba_c1: Predicted score or probability of the positive class
            for each sample, in the same order as ``y_test``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # FPR and TPR at each decision threshold.
    # FPR: share of actual negatives wrongly predicted positive (1 - specificity).
    # TPR: recall.
    fprs, tprs, _thresholds = roc_curve(y_test, pred_proba_c1)

    # ROC curve itself.
    plt.plot(fprs, tprs, label='ROC')
    # Diagonal reference line. The style is given by keywords only: the old
    # 'k--' format string also set a color that conflicted with color='red'.
    plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Random')

    # Fix both axes to [0, 1] and tick the x axis every 0.1. The ticks are
    # built from the fixed range rather than the auto-scaled limits, which
    # include padding and therefore put ticks at 0.05, 0.15, ...
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xticks(np.round(np.linspace(0, 1, 11), 2))
    plt.xlabel('FPR( 1 - Specificity )')
    plt.ylabel('TPR( Recall )')
    plt.legend()
    plt.show()


roc_curve_plot(y_test, y_pred_test_proba)

#### 03-17.min-max scale 활용 정규화

In [None]:
from sklearn.preprocessing import minmax_scale

# Rescale Recency to the 0..1 range; copy=True leaves the input values untouched.
recency_values = rfm['Recency']
rfm['Recency'] = minmax_scale(recency_values, axis=0, copy=True)