#### Data Preparation
- 이상치 판별과 처리
- Scalling
- 경우의 수 Colum 처리
- 교차 검증

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('test3.csv')

In [3]:
df.loc[df['tenure'] == 0, 'TotalCharges'] = 0
df['TotalCharges'] = df['TotalCharges'].apply(lambda x : float(x))
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2279.734304
std,0.368612,24.559481,30.090047,2266.79447
min,0.0,0.0,18.25,0.0
25%,0.0,9.0,35.5,398.55
50%,0.0,29.0,70.35,1394.55
75%,0.0,55.0,89.85,3786.6
max,1.0,72.0,118.75,8684.8


In [4]:
df['Churn_num'] = df['Churn'].map( {'Yes':1, 'No':0} )
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn_num
count,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2279.734304,0.26537
std,0.368612,24.559481,30.090047,2266.79447,0.441561
min,0.0,0.0,18.25,0.0,0.0
25%,0.0,9.0,35.5,398.55,0.0
50%,0.0,29.0,70.35,1394.55,0.0
75%,0.0,55.0,89.85,3786.6,1.0
max,1.0,72.0,118.75,8684.8,1.0


#### 이상치 (Outlier)
- IQR
- MAD
- MCOD

#### 식별된 이상치가 있는 경우 대처법
- 경우의 수가 적으면 지움
- 경계값으로 대체 (Winsorize)
- Business Understanding에서 설정한 방향성이 이상치 탐지인 경우 → label

#### IQR (Inter Quartile Range)
- Range를 벗어나면 이상치로 판단
- 25%부터 75%까지의 구간
- Range : (25% - 1.5 * IQR) 부터 (75% + 1.5 * IQR)까지의 구간

In [5]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn_num
count,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2279.734304,0.26537
std,0.368612,24.559481,30.090047,2266.79447,0.441561
min,0.0,0.0,18.25,0.0,0.0
25%,0.0,9.0,35.5,398.55,0.0
50%,0.0,29.0,70.35,1394.55,0.0
75%,0.0,55.0,89.85,3786.6,1.0
max,1.0,72.0,118.75,8684.8,1.0


In [6]:
desc = df.describe()['TotalCharges']

IQR = 1.5 * (desc['75%'] - desc['25%'])   # 1.5*IQR
upper_lim = desc['75%'] + IQR
lower_lim = desc['25%'] - IQR

In [7]:
# 이상치가 있는지 판단

df.loc[(df['TotalCharges'] > upper_lim) | (df['TotalCharges'] < lower_lim)]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Churn_num


#### Scaling
- min-max : 최소를 0으로, 최대를 1로 만드는 방식
- standard : 평균은 0으로, 표준편차를 1로 만드는 방식
- robust : 중간값은 0으로, IQR은 1로 만드는 방식
- 이상치에 미치는 영향도 : min-max > standard > robust

In [11]:
import numpy as np

In [13]:
# min-max
minus_min = df['tenure'] - np.min(df['tenure'])
scaled = minus_min / np.max(minus_min)
scaled

0       0.013889
1       0.472222
2       0.027778
3       0.625000
4       0.027778
          ...   
7038    0.333333
7039    1.000000
7040    0.152778
7041    0.055556
7042    0.916667
Name: tenure, Length: 7043, dtype: float64

In [14]:
df['tenure_scaled'] = scaled
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn_num,tenure_scaled
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2279.734304,0.26537,0.449599
std,0.368612,24.559481,30.090047,2266.79447,0.441561,0.341104
min,0.0,0.0,18.25,0.0,0.0,0.0
25%,0.0,9.0,35.5,398.55,0.0,0.125
50%,0.0,29.0,70.35,1394.55,0.0,0.402778
75%,0.0,55.0,89.85,3786.6,1.0,0.763889
max,1.0,72.0,118.75,8684.8,1.0,1.0


In [15]:
# standard
minus_mean = df['MonthlyCharges'] - np.mean(df['MonthlyCharges'])
df['MonthlyCharges_scaled'] = minus_mean / np.std(minus_mean)
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn_num,tenure_scaled,MonthlyCharges_scaled
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2279.734304,0.26537,0.449599,-1.513422e-15
std,0.368612,24.559481,30.090047,2266.79447,0.441561,0.341104,1.000071
min,0.0,0.0,18.25,0.0,0.0,0.0,-1.54586
25%,0.0,9.0,35.5,398.55,0.0,0.125,-0.9725399
50%,0.0,29.0,70.35,1394.55,0.0,0.402778,0.1857327
75%,0.0,55.0,89.85,3786.6,1.0,0.763889,0.8338335
max,1.0,72.0,118.75,8684.8,1.0,1.0,1.794352


In [16]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [17]:
yes_no_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for tgt in yes_no_cols:
    mapper = {'No':0, 'Yes':1}
    df[tgt + '_num'] = df[tgt].map(mapper)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,MonthlyCharges,TotalCharges,Churn,Churn_num,tenure_scaled,MonthlyCharges_scaled,Partner_num,Dependents_num,PhoneService_num,PaperlessBilling_num
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,29.85,29.85,No,0,0.013889,-1.160323,1,0,0,1
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,56.95,1889.5,No,0,0.472222,-0.259629,0,0,1,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,53.85,108.15,Yes,1,0.027778,-0.36266,0,0,1,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,42.3,1840.75,No,0,0.625,-0.746535,0,0,0,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,70.7,151.65,Yes,1,0.027778,0.197365,0,0,1,1


In [18]:
set(df['InternetService'])

{'DSL', 'Fiber optic', 'No'}

In [21]:
mapper = {'No':0, 'DSL':0.5, 'Fiber optic':1}
df['InternetSpeed'] = df['InternetService'].map(mapper)
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn_num,tenure_scaled,MonthlyCharges_scaled,Partner_num,Dependents_num,PhoneService_num,PaperlessBilling_num,InternetSpeed
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2279.734304,0.26537,0.449599,-1.513422e-15,0.483033,0.299588,0.903166,0.592219,0.611458
std,0.368612,24.559481,30.090047,2266.79447,0.441561,0.341104,1.000071,0.499748,0.45811,0.295752,0.491457,0.389438
min,0.0,0.0,18.25,0.0,0.0,0.0,-1.54586,0.0,0.0,0.0,0.0,0.0
25%,0.0,9.0,35.5,398.55,0.0,0.125,-0.9725399,0.0,0.0,1.0,0.0,0.5
50%,0.0,29.0,70.35,1394.55,0.0,0.402778,0.1857327,0.0,0.0,1.0,1.0,0.5
75%,0.0,55.0,89.85,3786.6,1.0,0.763889,0.8338335,1.0,1.0,1.0,1.0,1.0
max,1.0,72.0,118.75,8684.8,1.0,1.0,1.794352,1.0,1.0,1.0,1.0,1.0


In [22]:
df.describe().columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'Churn_num', 'tenure_scaled', 'MonthlyCharges_scaled', 'Partner_num',
       'Dependents_num', 'PhoneService_num', 'PaperlessBilling_num',
       'InternetSpeed'],
      dtype='object')

In [23]:
interested = ['SeniorCitizen', 'TotalCharges', 'Churn_num',
              'tenure_scaled', 'MonthlyCharges_scaled',
              'Partner_num', 'Dependents_num', 'PhoneService_num',
              'PaperlessBilling_num', 'InternetSpeed']
df[interested].to_csv('test3_modify.csv', index=False)

In [24]:
df = pd.read_csv('test3_modify.csv')
df.columns

Index(['SeniorCitizen', 'TotalCharges', 'Churn_num', 'tenure_scaled',
       'MonthlyCharges_scaled', 'Partner_num', 'Dependents_num',
       'PhoneService_num', 'PaperlessBilling_num', 'InternetSpeed'],
      dtype='object')

In [26]:
label = 'Churn_num'
features = ['SeniorCitizen', 'TotalCharges', 'tenure_scaled',
       'MonthlyCharges_scaled', 'Partner_num', 'Dependents_num',
       'PhoneService_num', 'PaperlessBilling_num', 'InternetSpeed']

X, Y = df[features], df[label]

#### 교차검증 (Cross-Validation)
1. row 순서를 섞음
2. 데이터셋을 k등분 (k-fold)
3. 첫 번째 데이터셋을 test용으로, 나머지를 train용으로 사용
4. train 데이터로 학습하고, test 데이터로 채점해서 점수 확인
5. 두 번째 데이터셋을 test용으로 하고, 3, 4단계를 k번 반복
6. 평가지표 점수가 k개 생김 → 평균 (cv-score)

In [27]:
from sklearn.model_selection import cross_val_score as cvs
from sklearn.ensemble import RandomForestClassifier as rf

In [28]:
model = rf()

In [29]:
cvs(model, X, Y, scoring='precision', cv=5)

array([0.58306189, 0.60750853, 0.57875458, 0.59121622, 0.61764706])

#### 군집 알고리즘
- 유사한 그룹끼리 묶는다

In [30]:
from sklearn.cluster import KMeans

In [31]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)

KMeans(n_clusters=4)

In [32]:
kmeans.labels_

array([3, 1, 3, ..., 3, 3, 0])