In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [4]:
Cust = pd.read_csv('lv2-2401.csv')
Cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
C_ID        2000 non-null int64
GENDER      2000 non-null object
AGE         2000 non-null int64
INCOME      2000 non-null int64
JOB         1965 non-null object
WORK_EXP    2000 non-null int64
FAMILY      2000 non-null int64
SPENDING    2000 non-null float64
GRADE       2000 non-null object
dtypes: float64(1), int64(5), object(3)
memory usage: 140.8+ KB


# 1

## 1-1

In [5]:
def get_age_p(age):
    if age < 20:
        return '10s'
    elif age < 30:
        return '20s'
    elif age < 40:
        return '30s'
    elif age < 50:
        return '40s'
    elif age < 60:
        return '50s'
    elif age >= 60:
        return 'other'

In [7]:
Cust['AGE_P'] = Cust['AGE'].map(get_age_p)

In [8]:
Cust.head()

Unnamed: 0,C_ID,GENDER,AGE,INCOME,JOB,WORK_EXP,FAMILY,SPENDING,GRADE,AGE_P
0,1,Male,19,15000,B,1,4,11.7,prestige,10s
1,2,Male,21,35000,D,3,3,24.3,royal blue,20s
2,3,Female,20,86000,D,1,1,1.8,general,20s
3,4,Female,23,59000,G,0,2,23.1,royal blue,20s
4,5,Female,31,38000,C,2,6,12.0,prestige,30s


## 1-2

In [10]:
Cust.groupby('AGE_P')[['SPENDING', 'C_ID']]\
    .agg({'SPENDING': 'mean', 'C_ID': 'size'})\
    .rename(columns={'SPENDING': 'AVG_SPEND', 'C_ID': 'CUST_CNT'})

Unnamed: 0_level_0,AVG_SPEND,CUST_CNT
AGE_P,Unnamed: 1_level_1,Unnamed: 2_level_1
10s,16.125789,380
20s,15.665403,211
30s,15.13444,241
40s,14.43232,362
50s,15.314179,402
other,15.138861,404


In [15]:
Agg24 = Cust.groupby('AGE_P')[['SPENDING', 'C_ID']]\
            .apply(lambda x: pd.Series({'AVG_SPEND' : x['SPENDING'].mean(),
                                        'CUST_CNT': x['C_ID'].size}))

## 1-3

In [18]:
Agg24.sort_values(by='AVG_SPEND', ascending=False)

Unnamed: 0_level_0,AVG_SPEND,CUST_CNT
AGE_P,Unnamed: 1_level_1,Unnamed: 2_level_1
10s,16.125789,380.0
20s,15.665403,211.0
50s,15.314179,402.0
other,15.138861,404.0
30s,15.13444,241.0
40s,14.43232,362.0


# 2

In [19]:
df2 = Cust.copy()

## 2-1

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
df2.columns

Index(['C_ID', 'GENDER', 'AGE', 'INCOME', 'JOB', 'WORK_EXP', 'FAMILY',
       'SPENDING', 'GRADE', 'AGE_P'],
      dtype='object')

In [22]:
col = ['AGE', 'INCOME', 'WORK_EXP', 'FAMILY', 'SPENDING']
len(col)

5

In [27]:
df2[col] = pd.DataFrame(StandardScaler().fit_transform(df2[col]), columns=col)

In [28]:
df2.head()

Unnamed: 0,C_ID,GENDER,AGE,INCOME,JOB,WORK_EXP,FAMILY,SPENDING,GRADE,AGE_P
0,1,Male,-1.098933,-2.093501,B,-0.791207,0.117497,-0.428339,prestige,10s
1,2,Male,-0.99692,-1.656133,D,-0.281162,-0.390051,1.075546,royal blue,20s
2,3,Female,-1.047927,-0.540845,D,-0.791207,-1.405148,-1.609962,general,20s
3,4,Female,-0.894907,-1.131292,G,-1.04623,-0.897599,0.932319,royal blue,20s
4,5,Female,-0.486856,-1.590528,C,-0.536185,1.132594,-0.392532,prestige,30s


## 2-2
## 2-3

In [30]:
df2 = df2.sort_values(by='C_ID')

In [37]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [46]:
result = pd.DataFrame()

for i in [3, 4]:
    model = KMeans(n_clusters=i, random_state=1234, init='random').fit(df2[col])
    pred = model.predict(df2[col])
    score = silhouette_score(df2[col], pred)
    
    result = result.append({'n_cluster': i, 'score': score}, ignore_index=True)

In [53]:
result['score'].max().round(3)

0.158

# 3

## 3-1

In [55]:
Cust = Cust[~(Cust['AGE'] < 14) & (Cust['JOB'].notna())]

## 3-2

In [58]:
Cust = pd.get_dummies(Cust, columns=['GENDER', 'JOB'], drop_first=True)

## 3-3

In [61]:
TrainSet6 = Cust[Cust['C_ID'] % 3 != 0]
TestSet6 = Cust[Cust['C_ID'] % 3 == 0]

In [62]:
print(TrainSet6.shape, TestSet6.shape)

(1148, 17) (566, 17)


## 3-4

In [63]:
from sklearn.tree import DecisionTreeClassifier

In [65]:
col = ['AGE', 'INCOME', 'WORK_EXP', 'FAMILY',
       'GENDER_Male', 'JOB_B', 'JOB_C', 'JOB_D', 'JOB_E', 'JOB_F',
       'JOB_G', 'JOB_H', 'JOB_I']
len(col)

13

In [66]:
model = DecisionTreeClassifier(max_depth=5, random_state=1234)\
            .fit(X=TrainSet6[col], y=TrainSet6['GRADE'])

## 3-5

In [68]:
pred = model.predict(TestSet6[col])

In [69]:
from sklearn.metrics import accuracy_score

In [72]:
round(accuracy_score(y_true=TestSet6['GRADE'], y_pred=pred) * 100, 2)

33.22