In [20]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

import pandas as pd
import numpy
import tensorflow as tf
import matplotlib.pyplot as plt 
import seaborn as sns 

# Seed configuration.
# `seed` is the value passed as random_state to train_test_split further down.
# NOTE(review): the NumPy and TensorFlow global generators are seeded with a
# different literal (3) than `seed` (0) - confirm this is intentional.
seed=0
numpy.random.seed(3)
tf.random.set_seed(3)

In [21]:
# Load the customer dataset from CSV and preview the first 10 rows.
df = pd.read_csv('data/Automobile_customers.csv')
df.head(10)

Unnamed: 0,Customer ID,Age,Income_Bucket,Gender,State,Region,Marital_Status,No_of_children,Occupation,Vehicle_Segment,No_of_months,Hobbies,On_Call_Offer
0,46756841,28,2,M,Q,Sub-urban#,1,4,White Collar Clerical,SUV,39,Swimming,Plan 1
1,48408855,25,3,M,A,Sub-urban#,2,3,Engineer,Compact SUV,14,Painting,Plan 5
2,23645631,58,2,M,Z,Urban#,2,4,Defence,Car,24,Reading Books,Plan 5
3,65645152,37,1,M,W,Sub-urban#,2,4,Sales Executive,Pick up truck,27,Gourmet Cooking,Plan 4
4,52236057,64,2,M,S,Sub-urban,1,4,Chef,Pick up truck,56,Music,Plan 4
5,80008322,25,1,M,X,Urban,2,2,Investment Banker,Pick up truck,54,Watching movies,Plan 5
6,12663567,32,1,M,E,Sub-urban,2,1,Orthopeadic,Pick up truck,31,Football,Plan 4
7,62204295,59,3,M,D,Sub-urban,1,3,Gardener,Pick up truck,56,Hockey,Plan 4
8,92845629,56,2,M,C,Urban,1,2,Interior designer,Pick up truck,28,Hiking,Plan 5
9,76395421,38,2,M,R,Sub-urban,1,1,Architect,Pick up truck,46,Para-gliding,Plan 4


In [22]:
df.info()
# Gender, State, Region, Occupation, Vehicle_Segment, Hobbies and On_Call_Offer
# are object dtype, so their contents were checked directly in the CSV file.
# Gender: takes the values F, M, !M, MA. Only F is female; the rest are male.
# State: letters A-Z whose meaning is unknown, so the column is not used.
# Region: the area type; values are Urban, Sub-urban#, Urban%, rural.
#         The '#' and '%' characters are to be removed.
# Occupation: the customer's job.
# Vehicle_Segment: the type of vehicle.
# Hobbies: the customer's hobby.
# On_Call_Offer: the subscription plan the customer signed up for (prediction target).

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10733 entries, 0 to 10732
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Customer ID      10733 non-null  int64 
 1   Age              10733 non-null  int64 
 2   Income_Bucket    10733 non-null  int64 
 3   Gender           10733 non-null  object
 4   State            10715 non-null  object
 5   Region           10733 non-null  object
 6   Marital_Status   10733 non-null  int64 
 7   No_of_children   10733 non-null  int64 
 8   Occupation       10733 non-null  object
 9   Vehicle_Segment  10733 non-null  object
 10  No_of_months     10733 non-null  int64 
 11  Hobbies          10733 non-null  object
 12  On_Call_Offer    10733 non-null  object
dtypes: int64(6), object(7)
memory usage: 1.1+ MB


In [23]:
def oneHotEncoding(df, column):
    """Replace one column of ``df`` with its one-hot (dummy) encoding.

    The indicator columns are named ``<column>_<value>`` and appended after
    the existing columns; the original column is removed. The input frame
    is not modified.

    Args:
        df: Source DataFrame containing ``column``.
        column: Name of the column to encode.

    Returns:
        A new DataFrame with ``column`` replaced by its indicator columns.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    # Pin the dummy dtype: the pandas default changed from uint8 to bool, and
    # bool indicators mixed with integer columns turn ``df.values`` into an
    # object array, which cannot be fed to a numeric model.
    dummy = pd.get_dummies(df[column], prefix=column, dtype="uint8")
    encoded = pd.concat([df, dummy], axis=1)
    return encoded.drop(columns=column)

In [24]:
# Normalize inconsistent Gender labels by stripping the stray '!' and 'A'
# characters (the '!M' and 'MA' variants noted during CSV inspection become 'M').
# regex=False makes both replacements literal, independent of the pandas
# version's default for the `regex` argument.
df['Gender'] = (
    df['Gender']
    .str.replace('!', '', regex=False)
    .str.replace('A', '', regex=False)
)


In [25]:
# Fix malformed Region labels by stripping the stray '%' and '#' characters.
# regex=False makes both replacements literal, independent of the pandas
# version's default for the `regex` argument.
df['Region'] = (
    df['Region']
    .str.replace('%', '', regex=False)
    .str.replace('#', '', regex=False)
)

In [26]:
# Remove the columns that are not used as model features.
df = df.drop(columns=['Customer ID', 'State', 'Hobbies'])

In [27]:
# One-hot encode every categorical feature column, one column at a time.
categories = [
    'Income_Bucket',
    'Gender',
    'Region',
    'Marital_Status',
    'No_of_children',
    'Occupation',
    'Vehicle_Segment',
]
for col in categories:
    df = oneHotEncoding(df, col)

In [28]:
# Summary statistics for the numeric columns after one-hot encoding.
df.describe()

Unnamed: 0,Age,No_of_months,Income_Bucket_1,Income_Bucket_2,Income_Bucket_3,Gender_F,Gender_M,Region_Rural,Region_Sub-urban,Region_Urban,...,Occupation_Research Analyst,Occupation_Sales Executive,Occupation_Web App Developer,Occupation_White Collar Clerical,Occupation_Wildlife Photographer,Occupation_Writer,Vehicle_Segment_Car,Vehicle_Segment_Compact SUV,Vehicle_Segment_Pick up truck,Vehicle_Segment_SUV
count,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0,...,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0,10733.0
mean,46.505357,32.915028,0.336439,0.336066,0.327495,0.303084,0.696916,0.091121,0.610826,0.298053,...,0.155315,0.099786,0.045001,0.254076,0.008385,0.015373,0.282027,0.226311,0.292556,0.199106
std,16.785471,15.756305,0.472513,0.472384,0.469321,0.459613,0.459613,0.287794,0.487586,0.457424,...,0.362222,0.299728,0.207317,0.435361,0.091191,0.123037,0.450008,0.418462,0.454957,0.399346
min,18.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,47.0,33.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,61.0,47.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
max,75.0,60.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Feature matrix: every column except the target. Target: the raw plan labels.
Y_obj = df['On_Call_Offer'].to_numpy()
X = df.drop(columns=['On_Call_Offer']).to_numpy()

In [30]:
# Map the string plan labels to integer class ids; `e` keeps the fitted mapping.
e = LabelEncoder()
Y = e.fit_transform(Y_obj)

In [31]:
# Convert the integer class ids into one-hot vectors for the softmax output.
Y_encoded = np_utils.to_categorical(Y)

In [32]:
# Split into training and test sets (30% held out for testing).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y_encoded,
    test_size=0.3,
    random_state=seed,
)

In [33]:
# Single-hidden-layer classifier: 16 ReLU units followed by a softmax output.
# Layer sizes are derived from the training arrays instead of being hard-coded,
# so the model stays in sync with the number of feature columns produced by the
# one-hot encoding and with the number of one-hot target classes.
n_features = X_train.shape[1]
n_classes = Y_train.shape[1]

model = Sequential()
model.add(Dense(16, input_dim=n_features, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

# Multi-class setup: categorical cross-entropy over one-hot targets.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [34]:
# Train for 20 epochs with batches of 32; 20% of the training split is held out for validation.
model.fit(X_train, Y_train, epochs = 20, batch_size=32,validation_split=0.2)

Train on 6010 samples, validate on 1503 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x1a6f7d5e348>

In [35]:
# evaluate() returns [loss, accuracy] for this model; report the test accuracy.
test_loss, test_accuracy = model.evaluate(X_test, Y_test)
print("\n Accuracy: %.4f" % test_accuracy)


 Accuracy: 0.9988
