In [55]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy
import tensorflow as tf
import matplotlib.pyplot as plt 
import seaborn as sns 

# seed 값 설정
seed=0
numpy.random.seed(3)
tf.random.set_seed(3)

In [56]:
# 데이터 불러오기
df = pd.read_csv('data/train.csv')
df.head(10)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
0,1,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
1,2,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
2,3,CA-2017-138688,12/06/2017,16/06/2017,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
3,4,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
4,5,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368
5,6,CA-2015-115812,09/06/2015,14/06/2015,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,FUR-FU-10001487,Furniture,Furnishings,Eldon Expressions Wood and Plastic Desk Access...,48.86
6,7,CA-2015-115812,09/06/2015,14/06/2015,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,OFF-AR-10002833,Office Supplies,Art,Newell 322,7.28
7,8,CA-2015-115812,09/06/2015,14/06/2015,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,TEC-PH-10002275,Technology,Phones,Mitel 5320 IP Phone VoIP phone,907.152
8,9,CA-2015-115812,09/06/2015,14/06/2015,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,OFF-BI-10003910,Office Supplies,Binders,DXL Angle-View Binders with Locking Rings by S...,18.504
9,10,CA-2015-115812,09/06/2015,14/06/2015,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,OFF-AP-10002892,Office Supplies,Appliances,Belkin F5C206VTEL 6 Outlet Surge,114.9


In [57]:
df.info()
# object 타입이 많아 엑셀을 통해 csv 파일을 직접 확인했음.
# object 타입을 범주형으로 변환해야할 타입이 많음, 날짜 데이터는 년, 월, 일을 분리하여 새로운 행으로 생성
# Order ID, Row ID, Customer ID, customer ID, Postal Code, Product ID는 고객과 주문을 식별하기 위한 컬럼이므로 판매량 예측과는 상관 없음.
# Country는 United States로 모두 같기 때문에 판매량 예측과 상관 없음.
# Product Name은 범주형 데이터가 아니기 때문에 필요시 fastText 사용을 고려

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9800 non-null   int64  
 1   Order ID       9800 non-null   object 
 2   Order Date     9800 non-null   object 
 3   Ship Date      9800 non-null   object 
 4   Ship Mode      9800 non-null   object 
 5   Customer ID    9800 non-null   object 
 6   Customer Name  9800 non-null   object 
 7   Segment        9800 non-null   object 
 8   Country        9800 non-null   object 
 9   City           9800 non-null   object 
 10  State          9800 non-null   object 
 11  Postal Code    9789 non-null   float64
 12  Region         9800 non-null   object 
 13  Product ID     9800 non-null   object 
 14  Category       9800 non-null   object 
 15  Sub-Category   9800 non-null   object 
 16  Product Name   9800 non-null   object 
 17  Sales          9800 non-null   float64
dtypes: float

In [58]:
def date(df, column):
    df[column] = pd.to_datetime(df[column])
    df[column+'Date'] = df[column].apply(lambda x : x.year)
    df[column+'Month'] = df[column].apply(lambda x : x.month)
    df[column+'Day'] = df[column].apply(lambda x : x.day)
    df = df.drop(column, axis=1)
    return df

In [59]:
def oneHotEncoding(df, column):
    dummy = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummy], axis=1)
    df = df.drop(column, axis=1)
    return df

In [60]:
#날짜형 데이터를 년, 월, 일 데이터로 변환
df = date(df, 'Order Date')
df = date(df, 'Ship Date')

In [61]:
# 범주형 데이터들을 원핫인코딩으로 변환
categories = ['Ship Mode', 'Segment', 'City', 'Postal Code', 'State', 'Region', 'Category', 'Sub-Category', 'Product ID']
for i in categories:
    df = oneHotEncoding(df, i)

In [62]:
#필요없는 데이터들을 삭제
df = df.drop(['Row ID', 'Order ID', 'Customer ID', 'Customer Name', 'Country', 'Product Name'], axis=1)

In [63]:
Y = df['Sales']
X = df.drop('Sales', axis=1)

In [64]:
#훈련셋과 테스트셋 분리
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

In [65]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=df.drop('Sales', axis=1).columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=df.drop('Sales', axis=1).columns)


In [66]:
X_train

Unnamed: 0,Order DateDate,Order DateMonth,Order DateDay,Ship DateDate,Ship DateMonth,Ship DateDay,Ship Mode_First Class,Ship Mode_Same Day,Ship Mode_Second Class,Ship Mode_Standard Class,...,Product ID_TEC-PH-10004833,Product ID_TEC-PH-10004875,Product ID_TEC-PH-10004896,Product ID_TEC-PH-10004897,Product ID_TEC-PH-10004908,Product ID_TEC-PH-10004912,Product ID_TEC-PH-10004922,Product ID_TEC-PH-10004924,Product ID_TEC-PH-10004959,Product ID_TEC-PH-10004977
0,-1.543349,0.247012,0.353700,-1.553460,0.221312,0.566159,-0.427264,-0.236716,2.020268,-1.214024,...,-0.024154,0.0,-0.020917,-0.027007,-0.027007,-0.012075,-0.024154,-0.020917,-0.017077,-0.034169
1,-0.652980,-1.799843,-1.625003,-0.664880,-1.542670,-1.595738,-0.427264,-0.236716,2.020268,-1.214024,...,-0.024154,0.0,-0.020917,-0.027007,-0.027007,-0.012075,-0.024154,-0.020917,-0.017077,-0.034169
2,1.127757,1.124236,1.095714,1.112279,1.103303,1.286791,-0.427264,-0.236716,2.020268,-1.214024,...,-0.024154,0.0,-0.020917,-0.027007,-0.027007,-0.012075,-0.024154,-0.020917,-0.017077,-0.034169
3,0.237388,0.831828,-0.882989,0.223699,0.515309,-0.274579,-0.427264,-0.236716,-0.494984,0.823707,...,-0.024154,0.0,-0.020917,-0.027007,-0.027007,-0.012075,-0.024154,-0.020917,-0.017077,-0.034169
4,-0.652980,0.247012,1.590390,-0.664880,-1.542670,-0.875106,-0.427264,-0.236716,-0.494984,0.823707,...,-0.024154,0.0,-0.020917,-0.027007,-0.027007,-0.012075,-0.024154,-0.020917,-0.017077,-0.034169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6855,0.237388,1.416644,1.095714,0.223699,1.397300,1.527002,-0.427264,-0.236716,-0.494984,0.823707,...,-0.024154,0.0,-0.020917,-0.027007,-0.027007,-0.012075,-0.024154,-0.020917,-0.017077,-0.034169
6856,1.127757,-0.922620,-0.635651,1.112279,-0.954676,-0.634895,-0.427264,4.224471,-0.494984,-1.214024,...,-0.024154,0.0,-0.020917,-0.027007,-0.027007,-0.012075,-0.024154,-0.020917,-0.017077,-0.034169
6857,1.127757,1.416644,-0.140975,1.112279,1.397300,0.325948,-0.427264,-0.236716,-0.494984,0.823707,...,-0.024154,0.0,-0.020917,-0.027007,-0.027007,-0.012075,-0.024154,-0.020917,-0.017077,-0.034169
6858,0.237388,-0.922620,-1.625003,0.223699,0.221312,-1.595738,-0.427264,-0.236716,2.020268,-1.214024,...,-0.024154,0.0,-0.020917,-0.027007,-0.027007,-0.012075,-0.024154,-0.020917,-0.017077,-0.034169


In [88]:
model = Sequential()
model.add(Dense(256, input_dim = 3102, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='linear'))

In [89]:
model.compile(loss='mean_squared_error',
              optimizer='adam')
model.fit(X_train, Y_train, epochs=5, validation_split=0.2,batch_size = 100)

Train on 5488 samples, validate on 1372 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x2c5cef2de48>

In [90]:
test_loss = model.evaluate(X_test, Y_test, verbose=0)

print("Test Loss: {:.5f}".format(test_loss))

Test Loss: 590760.86145
