In [1]:
import pandas as pd
import numpy as np

from sklearn import svm

from sklearn.model_selection import train_test_split
# 정답과 예측을 넣으면, 정확도를 산출해서 보여준다.
from sklearn.metrics import accuracy_score
# 평가지표를 한눈에 보게해주는 라이브러리
from sklearn.metrics import classification_report

In [2]:
csData = pd.read_csv("../dataset/customer.csv")
csData

Unnamed: 0,balance,stock,label
0,30000000,22500000,normal
1,280000000,48000000,diamond
2,300000000,40666666,diamond
3,54000000,28000000,normal
4,768000000,32000000,vip
...,...,...,...
19995,628000000,44666666,diamond
19996,276000000,20000000,normal
19997,652000000,41333333,diamond
19998,676000000,45333333,diamond


In [3]:
csData.balance.drop_duplicates()

0        30000000
1       280000000
2       300000000
3        54000000
4       768000000
          ...    
717     516000000
778     318000000
801     320000000
1239    508000000
5763    484000000
Name: balance, Length: 129, dtype: int64

In [4]:
csData.stock.drop_duplicates()

0       22500000
1       48000000
2       40666666
3       28000000
4       32000000
          ...   
1085    31000000
1378    39500000
1401    38500000
2494    39000000
5725    51000000
Name: stock, Length: 112, dtype: int64

In [5]:
csData.label.drop_duplicates()

0     normal
1    diamond
4        vip
Name: label, dtype: object

In [6]:
csData.label.isnull().sum()

0

In [7]:
csData.stock.isnull().sum()

0

In [8]:
csData.balance.isnull().sum()

0

In [9]:
labelDict = {"normal":0,"diamond":1,"vip":2}
csData["labelcode"] = csData.label.map(labelDict)
csData

Unnamed: 0,balance,stock,label,labelcode
0,30000000,22500000,normal,0
1,280000000,48000000,diamond,1
2,300000000,40666666,diamond,1
3,54000000,28000000,normal,0
4,768000000,32000000,vip,2
...,...,...,...,...
19995,628000000,44666666,diamond,1
19996,276000000,20000000,normal,0
19997,652000000,41333333,diamond,1
19998,676000000,45333333,diamond,1


In [10]:
csData["labelcode"].drop_duplicates()

0    0
1    1
4    2
Name: labelcode, dtype: int64

In [11]:
def minMaxNorm(indata) :
    maxValue = max(indata)
    minValue = min(indata)
    deNormValue = maxValue - minValue

    plusData = indata - minValue
    outData = indata # 0을 넣기 위함

    if deNormValue != 0 :
        outData = plusData / deNormValue
    else : 
        pass
    return outData
# 가장 큰값과 작은 값을 받아온다
# 큰 값에서 작은 값을 빼서 양수 값으로 만든다 = deNormValue
# 원래값에서 작은 수를 뺀다 = plusData
# deNormValue가 0이 아닐경우 plusData / deNormValue를 실행한다.
# 0일 경우에는 0으로 넣는다.
# 모든 값이 0~1이 된다.

In [12]:
csData["balance_norm"] = minMaxNorm(csData.balance)
csData["stock_norm"] = minMaxNorm(csData.stock)
csData

Unnamed: 0,balance,stock,label,labelcode,balance_norm,stock_norm
0,30000000,22500000,normal,0,0.000000,0.080000
1,280000000,48000000,diamond,1,0.324675,0.488000
2,300000000,40666666,diamond,1,0.350649,0.370667
3,54000000,28000000,normal,0,0.031169,0.168000
4,768000000,32000000,vip,2,0.958442,0.232000
...,...,...,...,...,...,...
19995,628000000,44666666,diamond,1,0.776623,0.434667
19996,276000000,20000000,normal,0,0.319481,0.040000
19997,652000000,41333333,diamond,1,0.807792,0.381333
19998,676000000,45333333,diamond,1,0.838961,0.445333


In [13]:
corrDf = csData.corr()
corrDf

Unnamed: 0,balance,stock,labelcode,balance_norm,stock_norm
balance,1.0,0.565942,0.883144,1.0,0.565942
stock,0.565942,1.0,0.824174,0.565942,1.0
labelcode,0.883144,0.824174,1.0,0.883144,0.824174
balance_norm,1.0,0.565942,0.883144,1.0,0.565942
stock_norm,0.565942,1.0,0.824174,0.565942,1.0


In [14]:
feature_col = (list(corrDf.loc[(abs(corrDf.labelcode)>0.5) & (corrDf.labelcode != 1)].index))
features_Col = feature_col[2:]
features_Col

['balance_norm', 'stock_norm']

In [15]:
labelCol = ["label"]

In [16]:
featuresData = csData.loc[:,features_Col]
labelData = csData.loc[:,labelCol]

In [17]:
print(featuresData.shape)
print(labelData.shape)

(20000, 2)
(20000, 1)


In [18]:
trainingData_features, \
testData_features, \
trainingData_label, \
testData_label = train_test_split(featuresData, labelData, test_size = 0.3, random_state = 1)

In [19]:
print(trainingData_features.shape)
print(trainingData_label.shape)
print(testData_features.shape)
print(testData_label.shape)

(14000, 2)
(14000, 1)
(6000, 2)
(6000, 1)


In [20]:
model_method = svm.SVC(random_state=1)
# svm = 선형분리)
# = dtModel

In [21]:
model = model_method.fit(X = trainingData_features,
                         y = trainingData_label)
# 학습된 dtModel

  y = column_or_1d(y, warn=True)


In [22]:
predict = model.predict(X = testData_features)

In [23]:
predictData = pd.DataFrame(predict, columns = ["predict"])
predictData

Unnamed: 0,predict
0,diamond
1,diamond
2,diamond
3,normal
4,normal
...,...
5995,diamond
5996,diamond
5997,diamond
5998,vip


In [24]:
testData_label.reset_index(drop=True, inplace=True) 

In [33]:
fianlData = pd.concat( [testData_label, predictData], axis=1)
fianlData

Unnamed: 0,label,predict
0,diamond,diamond
1,diamond,diamond
2,diamond,diamond
3,normal,normal
4,normal,normal
...,...,...
5995,diamond,diamond
5996,diamond,diamond
5997,diamond,diamond
5998,vip,vip


In [34]:
labelData = csData.loc[testData_label.index,:]
labelData.reset_index(drop=True, inplace=True)
labelData

Unnamed: 0,balance,stock,label,labelcode,balance_norm,stock_norm
0,30000000,22500000,normal,0,0.000000,0.080000
1,280000000,48000000,diamond,1,0.324675,0.488000
2,300000000,40666666,diamond,1,0.350649,0.370667
3,54000000,28000000,normal,0,0.031169,0.168000
4,768000000,32000000,vip,2,0.958442,0.232000
...,...,...,...,...,...,...
5995,648000000,51333333,diamond,1,0.802597,0.541333
5996,600000000,44666666,diamond,1,0.740260,0.434667
5997,712000000,25333333,diamond,1,0.885714,0.125333
5998,612000000,39333333,diamond,1,0.755844,0.349333


In [35]:
finalResult = pd.concat( [labelData, predictData], axis=1) 
finalResult

Unnamed: 0,balance,stock,label,labelcode,balance_norm,stock_norm,predict
0,30000000,22500000,normal,0,0.000000,0.080000,diamond
1,280000000,48000000,diamond,1,0.324675,0.488000,diamond
2,300000000,40666666,diamond,1,0.350649,0.370667,diamond
3,54000000,28000000,normal,0,0.031169,0.168000,normal
4,768000000,32000000,vip,2,0.958442,0.232000,normal
...,...,...,...,...,...,...,...
5995,648000000,51333333,diamond,1,0.802597,0.541333,diamond
5996,600000000,44666666,diamond,1,0.740260,0.434667,diamond
5997,712000000,25333333,diamond,1,0.885714,0.125333,diamond
5998,612000000,39333333,diamond,1,0.755844,0.349333,vip


In [36]:
ac_score = accuracy_score(finalResult.label, finalResult.predict)
cl_report = classification_report(finalResult.label, finalResult.predict)

In [37]:
print(ac_score)
# finalData를 굳이 train과 합치지 않아도 내용물은 같은데, 결과는 다르다.
# 어쩌면 합쳐놓으면, label만이 아니라 다른 요소들도 고려하는게 아닐까?
# 하지만 그안에서 값을 골라서 넣었으니 아닐것이다.

0.43766666666666665


In [38]:
print(cl_report)

              precision    recall  f1-score   support

     diamond       0.58      0.57      0.57      3517
      normal       0.29      0.30      0.30      1734
         vip       0.12      0.12      0.12       749

    accuracy                           0.44      6000
   macro avg       0.33      0.33      0.33      6000
weighted avg       0.44      0.44      0.44      6000



In [None]:
# 합치기 전에 데이터프레임에서 가져온 값

In [39]:
print(accuracy_score(fianlData.label,fianlData.predict))

0.9953333333333333


In [40]:
print(classification_report(fianlData.label,fianlData.predict))

              precision    recall  f1-score   support

     diamond       1.00      1.00      1.00      3483
      normal       0.99      0.99      0.99      1803
         vip       0.99      0.99      0.99       714

    accuracy                           1.00      6000
   macro avg       1.00      0.99      0.99      6000
weighted avg       1.00      1.00      1.00      6000

