In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# 데이터 로딩

In [35]:
data = pd.read_csv('https://raw.githubusercontent.com/haram4th/ablearn/main/galaxy.csv')
data.head()

Unnamed: 0,BuyItNow,startprice,carrier,color,productline,noDescription,charCountDescription,upperCaseDescription,sold
0,0,199.99,,White,Galaxy_S9,contains description,0,0,1
1,0,235.0,,,Galaxy_Note9,contains description,0,0,0
2,0,199.99,,,Unknown,no description,100,2,0
3,1,175.0,AT&T,Space Gray,Galaxy_Note9,contains description,0,0,1
4,1,100.0,,Space Gray,Galaxy_S8,contains description,0,0,1


* BuyItNow : 바로 구매 할 수 있는지의 여부(즉시 구매)
* startprice : 경매의 시작 가격
* carrier : 통신사
* color : 기기 색상
* productline : 모델명
* noDescription : 판매자가 설명을 썼는지 안썼는지
* charCountDescriptio : 설명이 얼마나 긴지
* upperCaseDescription : 몇 문장인지
* sold : 팔렸는지 안팔렸는지 (우리가 알고자하는 종속변수)

# 각 기종별 평균 startprice 보다 저렴할 경우의 판매량, 비쌀 경우의 판매량

In [36]:
priceproduct2 = data.groupby('productline')['startprice'].describe()

In [37]:
priceproduct2

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
productline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Galaxy_Note10,351.0,366.931311,198.259953,0.99,250.0,380.0,499.97,939.0
Galaxy_Note8,153.0,176.065033,122.104147,0.01,99.0,187.5,239.88,699.95
Galaxy_Note9,107.0,254.561402,126.208902,0.01,177.5,259.95,349.99,575.0
Galaxy_S10,8.0,294.58375,79.754787,160.0,246.25,301.335,341.75,420.0
Galaxy_S7,227.0,84.19,66.527227,0.01,48.745,79.0,104.99,499.0
Galaxy_S8,277.0,165.883321,123.743959,0.01,89.99,155.0,230.0,999.0
Galaxy_S9,158.0,231.268101,136.186507,0.01,145.87,249.97,300.0,695.0
Unknown,204.0,171.994265,156.092171,0.99,44.9975,149.99,259.9925,700.0


In [38]:
data1 = data.copy()
data2 = data.copy()
data3 = data.copy()

In [39]:
# 평균 가격보다 싼 것
data[(data['productline'] == 'Galaxy_Note10') & (data['startprice'] <366.931311)].value_counts()

BuyItNow  startprice  carrier          color       productline    noDescription         charCountDescription  upperCaseDescription  sold
0         149.00      Sprint/T-Mobile  White       Galaxy_Note10  contains description  0                     0                     0       1
          249.99      Sprint/T-Mobile  Space Gray  Galaxy_Note10  contains description  0                     0                     1       1
          270.99      AT&T             Space Gray  Galaxy_Note10  no description        97                    10                    0       1
          274.99      AT&T             Space Gray  Galaxy_Note10  no description        102                   12                    0       1
          279.99      AT&T             Space Gray  Galaxy_Note10  no description        48                    9                     0       1
          299.99      AT&T             Space Gray  Galaxy_Note10  contains description  0                     0                     0       1
          3

In [40]:
priceproduct2['mean']

productline
Galaxy_Note10    366.931311
Galaxy_Note8     176.065033
Galaxy_Note9     254.561402
Galaxy_S10       294.583750
Galaxy_S7         84.190000
Galaxy_S8        165.883321
Galaxy_S9        231.268101
Unknown          171.994265
Name: mean, dtype: float64

In [41]:
priceproduct2

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
productline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Galaxy_Note10,351.0,366.931311,198.259953,0.99,250.0,380.0,499.97,939.0
Galaxy_Note8,153.0,176.065033,122.104147,0.01,99.0,187.5,239.88,699.95
Galaxy_Note9,107.0,254.561402,126.208902,0.01,177.5,259.95,349.99,575.0
Galaxy_S10,8.0,294.58375,79.754787,160.0,246.25,301.335,341.75,420.0
Galaxy_S7,227.0,84.19,66.527227,0.01,48.745,79.0,104.99,499.0
Galaxy_S8,277.0,165.883321,123.743959,0.01,89.99,155.0,230.0,999.0
Galaxy_S9,158.0,231.268101,136.186507,0.01,145.87,249.97,300.0,695.0
Unknown,204.0,171.994265,156.092171,0.99,44.9975,149.99,259.9925,700.0


* data3에 pricebyproduct2에 있는 기종별 평균 가격 추가 / [meanprice] => 평균가격 <br>
* startprice 컬럼과 비교해서 isUnderMean 컬럼 추가 => 평균보다 저렴하면 1, 비싸면 0

방법 1 함수 한땀한땀 만들기

In [42]:
def meanPrice(product):
    if product == 'Galaxy_Note10':
        return 366.931311
    elif product == 'Galaxy_Note8':
        return 176.065033
    elif product == 'Galaxy_Note9':
        return 254.561402
    elif product == 'Galaxy_S10':
        return 294.583750
    elif product == 'Galaxy_S7':
        return 84.190000
    elif product == 'Galaxy_S8':
        return 165.883321
    elif product == 'Galaxy_S9':
        return 231.268101
    elif product == 'Unknown':
        return 171.994265

In [43]:
data['meanprice'] = data['productline'].apply(meanPrice)

In [44]:
data.head()

Unnamed: 0,BuyItNow,startprice,carrier,color,productline,noDescription,charCountDescription,upperCaseDescription,sold,meanprice
0,0,199.99,,White,Galaxy_S9,contains description,0,0,1,231.268101
1,0,235.0,,,Galaxy_Note9,contains description,0,0,0,254.561402
2,0,199.99,,,Unknown,no description,100,2,0,171.994265
3,1,175.0,AT&T,Space Gray,Galaxy_Note9,contains description,0,0,1,254.561402
4,1,100.0,,Space Gray,Galaxy_S8,contains description,0,0,1,165.883321


In [45]:
priceproduct2

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
productline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Galaxy_Note10,351.0,366.931311,198.259953,0.99,250.0,380.0,499.97,939.0
Galaxy_Note8,153.0,176.065033,122.104147,0.01,99.0,187.5,239.88,699.95
Galaxy_Note9,107.0,254.561402,126.208902,0.01,177.5,259.95,349.99,575.0
Galaxy_S10,8.0,294.58375,79.754787,160.0,246.25,301.335,341.75,420.0
Galaxy_S7,227.0,84.19,66.527227,0.01,48.745,79.0,104.99,499.0
Galaxy_S8,277.0,165.883321,123.743959,0.01,89.99,155.0,230.0,999.0
Galaxy_S9,158.0,231.268101,136.186507,0.01,145.87,249.97,300.0,695.0
Unknown,204.0,171.994265,156.092171,0.99,44.9975,149.99,259.9925,700.0


방법2 loc로 priceproduct2에서 추출해서 입력하기

In [46]:
priceproduct2.loc['Galaxy_Note10', 'mean']

np.float64(366.9313105413105)

In [47]:
for product in priceproduct2.index:
    print(priceproduct2.loc[product, 'mean'])

366.9313105413105
176.06503267973858
254.56140186915889
294.58375
84.19
165.883321299639
231.2681012658228
171.99426470588236


In [48]:
data1['meanprice'] = data1['productline'].apply(lambda x: priceproduct2.loc[x, 'mean'])

In [49]:
data1

Unnamed: 0,BuyItNow,startprice,carrier,color,productline,noDescription,charCountDescription,upperCaseDescription,sold,meanprice
0,0,199.99,,White,Galaxy_S9,contains description,0,0,1,231.268101
1,0,235.00,,,Galaxy_Note9,contains description,0,0,0,254.561402
2,0,199.99,,,Unknown,no description,100,2,0,171.994265
3,1,175.00,AT&T,Space Gray,Galaxy_Note9,contains description,0,0,1,254.561402
4,1,100.00,,Space Gray,Galaxy_S8,contains description,0,0,1,165.883321
...,...,...,...,...,...,...,...,...,...,...
1480,0,89.50,AT&T,,Galaxy_S7,no description,96,2,0,84.190000
1481,0,239.95,,Midnight Black,Galaxy_S9,no description,97,5,1,231.268101
1482,0,329.99,,Space Gray,Galaxy_Note10,no description,93,1,0,366.931311
1483,0,89.00,,Midnight Black,Galaxy_S7,no description,92,2,1,84.190000


startprice 컬럼과 비교해서 isUnderMean 컬럼 추가 => 평균보다 저렴하면 1, 비싸면 0

In [50]:
data[data['startprice'] < data['meanprice']]

Unnamed: 0,BuyItNow,startprice,carrier,color,productline,noDescription,charCountDescription,upperCaseDescription,sold,meanprice
0,0,199.99,,White,Galaxy_S9,contains description,0,0,1,231.268101
1,0,235.00,,,Galaxy_Note9,contains description,0,0,0,254.561402
3,1,175.00,AT&T,Space Gray,Galaxy_Note9,contains description,0,0,1,254.561402
4,1,100.00,,Space Gray,Galaxy_S8,contains description,0,0,1,165.883321
5,1,0.99,,White,Galaxy_S7,contains description,0,0,1,84.190000
...,...,...,...,...,...,...,...,...,...,...
1468,1,50.00,,Midnight Black,Galaxy_S7,no description,100,12,1,84.190000
1469,0,249.99,Sprint/T-Mobile,Space Gray,Galaxy_Note10,contains description,0,0,1,366.931311
1472,0,36.95,,,Galaxy_S7,no description,70,10,0,84.190000
1475,1,119.99,,Midnight Black,Galaxy_S8,contains description,0,0,1,165.883321


반복문은 -> apply로 할 수 있음

In [51]:
for start, mean in zip(data['startprice'], data['meanprice']): #apply
    if start < mean:
        print(1)
    else:
        print(0)

# apply(lambda x: 1 if x['startprice'] < x['meanprice'] else 0)

1
1
0
1
1
1
1
1
0
1
1
1
1
0
1
0
0
0
0
1
1
0
0
0
1
0
0
1
1
0
0
0
1
1
0
0
0
0
0
0
0
1
1
0
0
0
1
0
1
1
0
0
0
1
0
0
0
0
1
0
1
1
1
1
1
1
0
0
1
1
1
0
1
0
0
0
1
0
1
1
0
0
1
1
0
1
0
0
1
1
0
0
0
1
1
0
0
0
1
1
0
0
1
1
1
0
0
0
1
0
1
1
0
0
1
1
1
0
0
1
0
1
0
0
1
1
0
1
1
1
0
1
0
1
1
0
1
1
0
0
1
0
0
0
1
1
1
1
0
1
1
1
0
1
1
1
0
1
1
0
1
0
1
0
1
1
1
1
1
1
0
0
0
1
1
1
1
0
1
0
1
1
0
0
0
1
1
1
1
1
1
0
0
1
0
1
1
1
1
1
0
1
1
0
1
0
1
0
0
1
1
1
1
1
0
1
1
0
1
1
1
0
1
1
1
1
1
0
0
1
1
1
1
1
1
0
1
0
1
1
1
1
1
1
1
1
1
1
0
0
0
1
0
1
1
1
1
1
1
1
1
0
1
1
0
1
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
1
1
0
1
1
1
0
0
1
0
1
1
0
0
1
0
1
0
0
0
1
1
1
1
1
1
1
1
1
0
1
1
1
1
1
0
1
1
1
1
0
1
1
1
1
1
1
1
0
0
1
0
1
0
0
0
1
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
0
1
1
0
1
0
1
0
0
0
1
1
0
1
0
0
0
1
0
1
1
1
1
0
1
0
1
1
1
0
1
1
0
1
1
1
1
1
1
1
1
1
1
1
0
1
1
1
0
0
1
0
1
0
1
1
1
1
1
0
0
1
1
0
1
1
0
1
0
0
1
1
0
1
1
1
0
1
1
1
0
0
0
0
1
1
1
1
1
1
0
0
1
1
1
1
1
1
1
0
1
1
0
1
0
1
1
1
1
0
0
0
0
1
0
1
0
0
1
1
1
0
0
1
1
1
0
0
1
0
1
0
1
1
1
0
0
0
1
1
1
1


In [52]:
data['isUnderMean'] = data.apply(lambda x: 1 if x['startprice'] < x['meanprice'] else 0, axis = 1)

In [53]:
data

Unnamed: 0,BuyItNow,startprice,carrier,color,productline,noDescription,charCountDescription,upperCaseDescription,sold,meanprice,isUnderMean
0,0,199.99,,White,Galaxy_S9,contains description,0,0,1,231.268101,1
1,0,235.00,,,Galaxy_Note9,contains description,0,0,0,254.561402,1
2,0,199.99,,,Unknown,no description,100,2,0,171.994265,0
3,1,175.00,AT&T,Space Gray,Galaxy_Note9,contains description,0,0,1,254.561402,1
4,1,100.00,,Space Gray,Galaxy_S8,contains description,0,0,1,165.883321,1
...,...,...,...,...,...,...,...,...,...,...,...
1480,0,89.50,AT&T,,Galaxy_S7,no description,96,2,0,84.190000,0
1481,0,239.95,,Midnight Black,Galaxy_S9,no description,97,5,1,231.268101,0
1482,0,329.99,,Space Gray,Galaxy_Note10,no description,93,1,0,366.931311,1
1483,0,89.00,,Midnight Black,Galaxy_S7,no description,92,2,1,84.190000,0


In [54]:
data = data[['BuyItNow', 'startprice', 'color', 'productline', 'noDescription', 'sold', 'isUnderMean']]

In [55]:
data = pd.get_dummies(data)

In [56]:
data

Unnamed: 0,BuyItNow,startprice,sold,isUnderMean,color_Aura Black,color_Black,color_Gold,color_Midnight Black,color_Prism Black,color_Space Gray,...,productline_Galaxy_Note10,productline_Galaxy_Note8,productline_Galaxy_Note9,productline_Galaxy_S10,productline_Galaxy_S7,productline_Galaxy_S8,productline_Galaxy_S9,productline_Unknown,noDescription_contains description,noDescription_no description
0,0,199.99,1,1,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
1,0,235.00,0,1,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,False
2,0,199.99,0,0,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
3,1,175.00,1,1,False,False,False,False,False,True,...,False,False,True,False,False,False,False,False,True,False
4,1,100.00,1,1,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1480,0,89.50,0,0,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
1481,0,239.95,1,0,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
1482,0,329.99,0,1,False,False,False,False,False,True,...,True,False,False,False,False,False,False,False,False,True
1483,0,89.00,1,0,False,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,True


In [57]:
X3 = data.drop('sold', axis = 1)
y3 = data['sold']

In [60]:
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = 0.2, random_state = 7)

In [62]:
for i in range(1,30):
    dtc3 = DecisionTreeClassifier(max_depth = 3, random_state=7)
    dtc3.fit(X3_train, y3_train)
    pred3 = dtc3.predict(X3_test)
    print(i, accuracy_score(y3_test, pred3))
    print(i, classification_report(y3_test, pred3))

1 0.7946127946127947
1               precision    recall  f1-score   support

           0       0.78      0.85      0.81       158
           1       0.81      0.73      0.77       139

    accuracy                           0.79       297
   macro avg       0.80      0.79      0.79       297
weighted avg       0.80      0.79      0.79       297

2 0.7946127946127947
2               precision    recall  f1-score   support

           0       0.78      0.85      0.81       158
           1       0.81      0.73      0.77       139

    accuracy                           0.79       297
   macro avg       0.80      0.79      0.79       297
weighted avg       0.80      0.79      0.79       297

3 0.7946127946127947
3               precision    recall  f1-score   support

           0       0.78      0.85      0.81       158
           1       0.81      0.73      0.77       139

    accuracy                           0.79       297
   macro avg       0.80      0.79      0.79       297
weigh