# 1. 필요한 라이브러리를 가져옵니다.

In [6]:
import pandas as pd # 분석용 파이썬 라이브러리 패키지
import lightgbm as lgbm

import numpy as np # 계산용 파이썬 라이브러리 패키지
import matplotlib as mpl
import matplotlib.pyplot as plt # 시각화 파이썬 라이브러리 패키지
import seaborn as sns # 시각화 파이썬 라이브러리 패키지 as는 seaborn을 sns로 쓰겠다는 말

In [1]:
# Colab only: mount Google Drive at /content/drive so that files stored in
# Drive (the competition archive used below) are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the competition archive from the mounted Drive into the current
# working directory.
!cp /content/drive/MyDrive/DACON/와인품질분류.zip ./

In [4]:
# Remove any previous extraction and recreate an empty wine/ directory, so
# re-running this cell never mixes old and new files; then unpack the archive
# into it.
!rm -rf wine
!mkdir wine
!unzip 와인품질분류.zip -d wine/

Archive:  와인품질분류.zip
  inflating: wine/data/sample_submission.csv  
  inflating: wine/data/test.csv      
  inflating: wine/data/train.csv     


# 2. 데이터를 불러옵니다.

In [8]:
# Load the three competition tables (training data, test data and the
# submission template) with the same reader settings for each file.
train, test, smpl_sub = (
    pd.read_csv(csv_path, encoding="euc-kr")
    for csv_path in (
        "/content/wine/data/train.csv",
        "/content/wine/data/test.csv",
        "/content/wine/data/sample_submission.csv",
    )
)

In [9]:
# Preview the first rows of the training table to check which columns were
# loaded (the target column is `quality`).
train.head()

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,white
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,red
2,2,5,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,white
3,3,6,7.0,0.21,0.31,6.0,0.046,29.0,108.0,0.9939,3.26,0.5,10.8,white
4,4,6,7.8,0.4,0.26,9.5,0.059,32.0,178.0,0.9955,3.04,0.43,10.9,white


In [10]:
# Preview the first rows of the test table; it has the same columns as the
# training table except for `quality`.
test.head()

Unnamed: 0,index,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,9.0,0.31,0.48,6.6,0.043,11.0,73.0,0.9938,2.9,0.38,11.6,white
1,1,13.3,0.43,0.58,1.9,0.07,15.0,40.0,1.0004,3.06,0.49,9.0,red
2,2,6.5,0.28,0.27,5.2,0.04,44.0,179.0,0.9948,3.19,0.69,9.4,white
3,3,7.2,0.15,0.39,1.8,0.043,21.0,159.0,0.9948,3.52,0.47,10.0,white
4,4,6.8,0.26,0.26,2.0,0.019,23.5,72.0,0.99041,3.16,0.47,11.8,white


# 3. 변수를 변환합니다.

In [11]:
# `type` holds two wine colours, white and red.
# Encode them as integers (white -> 0, red -> 1) in both tables, using a
# single shared mapping so train and test are always encoded the same way.
type_codes = {'white': 0, 'red': 1}
for frame in (train, test):
    frame['type'] = frame['type'].map(type_codes).astype(int)

In [12]:
# Check that `type` in the training table is now 0/1 instead of text.
train.head()

Unnamed: 0,index,quality,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,5,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,0
1,1,5,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,1
2,2,5,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,0
3,3,6,7.0,0.21,0.31,6.0,0.046,29.0,108.0,0.9939,3.26,0.5,10.8,0
4,4,6,7.8,0.4,0.26,9.5,0.059,32.0,178.0,0.9955,3.04,0.43,10.9,0


In [13]:
# Check that `type` in the test table is now 0/1 instead of text.
test.head()

Unnamed: 0,index,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,0,9.0,0.31,0.48,6.6,0.043,11.0,73.0,0.9938,2.9,0.38,11.6,0
1,1,13.3,0.43,0.58,1.9,0.07,15.0,40.0,1.0004,3.06,0.49,9.0,1
2,2,6.5,0.28,0.27,5.2,0.04,44.0,179.0,0.9948,3.19,0.69,9.4,0
3,3,7.2,0.15,0.39,1.8,0.043,21.0,159.0,0.9948,3.52,0.47,10.0,0
4,4,6.8,0.26,0.26,2.0,0.019,23.5,72.0,0.99041,3.16,0.47,11.8,0


In [14]:
# Shape the data for the model.
# The target is the `quality` column; the features are every remaining
# column except the row identifier `index`, which carries no information
# about the wine itself. The test table has no `quality`, so only `index`
# is dropped there.
train_y = train['quality']
train_x = train.drop(columns=['index', 'quality'])
test_x = test.drop(columns='index')

In [15]:
# Sanity check: train_x and train_y must have the same number of rows, and
# train_x and test_x must have the same number of feature columns.
train_x.shape, train_y.shape, test_x.shape

((5497, 12), (5497,), (1000, 12))

In [16]:
# Confirm that `index` and `quality` are gone from the training features.
train_x.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,5.6,0.695,0.06,6.8,0.042,9.0,84.0,0.99432,3.44,0.44,10.2,0
1,8.8,0.61,0.14,2.4,0.067,10.0,42.0,0.9969,3.19,0.59,9.5,1
2,7.9,0.21,0.39,2.0,0.057,21.0,138.0,0.99176,3.05,0.52,10.9,0
3,7.0,0.21,0.31,6.0,0.046,29.0,108.0,0.9939,3.26,0.5,10.8,0
4,7.8,0.4,0.26,9.5,0.059,32.0,178.0,0.9955,3.04,0.43,10.9,0


In [17]:
# Preview the target values (integer quality scores).
train_y.head()

0    5
1    5
2    5
3    6
4    6
Name: quality, dtype: int64

In [18]:
# Confirm that the test features have the same columns as train_x.
test_x.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,9.0,0.31,0.48,6.6,0.043,11.0,73.0,0.9938,2.9,0.38,11.6,0
1,13.3,0.43,0.58,1.9,0.07,15.0,40.0,1.0004,3.06,0.49,9.0,1
2,6.5,0.28,0.27,5.2,0.04,44.0,179.0,0.9948,3.19,0.69,9.4,0
3,7.2,0.15,0.39,1.8,0.043,21.0,159.0,0.9948,3.52,0.47,10.0,0
4,6.8,0.26,0.26,2.0,0.019,23.5,72.0,0.99041,3.16,0.47,11.8,0


# 4. 모델을 생성 및 훈련합니다. (여기서부터 살짝 이해안감)

In [20]:
# Create a LightGBM classifier with the library's default hyperparameters
# (no arguments are passed), then train it: `train_x` supplies the input
# features and `train_y` the quality score to learn for each row.
# A classifier is used, so each distinct quality score is treated as a
# separate class label rather than as a number on a scale.
model = lgbm.LGBMClassifier()
model.fit(train_x,train_y)

LGBMClassifier()

In [21]:
# Use the trained model to predict one quality class per row of the test
# features; the result is a NumPy array with one entry per test row.
y_pred = model.predict(test_x)

In [22]:
# Display the predicted quality labels.
y_pred

array([5, 5, 5, 5, 6, 6, 6, 6, 6, 7, 6, 5, 7, 6, 6, 5, 6, 6, 6, 5, 6, 6,
       5, 7, 6, 5, 5, 5, 5, 5, 7, 6, 5, 5, 5, 6, 5, 5, 6, 6, 6, 5, 6, 5,
       6, 6, 5, 6, 4, 6, 5, 5, 6, 4, 5, 6, 6, 5, 5, 6, 5, 6, 6, 4, 6, 6,
       5, 5, 6, 5, 5, 5, 5, 5, 6, 5, 6, 7, 6, 7, 6, 6, 5, 5, 6, 6, 5, 5,
       6, 7, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 5, 5, 6, 5, 5, 6, 6, 6, 6, 5,
       6, 6, 6, 5, 5, 6, 6, 7, 7, 6, 6, 5, 6, 6, 5, 5, 6, 6, 6, 7, 5, 5,
       6, 5, 5, 6, 6, 7, 5, 5, 5, 6, 5, 5, 5, 5, 7, 6, 8, 6, 5, 8, 6, 5,
       6, 5, 5, 6, 6, 5, 5, 6, 6, 7, 6, 5, 5, 6, 5, 5, 5, 6, 6, 5, 6, 5,
       6, 5, 6, 6, 6, 5, 7, 7, 5, 6, 6, 6, 5, 6, 6, 5, 5, 6, 6, 6, 7, 5,
       5, 5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 6, 7, 5, 5, 6, 6, 6, 5, 6, 6, 6,
       6, 5, 6, 6, 7, 6, 6, 5, 6, 5, 6, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 7, 6, 6, 7, 5, 7, 5, 6, 7, 7, 6, 6, 6, 7, 6, 6, 7, 5, 5, 7,
       5, 5, 6, 5, 6, 6, 6, 5, 5, 5, 6, 5, 7, 7, 5, 6, 6, 6, 6, 6, 7, 7,
       6, 6, 6, 6, 5, 5, 5, 6, 5, 6, 8, 5, 5, 7, 6,

# 5. 제출파일을 생성합니다.

In [23]:
# Inspect the submission template: `quality` is a placeholder (all 0) that
# will be overwritten with the predictions.
smpl_sub

Unnamed: 0,index,quality
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
995,995,0
996,996,0
997,997,0
998,998,0


In [24]:
# Put the predictions into the `quality` column of the submission template.
# y_pred is a plain array, so values are assigned by position: this relies on
# smpl_sub having its rows in the same order as test_x
# (NOTE(review): presumably true for the provided files -- confirm).
smpl_sub['quality'] = y_pred

In [25]:
# Confirm that `quality` now holds the predicted labels instead of 0.
smpl_sub

Unnamed: 0,index,quality
0,0,5
1,1,5
2,2,5
3,3,5
4,4,6
...,...,...
995,995,5
996,996,6
997,997,5
998,998,6


In [26]:
# Save the submission as a CSV file. index=False keeps the DataFrame's own
# row labels out of the file, so only the `index` and `quality` columns are
# written.
smpl_sub.to_csv('submission.csv', index=False)