In [84]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')


# LightGBM
import lightgbm as lgb

In [85]:
path = "./data/"


In [103]:
df = pd.read_csv(path + 'train.csv')
X_test = pd.read_csv(path + 'test.csv')
print('訓練データのデータ数は{}、変数は{}種類です。'.format(df.shape[0], df.shape[1]))
print('テストデータのデータ数は{}、変数は{}種類です'.format(X_test.shape[0], X_test.shape[1]))

訓練データのデータ数は1280、変数は12種類です。
テストデータのデータ数は319、変数は11種類です


In [104]:
df.isnull().sum() 

fixed acidity             0
volatile acidity          0
citric acid               0
residual sugar            0
chlorides                 0
free sulfur dioxide       0
total sulfur dioxide      0
density                 122
pH                      137
sulphates                 0
alcohol                   0
quality                   0
dtype: int64

In [105]:
X_test.isnull().sum()

fixed acidity            0
volatile acidity         0
citric acid              0
residual sugar           0
chlorides                0
free sulfur dioxide      0
total sulfur dioxide     0
density                 25
pH                      24
sulphates                0
alcohol                  0
dtype: int64

In [107]:
#fill nan  mean
den = pd.concat([df['density'], X_test['density']])
ph = pd.concat([df['pH'], X_test['pH']])

df['density'].fillna(den.mean(), inplace=True)
X_test['density'].fillna(den.mean(), inplace=True)

df['pH'].fillna(ph.mean(), inplace=True)
X_test['pH'].fillna(ph.mean(), inplace=True)

df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [108]:
df['quality'] = df['quality'].astype(int)

In [109]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,9.8,0.63,0.24,2.4,0.078,6.1,32.6,0.9997,3.08,0.57,9.4,5
1,6.1,0.34,0.25,1.8,0.084,4.0,28.0,0.9941,3.36,0.44,10.2,4
2,7.1,0.43,0.17,1.8,0.083,27.2,51.3,0.9941,3.51,0.63,10.4,5
3,8.6,0.47,0.27,1.9,0.058,17.5,37.6,0.9907,3.16,0.85,11.1,6
4,6.9,0.41,0.30,8.9,0.084,25.9,45.4,0.9861,3.35,0.64,10.2,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1275,7.6,0.54,0.21,2.2,0.071,7.1,28.1,1.0043,3.29,0.54,9.9,4
1276,7.1,0.65,0.00,3.9,0.086,17.0,44.7,1.0066,3.40,0.54,9.7,4
1277,9.4,0.34,0.37,2.2,0.074,5.1,12.9,0.9936,3.23,0.62,9.3,5
1278,9.6,0.73,0.10,2.1,0.080,6.0,30.6,1.0017,3.22,0.56,10.1,5


In [110]:
x = df.drop(columns ='quality')
y = df['quality']



In [111]:
X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.3, random_state=42)



In [112]:
#ランダムフォレスト
train_data = lgb.Dataset(X_train, label=y_train)
eval_data = lgb.Dataset(X_valid, label=y_valid, reference= train_data)

In [128]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',

    
}

In [129]:
gbm = lgb.train(
params,
train_data,
valid_sets=eval_data,
num_boost_round=100,

)


[1]	valid_0's l2: 0.955993
[2]	valid_0's l2: 0.918476
[3]	valid_0's l2: 0.8886
[4]	valid_0's l2: 0.864693
[5]	valid_0's l2: 0.841784
[6]	valid_0's l2: 0.821514
[7]	valid_0's l2: 0.811737
[8]	valid_0's l2: 0.800324
[9]	valid_0's l2: 0.788784
[10]	valid_0's l2: 0.779809
[11]	valid_0's l2: 0.771094
[12]	valid_0's l2: 0.768012
[13]	valid_0's l2: 0.764462
[14]	valid_0's l2: 0.763617
[15]	valid_0's l2: 0.759421
[16]	valid_0's l2: 0.758626
[17]	valid_0's l2: 0.759845
[18]	valid_0's l2: 0.759637
[19]	valid_0's l2: 0.758106
[20]	valid_0's l2: 0.758931
[21]	valid_0's l2: 0.758407
[22]	valid_0's l2: 0.760553
[23]	valid_0's l2: 0.760383
[24]	valid_0's l2: 0.762213
[25]	valid_0's l2: 0.764654
[26]	valid_0's l2: 0.766319
[27]	valid_0's l2: 0.770207
[28]	valid_0's l2: 0.770242
[29]	valid_0's l2: 0.772717
[30]	valid_0's l2: 0.775528
[31]	valid_0's l2: 0.779241
[32]	valid_0's l2: 0.777905
[33]	valid_0's l2: 0.779332
[34]	valid_0's l2: 0.781521
[35]	valid_0's l2: 0.783864
[36]	valid_0's l2: 0.784192
[37

In [130]:
preds = gbm.predict(X_valid)

In [131]:
mean_squared_error(y_valid, preds)

0.8270471386754616

In [118]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_predict = lr.predict(X_valid)
mean_squared_error(y_valid, lr_predict)

0.828125

In [119]:
lr_predict = lr.predict(X_test)


In [120]:
lr_predict

array([5, 5, 5, 6, 5, 5, 5, 5, 6, 5, 6, 5, 6, 6, 6, 5, 5, 6, 5, 6, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 6, 6, 5,
       5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 5, 6,
       5, 5, 5, 5, 5, 5, 5, 6, 5, 6, 6, 5, 5, 5, 6, 5, 5, 4, 5, 5, 4, 6,
       6, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 6, 5, 5, 5, 6, 5, 6, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 6, 6, 5,
       5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 6, 5, 5, 4,
       5, 5, 6, 6, 6, 5, 6, 6, 5, 5, 5, 5, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5,
       6, 5, 5, 5, 5, 6, 5, 6, 6, 5, 5, 5, 5, 5, 5, 6, 5, 6, 6, 5, 5, 5,
       5, 5, 5, 5, 5, 6, 5, 4, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6,
       5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 6, 5, 4, 5, 6, 6, 5,
       6, 5, 5, 6, 5, 5, 5, 4, 5, 5, 4, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5,
       5, 5, 6, 5, 5, 6, 6, 6, 5, 6, 5, 5, 5, 5, 5, 5, 6, 6, 6, 5, 6, 4,
       5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5,

In [121]:
submission = pd.read_csv(path + 'submission.csv')
submission

Unnamed: 0,quality
0,5.0
1,5.0
2,5.0
3,5.0
4,5.0
...,...
314,5.0
315,5.0
316,5.0
317,5.0


In [122]:
submission['quality'] = lr_predict
submission

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5
...,...
314,6
315,4
316,5
317,5


In [123]:
submission.to_csv('./data/submission.csv', index=False)