### Modelling
Проведем минимальную работу по конструированию признаков, затем масштабируем полученные результаты и построим пару моделей классификации.

In [43]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

In [44]:
# Load the wine-quality table produced by the EDA step; the file is
# comma-separated and is fetched straight from the project repository.
df = pd.read_csv(
    'https://raw.githubusercontent.com/MelnikDM/Netology/main/CRISP_DM/WIne_Quality/data/processed/Wine_qual_EDA.csv',
    sep=",",
)
# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5907 entries, 0 to 5906
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            5907 non-null   int64  
 1   type                  5907 non-null   object 
 2   fixed acidity         5907 non-null   float64
 3   volatile acidity      5907 non-null   float64
 4   citric acid           5907 non-null   float64
 5   residual sugar        5907 non-null   float64
 6   chlorides             5907 non-null   float64
 7   free sulfur dioxide   5907 non-null   float64
 8   total sulfur dioxide  5907 non-null   float64
 9   density               5907 non-null   float64
 10  pH                    5907 non-null   float64
 11  sulphates             5907 non-null   float64
 12  alcohol               5907 non-null   float64
 13  quality               5907 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 646.2+ KB


In [45]:
# 'Unnamed: 0' is not used as a feature, so remove that column.
df = df.drop(columns=['Unnamed: 0'])

In [46]:
# Frequency of each quality score, shown as a one-column table.
df['quality'].value_counts().to_frame()

Unnamed: 0,quality
6,2586
5,1924
7,1015
8,181
4,181
3,15
9,5


Мы будем решать задачу классификации по оценке качества вина. Для этого нам необходимо разделить "качество" на 2 группы: вина, которые получили оценку меньше 6, мы условно обозначим 0, а остальные - 1.

In [47]:
def quality_class(x):
    """Return the binary quality label for a single wine record.

    Args:
        x: A mapping-like record (e.g. a DataFrame row) that exposes the
            wine's score under the ``'quality'`` key.

    Returns:
        ``1`` when the score is 6 or higher, otherwise ``0``.
    """
    return 1 if x['quality'] >= 6 else 0

# Binary target: 1 for wines scored 6 or higher, 0 otherwise. The comparison
# is done on the whole 'quality' column at once instead of a row-wise
# df.apply(..., axis=1), which builds a Series per row just to compare one
# scalar; the resulting 0/1 int64 labels are the same.
df['quality_rate'] = (df['quality'] >= 6).astype('int64')

In [48]:
# Class balance of the binary target, shown as a one-column table.
df['quality_rate'].value_counts().to_frame()

Unnamed: 0,quality_rate
1,3787
0,2120


Теперь нормализуем наши независимые переменные.

In [49]:
# Feature columns only: the raw score, the wine type and the derived binary
# target are excluded from scaling.
columns_to_normalize = df.drop(['quality', 'type', 'quality_rate'], axis=1)

min_max_scaler = preprocessing.MinMaxScaler()

# MinMaxScaler rescales every column independently, so one fit over all
# feature columns yields the same values as refitting per column in a loop.
# Fitting once also leaves `min_max_scaler` holding the parameters of every
# feature (not just the last one), so it can later transform or invert the
# whole feature matrix.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test
# split follows, consider fitting on the training part only to avoid leakage.
df[list(columns_to_normalize.columns)] = min_max_scaler.fit_transform(columns_to_normalize)

df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_rate
0,white,0.333333,0.293333,0.465753,0.05291,0.27972,0.158537,0.463235,0.444803,0.597938,0.375,0.247934,6,1
1,white,0.583333,0.266667,0.547945,0.333333,0.286713,0.353659,0.334559,0.515817,0.556701,0.305556,0.347107,6,1
2,white,0.458333,0.2,0.438356,0.417989,0.342657,0.560976,0.661765,0.548096,0.484536,0.25,0.31405,6,1
3,white,0.458333,0.2,0.438356,0.417989,0.342657,0.560976,0.661765,0.548096,0.484536,0.25,0.31405,6,1
4,white,0.583333,0.266667,0.547945,0.333333,0.286713,0.353659,0.334559,0.515817,0.556701,0.305556,0.347107,6,1
