### Подготовка данных
Проведем минимальную работу по конструированию признаков, затем скалируем, полученные результаты, и построим пару моделей классификации.

In [22]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pickle

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/MelnikDM/Netology/main/CRISP_DM/WIne_Quality/data/processed/Wine_qual_EDA.csv', sep=",")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5907 entries, 0 to 5906
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            5907 non-null   int64  
 1   type                  5907 non-null   object 
 2   fixed acidity         5907 non-null   float64
 3   volatile acidity      5907 non-null   float64
 4   citric acid           5907 non-null   float64
 5   residual sugar        5907 non-null   float64
 6   chlorides             5907 non-null   float64
 7   free sulfur dioxide   5907 non-null   float64
 8   total sulfur dioxide  5907 non-null   float64
 9   density               5907 non-null   float64
 10  pH                    5907 non-null   float64
 11  sulphates             5907 non-null   float64
 12  alcohol               5907 non-null   float64
 13  quality               5907 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 646.2+ KB


In [6]:
df = df.drop(['Unnamed: 0'], axis = 1)

In [7]:
df.quality.value_counts().to_frame()

Unnamed: 0,quality
6,2586
5,1924
7,1015
8,181
4,181
3,15
9,5


Мы будем решать задачу классификации по оценке качества вина. Для этого нам необходимо разделить "качество" на 2 группы: вина, которые получили оцеку меньше 6 мы условно обозначим 0, а остальные - 1.

In [8]:
def quality_class(x):
    if x['quality'] >= 6:
        res = 1
    else:
        res = 0
    return res

df['quality_rate'] = df.apply(quality_class, axis=1)

In [9]:
df.quality_rate.value_counts().to_frame()

Unnamed: 0,quality_rate
1,3787
0,2120


Теперь нормализуем наши независимые переменные

In [10]:
columns_to_normalize = df.drop(['quality', 'type', 'quality_rate'], axis = 1)

min_max_scaler = preprocessing.MinMaxScaler()

for col in columns_to_normalize:
   df[col] = min_max_scaler.fit_transform(df[col].values.reshape(-1, 1) )


df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_rate
0,white,0.333333,0.293333,0.465753,0.05291,0.27972,0.158537,0.463235,0.444803,0.597938,0.375,0.247934,6,1
1,white,0.583333,0.266667,0.547945,0.333333,0.286713,0.353659,0.334559,0.515817,0.556701,0.305556,0.347107,6,1
2,white,0.458333,0.2,0.438356,0.417989,0.342657,0.560976,0.661765,0.548096,0.484536,0.25,0.31405,6,1
3,white,0.458333,0.2,0.438356,0.417989,0.342657,0.560976,0.661765,0.548096,0.484536,0.25,0.31405,6,1
4,white,0.583333,0.266667,0.547945,0.333333,0.286713,0.353659,0.334559,0.515817,0.556701,0.305556,0.347107,6,1


In [11]:
import os
outname = 'Wine_qual_scaled.csv'
outdir = 'E:/Repositories/Projects/SVO/Netology/CRISP_DM/WIne_Quality/data/scaled'
if not os.path.exists(outdir):
    os.mkdir(outdir)
fullname = os.path.join(outdir, outname)
df.to_csv(fullname)

### Построение модели

In [12]:
x = df.iloc[:, 1:12]
y = df.quality_rate.values

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#### Классификация с LogReg

In [52]:
LogReg = Pipeline([
        ('classification', LogisticRegression())
])
parameters_LogReg = [{
        'classification__C': np.linspace(0.001, 1, 10, dtype=float),
        'classification__penalty': ('l2', 'l1')  
}]

scores=['accuracy', 'f1']

grid_LogReg = GridSearchCV(
                LogReg, 
                param_grid=parameters_LogReg, 
                cv=3,
                scoring=scores,
                refit=scores[0],
                n_jobs=-1, 
                verbose=1
)

grid_LogReg_sc = grid_LogReg.fit(x_train, y_train)

with open('E:/Repositories/Projects/SVO/Netology/CRISP_DM/WIne_Quality/models/LogReg.pkl', 'wb') as file:  
    pickle.dump(LogReg, file)

with open('E:/Repositories/Projects/SVO/Netology/CRISP_DM/WIne_Quality/reports/LogReg.txt', 'a') as f:
        f.write('Model: {}\n'.format(str(grid_LogReg).split('(')[0]))
        f.write('Accuracy: {}\n'.format(grid_LogReg_sc.best_score_))
        f.write('Best params: {}'.format(grid_LogReg_sc.best_params_))
        f.write('\n---\n')
f.close()

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [32]:
y_pred1 = grid_LogReg.predict(x_test)
print(metrics.classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.65      0.56      0.60       398
           1       0.79      0.85      0.82       784

    accuracy                           0.75      1182
   macro avg       0.72      0.70      0.71      1182
weighted avg       0.74      0.75      0.74      1182



#### Реализация модели классификации с использованием RandomForestClassifier

In [48]:
RFC = Pipeline([
       ('classification', RandomForestClassifier())
])
parameters_RFC = [{
        'classification__max_features': (0.5, 0.8, 1),
        'classification__n_estimators': [32, 64, 128],
        'classification__max_depth' : [5,6,7,8,10,12],
        'classification__criterion' :['gini', 'entropy']
}]

scores=['accuracy', 'f1']

grid_RFC = GridSearchCV(
                RFC, 
                param_grid = parameters_RFC,
                cv=3,
                scoring=scores,
                refit=scores[0],
                n_jobs=-1, 
                verbose=1
)


grid_RFC_sc = grid_RFC.fit(x_train, y_train)

with open('E:/Repositories/Projects/SVO/Netology/CRISP_DM/WIne_Quality/models/RFC.pkl', 'wb') as file:  
    pickle.dump(RFC, file)

with open('E:/Repositories/Projects/SVO/Netology/CRISP_DM/WIne_Quality/reports/RFC.txt', 'a') as f:
        f.write('Model: {}\n'.format(str(grid_RFC).split('(')[0]))
        f.write('Accuracy: {}\n'.format(grid_RFC_sc.best_score_))
        f.write('Best params: {}'.format(grid_RFC_sc.best_params_))
        f.write('\n---\n')
f.close()

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [49]:
y_pred = grid_RFC.predict(x_test)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.73      0.74       398
           1       0.86      0.88      0.87       784

    accuracy                           0.83      1182
   macro avg       0.81      0.80      0.81      1182
weighted avg       0.83      0.83      0.83      1182



### Оценка моделей

In [33]:
from sklearn.metrics import accuracy_score, make_scorer, precision_score, recall_score, f1_score

In [34]:
model_list = [grid_RFC, grid_LogReg]

for model in model_list:
    y_pred = model.best_estimator_.predict(x_test)
    print(f"Model: {model}")
    print('Accuracy: %.2f' % accuracy_score(y_true=y_test, y_pred=y_pred))
    print('Precision: %.2f' % precision_score(y_true=y_test, y_pred=y_pred))
    print('Recall: %.2f' % recall_score(y_true=y_test, y_pred=y_pred))
    print('f1_score: %.2f' % f1_score(y_true=y_test, y_pred=y_pred))
    print()

Model: GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('classification',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=[{'classification__max_features': (0.5, 0.8, 1)}],
             refit='accuracy', scoring=['accuracy', 'f1'], verbose=1)
Accuracy: 0.84
Precision: 0.87
Recall: 0.89
f1_score: 0.88

Model: GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('classification',
                                        LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'classification__C': array([0.001, 0.112, 0.223, 0.334, 0.445, 0.556, 0.667, 0.778, 0.889,
       1.   ]),
                          'classification__penalty': ('l2', 'l1')}],
             refit='accuracy', scoring=['accuracy', 'f1'], verbose=1)
Accuracy: 0.75
Precision: 0.79
Recall: 0.85
f1_score: 0.82



### Итого

Лучшие результаты показала модель RandomForestClassifier

In [50]:
y_pred = grid_RFC.best_estimator_.predict(x_test)
y_pred = pd.DataFrame(y_pred)

y_pred.to_csv('E:/Repositories/Projects/SVO/Netology/CRISP_DM/WIne_Quality/data/processed/y_pred.csv')

df['y_pred'] = y_pred
df.to_csv('E:/Repositories/Projects/SVO/Netology/CRISP_DM/WIne_Quality/data/processed/wine_prediction_final.csv', index = False )

In [51]:
df.head(50)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_rate,y_pred
0,white,0.333333,0.293333,0.465753,0.05291,0.27972,0.158537,0.463235,0.444803,0.597938,0.375,0.247934,6,1,0.0
1,white,0.583333,0.266667,0.547945,0.333333,0.286713,0.353659,0.334559,0.515817,0.556701,0.305556,0.347107,6,1,1.0
2,white,0.458333,0.2,0.438356,0.417989,0.342657,0.560976,0.661765,0.548096,0.484536,0.25,0.31405,6,1,0.0
3,white,0.458333,0.2,0.438356,0.417989,0.342657,0.560976,0.661765,0.548096,0.484536,0.25,0.31405,6,1,1.0
4,white,0.583333,0.266667,0.547945,0.333333,0.286713,0.353659,0.334559,0.515817,0.556701,0.305556,0.347107,6,1,1.0
5,white,0.319444,0.32,0.219178,0.338624,0.251748,0.353659,0.477941,0.502905,0.474227,0.347222,0.264463,6,1,1.0
6,white,0.333333,0.293333,0.465753,0.05291,0.27972,0.158537,0.463235,0.444803,0.597938,0.375,0.247934,6,1,0.0
7,white,0.583333,0.186667,0.589041,0.047619,0.244755,0.329268,0.452206,0.431892,0.515464,0.319444,0.495868,6,1,1.0
8,white,0.583333,0.253333,0.561644,0.044974,0.167832,0.121951,0.209559,0.238218,0.278351,0.472222,0.661157,5,0,1.0
9,white,0.652778,0.2,0.547945,0.190476,0.181818,0.195122,0.378676,0.489994,0.43299,0.430556,0.280992,5,0,1.0
