# loading data

In [None]:
import numpy as np
import scipy as sp
import pandas as pd

# plotting and visualization

import matplotlib.colors
import matplotlib.pyplot as plt
import seaborn as sns

# modeling
import sklearn.model_selection
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import cross_val_score,cross_validate
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score,precision_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay

import pickle

from sklearn import __version__ as sklearn_version
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

import os

In [None]:
X_train = pd.read_csv("../Capstone_Two_Project/data/processed/X_train.csv", index_col = 0)

In [None]:
X_test = pd.read_csv("../Capstone_Two_Project/data/processed/X_test.csv", index_col = 0)

In [None]:
y_train = pd.read_csv("../Capstone_Two_Project/data/processed/y_train.csv",index_col = False)

y_train=y_train.drop('Unnamed: 0',axis=1)

In [None]:
y_test = pd.read_csv("../Capstone_Two_Project/data/processed/y_test.csv", index_col = False)
y_test=y_test.drop('Unnamed: 0',axis=1)

# Modeling

## Metrics for model performance

For a classification machine learning, several metrics such as accuracy, recall, F1 and ROC_AUC are used in model performance evaluation. The project purpose is to predict good quality wine, and the dataset is imblanced, therefore the model with highest precision values with relative high accuracy among the three models:Logisticregression, randomforest, XGBoost will be the winner. 

## Baseline model

dummyclassifier is used in modeling prediction. it will serve as the baseline modeling, good models should shows better metris than it.

In [None]:
dummy = DummyClassifier(strategy ="stratified",random_state = 123)
dummy.fit(X_train.values,y_train.values.ravel())
y_pred = dummy.predict(X_test.values)
y_pred_probs = dummy.predict_proba(X_test.values) [:,1]

In [None]:
pc_dummy = precision_score(y_test, y_pred,average='weighted')
report_dummy = classification_report(y_test, y_pred)
cm_dummy = confusion_matrix(y_test, y_pred)
print('dummy: precision=%.3f' % (pc_dummy))

In [None]:
print("report:", report_dummy,"\n")

In dummy model, the precison for predict good win is 0.20. overall precision is 0.697.

three models:logisticregression, random forest and XGBoost Model will be selected. Through compare the cross validtion scores, the final model will be select and final tuned. Also, the number of features used in modeling was controlled bu selectkbest algorithms. 

## Logisticregression model

### make a pipeline

In [None]:
from sklearn.pipeline import Pipeline
pipe_log = Pipeline(
    steps =[("feature_selection",SelectKBest(score_func=f_regression)),("model",
    LogisticRegression())]
)

In [None]:
pipe_log.fit(X_train.values, y_train.values.ravel())

In [None]:
y_tr_pred_log = pipe_log.predict(X_train.values)
y_te_pred_log= pipe_log.predict(X_test.values)

In [None]:
y_pred_log_probs = pipe_log.predict_proba(X_test.values) [:,1]
print(roc_auc_score(y_test, y_pred_log_probs))

In [None]:
precision_score(y_train, y_tr_pred_log, average = "weighted" ), precision_score(y_test, y_te_pred_log, average ="weighted")

In [None]:
accuracy_score(y_train, y_tr_pred_log), accuracy_score(y_test, y_te_pred_log)

In [None]:
# report_log = classification_report(y_test, y_te_pred_log)
# cm_log = confusion_matrix(y_test, y_te_pred_log)
# ConfusionMatrixDisplay(confusion_matrix=cm_log,display_labels=pipe_log.classes_).plot()
# plt.show()
print(classification_report(y_test, y_te_pred_log))

### Assessing performance using cross-validation

In [None]:
cv_results_log = cross_validate(pipe_log, X_train, y_train.values.ravel(), cv=5,scoring = "precision")

In [None]:
cv_scores_log = cv_results_log['test_score']
cv_scores_log

In [None]:
cv_scores_mean_log = np.mean(cv_scores_log)
cv_score_std_log = np.std(cv_scores_log)
print(cv_scores_mean_log, cv_score_std_log)

the cv score is only 0.59, which is lower than the intial logistic regression modeling. 

## Random forest 

### using the default setting

In [None]:
pipe_rf = Pipeline(
    steps =[("feature_selection",SelectKBest(score_func=f_regression)),("rf",
    RandomForestClassifier(random_state= 123))]
)

In [None]:
pipe_rf.fit(X_train, y_train.values.ravel())

In [None]:
y_tr_pred_rf = pipe_rf.predict(X_train)
y_te_pred_rf = pipe_rf.predict(X_test)

In [None]:
y_pred_rf_probs = pipe_rf.predict_proba(X_test.values) [:,1]
print(roc_auc_score(y_test, y_pred_rf_probs))

In [None]:
precision_score(y_train, y_tr_pred_rf, average = "weighted" ), precision_score(y_test, y_te_pred_rf, average ="weighted")

### check cross validation score

In [None]:
cv_results_rf = cross_validate(pipe_rf, X_train, y_train.values.ravel(), cv=5,scoring = "precision")
cv_scores_rf = cv_results_rf['test_score']
cv_scores_rf

In [None]:
accuracy_score(y_train, y_tr_pred_rf), accuracy_score(y_test, y_te_pred_rf)

In [None]:
np.mean(cv_scores_rf), np.std(cv_scores_rf)

## XGBoost model

In [None]:
pipe_xgb = Pipeline(
    steps =[("feature_selection",SelectKBest(score_func=f_regression)),("rf",
    XGBClassifier(objective='binary:logistic',learning_rate=0.1 , n_estimators=300, random_state=123))]
)

In [None]:
pipe_xgb.fit(X_train, y_train.values.ravel())

In [None]:
y_tr_pred_xgb = pipe_xgb.predict(X_train)
y_te_pred_xgb = pipe_xgb.predict(X_test)
y_pred_xgb_probs = pipe_xgb.predict_proba(X_test.values) [:,1]
print(roc_auc_score(y_test, y_pred_xgb_probs))

In [None]:
precision_score(y_train, y_tr_pred_xgb, average = "weighted" ), precision_score(y_test, y_te_pred_xgb, average ="weighted")

### cross validation

In [None]:
cv_results_xgb = cross_validate(pipe_xgb, X_train, y_train.values.ravel(), cv=5,scoring = "precision")
cv_scores_xgb = cv_results_xgb['test_score']
cv_scores_xgb

In [None]:
np.mean(cv_scores_xgb), np.std(cv_scores_xgb)

## CV of the three models:

In [None]:
cv_dict = {
    "cv_scores_log": np.mean(cv_scores_log),
    "cv_scores_rf": np.mean(cv_scores_rf),
    "cv_scores_xgb": np.mean(cv_scores_xgb)
}
cv_score_model = pd.DataFrame(cv_dict.items(), columns=["cv_score","std"])
cv_score_model

among the three modles, random forest shows a slighlty highier score. Both logisticregression and random wil be tunning,and compare. 

## tunining model

### logisticregression model

In [None]:
k = [k+1 for k in range(len(X_train.columns))]
c_values = [0.01,0.1,1.0,10,100]
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
grid_params = {'feature_selection__k': k, 'model__C':c_values,'model__solver':solvers}
#grid_params = {'feature_slection__k': k}

In [None]:
log_grid_cv = GridSearchCV(pipe_log, param_grid=grid_params,scoring="precision", cv=5, n_jobs=-1)

In [None]:
log_grid_cv.fit(X_train, y_train.values.ravel())

In [None]:
score_mean = log_grid_cv.cv_results_['mean_test_score']
score_std = log_grid_cv.cv_results_['std_test_score']


In [None]:
log_grid_cv.best_params_

In [None]:
selected = log_grid_cv.best_estimator_.named_steps.feature_selection.get_support()
coefs = log_grid_cv.best_estimator_.named_steps.model.coef_
features = X_train.columns[selected]
pd.Series(coefs.ravel(), index = features.ravel()).sort_values(ascending=False)

### random forest model

In [None]:
pipe = Pipeline(
    steps =[("feature_selection",SelectKBest(score_func=f_regression)),("rf",
    RandomForestClassifier(random_state= 123))]
)

In [None]:
k = [k+1 for k in range(len(X_train.columns))]
n_est = [50,100,300,500]
grid_params = {'feature_selection__k': k,
        'rf__n_estimators': n_est
        
}

In [None]:
rf_grid_cv = GridSearchCV(pipe, param_grid=grid_params,scoring ="precision", cv=5, n_jobs=-1)
rf_grid_cv.fit(X_train, y_train.values.ravel())

In [None]:
rf_grid_cv.best_params_

In [None]:
#Plot a barplot of the random forest's feature importances,
#assigning the `feature_importances_` attribute of 

features_imp = rf_grid_cv.best_estimator_.named_steps.rf.feature_importances_
features_imp

In [None]:
X_train.columns

In [None]:
plt.subplots(figsize=(10, 5))
rf_feat = pd.Series(features_imp, index=X_train.columns).sort_values(ascending=False)
rf_feat.plot(kind='bar')
plt.xlabel('feature name')
plt.ylabel('importance')
plt.title('Tunied random forest feature importances');

In [None]:
cv_rf = cross_validate(rf_grid_cv.best_estimator_, X_train, y_train, 
                            scoring='precision', cv=5, n_jobs=-1)

In [None]:
cv_scores_rf = cv_rf['test_score']
np.mean(cv_scores_rf), np.std(cv_scores_rf)

In [None]:

y_tr_pred = rf_grid_cv.best_estimator_.predict(X_train)
y_te_pred = rf_grid_cv.best_estimator_.predict(X_test)
y_pred_probs = rf_grid_cv.best_estimator_.predict_proba(X_test.values) [:,1]
print(roc_auc_score(y_test, y_pred_probs))

In [None]:
report_rf = classification_report(y_test, y_te_pred)
cm_rf = confusion_matrix(y_test, y_te_pred)
ConfusionMatrixDisplay(confusion_matrix=cm_rf,display_labels=rf_grid_cv.classes_).plot()
plt.show()
print(classification_report(y_test, y_te_pred))

## Save best model object frome pipeline

In [None]:
best_model = rf_grid_cv.best_estimator_
best_model.version = '1.0'
best_model.pandas_version = pd.__version__
best_model.numpy_version = np.__version__
best_model.sklearn_version = sklearn_version
best_model.X_columns = [col for col in X_train.columns]
best_model.build_datetime = datetime.datetime.now()


In [None]:
import pickle

In [None]:
# save the model to disk
filename = 'wine_quality_model.sav'
pickle.dump(best_model, open(filename, 'wb'))

## Modeling the wine quality

In [68]:
df= pd.read_csv("../Capstone_Two_Project/data/processed/df_model.csv", index_col = "wine_type_white")
df.drop("Unnamed: 0",axis =1)
df

Unnamed: 0_level_0,Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,free_SO2_ratio,alcohol_surgar_ratio,sugar_acidity_ratio,fixed_acidity_ratio
wine_type_white,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0,32.352941,494.736842,23.456790,91.358025
0,1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0,37.313433,376.923077,29.953917,89.861751
0,2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0,27.777778,426.086957,26.869159,91.121495
0,3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0,28.333333,515.789474,16.550523,97.560976
0,4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,0,32.500000,522.222222,22.332506,91.811414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,0,26.086957,700.000000,24.960998,96.723869
1,5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0,33.928571,120.000000,115.606936,95.375723
1,5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,0,27.027027,783.333333,17.804154,96.439169
1,5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1,18.181818,1163.636364,18.998273,94.991364


In [None]:
X=df.drop("quality")

In [None]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

## summary

Four modeling were performed in good wine quality prediction. At beginner, the dummyclassifier is used as baseline model, which the prediction is irrelavent to featrues. Then three models logisticregression, random forest model and xgboost were evaluate through pipeline. the cv results shows the random forest model have high precision scores and therefore it be select for further tunning. 