# Red wine quality project: observations analyzed
## Jakub Kosterna, Bartosz Siński, Jan Smoleń

### Packages load & generation seed declaration

In [6]:
import pandas as pd
import numpy as np

import pickle

# Seed NumPy's global random generator so stochastic steps are reproducible.
# The seed must be passed in a call: assigning to `np.random.seed` would only
# replace the function object with an integer and leave the generator unseeded.
np.random.seed(42)

### Data & models load

In [113]:
def load_pickled_model(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read.

    NOTE: unpickling can execute arbitrary code; only load files that come
    from a trusted source.
    """
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


# The first CSV column ('Unnamed: 0') holds the saved row index, not a
# feature/target value, so it is dropped here.
X_test = pd.read_csv("data/X_test.csv").drop(columns="Unnamed: 0")
y_test = pd.read_csv("data/y_test.csv").drop(columns="Unnamed: 0")

# Fitted classifiers; they are reported below as XGBoost, Support Vector
# Machine, Random Forest and Gradient Boosting respectively.
gbm = load_pickled_model("models/gbm.pickle")
svm = load_pickled_model("models/svm.pickle")
rfc = load_pickled_model("models/rfc.pickle")
gbc = load_pickled_model("models/gbc.pickle")

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,8.4,0.360,0.32,2.2,0.081,32.0,79.0,0.99640,3.30,0.72,11.0
1,8.4,0.635,0.36,2.0,0.089,15.0,55.0,0.99745,3.31,0.57,10.4
2,6.6,0.440,0.15,2.1,0.076,22.0,53.0,0.99570,3.32,0.62,9.3
3,11.5,0.180,0.51,4.0,0.104,4.0,23.0,0.99960,3.28,0.97,10.1
4,8.9,0.635,0.37,1.7,0.263,5.0,62.0,0.99710,3.00,1.09,9.3
...,...,...,...,...,...,...,...,...,...,...,...
395,7.4,0.530,0.12,1.9,0.165,4.0,12.0,0.99702,3.26,0.86,9.2
396,9.4,0.270,0.53,2.4,0.074,6.0,18.0,0.99620,3.20,1.13,12.0
397,10.0,0.290,0.40,2.9,0.098,10.0,26.0,1.00060,3.48,0.91,9.7
398,8.5,0.340,0.44,1.7,0.079,6.0,12.0,0.99605,3.52,0.63,10.7


### First 32 observations' overview

In [50]:
# One row per test observation: the id stored in the first CSV column, the true
# label, and the prediction of every loaded classifier.
preds = pd.DataFrame(
    {
        "original id": pd.read_csv("data/X_test.csv").iloc[:, 0],
        "actual": y_test["is_good"],
        **{
            label: model.predict(X_test)
            for label, model in (
                ("XGBoost", gbm),
                ("Support Vector Machine", svm),
                ("Random Forest", rfc),
                ("Gradient Boosting", gbc),
            )
        },
    }
)

In [52]:
# Show the first 32 rows of the prediction table.
preds.head(32)

Unnamed: 0,original id,actual,XGBoost,Support Vector Machine,Random Forest,Gradient Boosting
0,918,1,1,1,1,1
1,224,0,1,0,1,1
2,1211,0,0,0,0,1
3,269,1,1,1,1,1
4,240,0,0,0,0,0
5,605,1,1,0,0,0
6,885,0,1,0,1,1
7,1461,0,0,0,0,0
8,956,1,1,1,1,1
9,1260,0,0,0,0,0


### Some facts about results

In [101]:
# Rows where XGBoost disagrees with the true label while the other three models
# agree with XGBoost, i.e. every classifier got the observation wrong.
all_false = preds[
    (preds["actual"] != preds["XGBoost"])
    & (preds["XGBoost"] == preds["Support Vector Machine"])
    & (preds["XGBoost"] == preds["Random Forest"])
    & (preds["XGBoost"] == preds["Gradient Boosting"])
]
# The share is derived from the real number of test rows rather than a
# hard-coded "/ 4", which is only correct for exactly 400 observations.
print("Observations misclassified by each classifier: ", len(all_false),
      " (", 100 * len(all_false) / len(preds), "% of all)", sep = "")
all_false.head(8)

Observations misclassified by each classifier: 48 (12.0% of all)


Unnamed: 0,original id,actual,XGBoost,Support Vector Machine,Random Forest,Gradient Boosting
10,1546,0,1,1,1,1
15,1367,1,0,0,0,0
20,645,1,0,0,0,0
24,1487,0,1,1,1,1
25,8,1,0,0,0,0
37,248,1,0,0,0,0
39,900,0,1,1,1,1
47,19,1,0,0,0,0


In [102]:
# Rows where XGBoost matches the true label and the other three models agree
# with XGBoost, i.e. every classifier got the observation right.
all_true = preds[
    (preds["actual"] == preds["XGBoost"])
    & (preds["XGBoost"] == preds["Support Vector Machine"])
    & (preds["XGBoost"] == preds["Random Forest"])
    & (preds["XGBoost"] == preds["Gradient Boosting"])
]
# The share is derived from the real number of test rows rather than a
# hard-coded "/ 4", which is only correct for exactly 400 observations.
print("Observations well classified by each classifier: ", len(all_true),
      " (", 100 * len(all_true) / len(preds), "% of all)", sep = "")
all_true.head(16)

Observations well classified by each classifier: 271 (67.75% of all)


Unnamed: 0,original id,actual,XGBoost,Support Vector Machine,Random Forest,Gradient Boosting
0,918,1,1,1,1,1
3,269,1,1,1,1,1
4,240,0,0,0,0,0
7,1461,0,0,0,0,0
8,956,1,1,1,1,1
9,1260,0,0,0,0,0
11,1502,0,0,0,0,0
12,1404,1,1,1,1,1
13,18,0,0,0,0,0
14,1563,0,0,0,0,0


In [96]:
# Rows where XGBoost matches the true label and each of the other three models
# predicts something different from XGBoost.
gbm_true = preds[
    (preds["actual"] == preds["XGBoost"])
    & (preds["XGBoost"] != preds["Support Vector Machine"])
    & (preds["XGBoost"] != preds["Random Forest"])
    & (preds["XGBoost"] != preds["Gradient Boosting"])
]
# The share is derived from the real number of test rows rather than a
# hard-coded "/ 4", which is only correct for exactly 400 observations.
print("Observations well classified only by XGBoost: ", len(gbm_true),
      " (", 100 * len(gbm_true) / len(preds), "% of all)", sep = "")
gbm_true

Observations well classified only by XGBoost: 5 (1.25% of all)


Unnamed: 0,original id,actual,XGBoost,Support Vector Machine,Random Forest,Gradient Boosting
5,605,1,1,0,0,0
116,1398,1,1,0,0,0
166,1285,0,0,1,1,1
262,309,1,1,0,0,0
361,220,1,1,0,0,0


In [97]:
# Rows where the SVM matches the true label and each of the other three models
# predicts something different from the SVM.
svm_true = preds[
    (preds["actual"] == preds["Support Vector Machine"])
    & (preds["Support Vector Machine"] != preds["XGBoost"])
    & (preds["Support Vector Machine"] != preds["Random Forest"])
    & (preds["Support Vector Machine"] != preds["Gradient Boosting"])
]
# The message names the model this cell actually reports on (it previously
# said "XGBoost"), and the share uses the real test-set size instead of "/ 4".
print("Observations well classified only by Support Vector Machine: ", len(svm_true),
      " (", 100 * len(svm_true) / len(preds), "% of all)", sep = "")
svm_true

Observations well classified only by XGBoost: 13 (3.25% of all)


Unnamed: 0,original id,actual,XGBoost,Support Vector Machine,Random Forest,Gradient Boosting
1,224,0,1,0,1,1
6,885,0,1,0,1,1
57,1342,1,0,1,0,0
69,1235,0,1,0,1,1
95,670,0,1,0,1,1
127,1463,1,0,1,0,0
142,539,0,1,0,1,1
208,85,0,1,0,1,1
288,133,1,0,1,0,0
295,451,1,0,1,0,0


In [95]:
# Rows where Random Forest matches the true label and each of the other three
# models predicts something different from Random Forest.
rfc_true = preds[
    (preds["actual"] == preds["Random Forest"])
    & (preds["Random Forest"] != preds["XGBoost"])
    & (preds["Random Forest"] != preds["Support Vector Machine"])
    & (preds["Random Forest"] != preds["Gradient Boosting"])
]
# The message names the model this cell actually reports on (it previously
# said "XGBoost"), and the share uses the real test-set size instead of "/ 4".
print("Observations well classified only by Random Forest: ", len(rfc_true),
      " (", 100 * len(rfc_true) / len(preds), "% of all)", sep = "")
rfc_true

Observations well classified only by XGBoost: 1 (0.25% of all)


Unnamed: 0,original id,actual,XGBoost,Support Vector Machine,Random Forest,Gradient Boosting
260,1019,0,1,1,0,1


In [103]:
# Rows where Gradient Boosting matches the true label and each of the other
# three models predicts something different from Gradient Boosting.
gbc_true = preds[
    (preds["actual"] == preds["Gradient Boosting"])
    & (preds["Gradient Boosting"] != preds["XGBoost"])
    & (preds["Gradient Boosting"] != preds["Support Vector Machine"])
    & (preds["Gradient Boosting"] != preds["Random Forest"])
]
# The message names the model this cell actually reports on (it previously
# said "XGBoost"), and the share uses the real test-set size instead of "/ 4".
print("Observations well classified only by Gradient Boosting: ", len(gbc_true),
      " (", 100 * len(gbc_true) / len(preds), "% of all)", sep = "")
gbc_true

Observations well classified only by XGBoost: 9 (2.25% of all)


Unnamed: 0,original id,actual,XGBoost,Support Vector Machine,Random Forest,Gradient Boosting
31,1283,1,0,0,0,1
117,231,1,0,0,0,1
165,482,0,1,1,1,0
280,483,0,1,1,1,0
308,631,0,1,1,1,0
309,1287,0,1,1,1,0
335,338,1,0,0,0,1
376,1574,1,0,0,0,1
388,256,0,1,1,1,0


An interesting fact: although the SVM performed the worst overall, it stands out from the other models in a positive way — it has the largest number of observations for which it was the only classifier to predict the result correctly.

### Final observations for further analysis

Based on the above analysis and the nature of the problem and the models, we decided to include **16 representative wines** in further analysis:

In [104]:
# Hand-picked representative observations, one record per wine:
# (position in the test set, id in the original data set, description).
# TP/TN/FP/FN are relative to the positive class (label 1).
# When a single model is the only correct one, the remaining models predicted
# the opposite class: "TP" pairs with "others FN" and "TN" with "others FP".
chosen_ones = pd.DataFrame(
    [
        (0, 918, "all TP #1"),
        (3, 269, "all TP #2"),
        (8, 956, "all TP #3"),
        (4, 240, "all TN #1"),
        (7, 1461, "all TN #2"),
        (9, 1260, "all TN #3"),
        (15, 1367, "all FN #1"),
        (20, 645, "all FN #2"),
        (10, 1546, "all FP #1"),
        (24, 1487, "all FP #2"),
        (5, 605, "XGBoost TP, others FN"),
        (166, 1285, "XGBoost TN, others FP"),
        (57, 1342, "SVM TP, others FN"),
        (6, 885, "SVM TN, others FP"),
        (260, 1019, "Random Forest TN, others FP"),
        (31, 1283, "Gradient Boosting TP, others FN"),
    ],
    columns=["test set id", "original id", "details"],
)

In [106]:
# Display the table of hand-picked observations.
chosen_ones

Unnamed: 0,test set id,original id,details
0,0,918,all TP #1
1,3,269,all TP #2
2,8,956,all TP #3
3,4,240,all TN #1
4,7,1461,all TN #2
5,9,1260,all TN #3
6,15,1367,all FN #1
7,20,645,all FN #2
8,10,1546,all FP #1
9,24,1487,all FP #2


In [111]:
# Attach the true label and every model's prediction to the chosen wines.
# No key is given, so pandas joins on the columns the two frames share.
chosen_ones = chosen_ones.merge(preds)
chosen_ones

Unnamed: 0,test set id,original id,details,actual,XGBoost,Support Vector Machine,Random Forest,Gradient Boosting
0,0,918,all TP #1,1,1,1,1,1
1,3,269,all TP #2,1,1,1,1,1
2,8,956,all TP #3,1,1,1,1,1
3,4,240,all TN #1,0,0,0,0,0
4,7,1461,all TN #2,0,0,0,0,0
5,9,1260,all TN #3,0,0,0,0,0
6,15,1367,all FN #1,1,0,0,0,0
7,20,645,all FN #2,1,0,0,0,0
8,10,1546,all FP #1,0,1,1,1,1
9,24,1487,all FP #2,0,1,1,1,1


In [118]:
# Add the feature values of each chosen wine: the positional index of X_test
# is exposed as an "index" column, matched against "test set id", and removed
# again once the join is done.
chosen_ones_all_details = chosen_ones.merge(
    X_test.reset_index(),
    left_on="test set id",
    right_on="index",
).drop(columns="index")

In [122]:
# Display the chosen observations together with their predictions and feature values.
chosen_ones_all_details

Unnamed: 0,test set id,original id,details,actual,XGBoost,Support Vector Machine,Random Forest,Gradient Boosting,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0,918,all TP #1,1,1,1,1,1,8.4,0.36,0.32,2.2,0.081,32.0,79.0,0.9964,3.3,0.72,11.0
1,3,269,all TP #2,1,1,1,1,1,11.5,0.18,0.51,4.0,0.104,4.0,23.0,0.9996,3.28,0.97,10.1
2,8,956,all TP #3,1,1,1,1,1,9.0,0.36,0.52,2.1,0.111,5.0,10.0,0.99568,3.31,0.62,11.3
3,4,240,all TN #1,0,0,0,0,0,8.9,0.635,0.37,1.7,0.263,5.0,62.0,0.9971,3.0,1.09,9.3
4,7,1461,all TN #2,0,0,0,0,0,6.2,0.785,0.0,2.1,0.06,6.0,13.0,0.99664,3.59,0.61,10.0
5,9,1260,all TN #3,0,0,0,0,0,8.6,0.635,0.68,1.8,0.403,19.0,56.0,0.99632,3.02,1.15,9.3
6,15,1367,all FN #1,1,0,0,0,0,6.9,0.54,0.3,2.2,0.088,9.0,105.0,0.99725,3.25,1.18,10.5
7,20,645,all FN #2,1,0,0,0,0,7.8,0.64,0.1,6.0,0.115,5.0,11.0,0.9984,3.37,0.69,10.1
8,10,1546,all FP #1,0,1,1,1,1,7.0,0.57,0.02,2.0,0.072,17.0,26.0,0.99575,3.36,0.61,10.2
9,24,1487,all FP #2,0,1,1,1,1,6.0,0.64,0.05,1.9,0.066,9.0,17.0,0.99496,3.52,0.78,10.6


In [137]:
# Feature rows of the chosen wines only. The row position becomes an explicit
# "test set id" column; rows stay in test-set order, not in the order in which
# they are listed in chosen_ones.
X_test_chosen_ones = (
    X_test.reset_index()
    .loc[lambda frame: frame["index"].isin(chosen_ones["test set id"])]
    .rename(columns={"index": "test set id"})
)
X_test_chosen_ones

Unnamed: 0,test set id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0,8.4,0.36,0.32,2.2,0.081,32.0,79.0,0.9964,3.3,0.72,11.0
3,3,11.5,0.18,0.51,4.0,0.104,4.0,23.0,0.9996,3.28,0.97,10.1
4,4,8.9,0.635,0.37,1.7,0.263,5.0,62.0,0.9971,3.0,1.09,9.3
5,5,8.3,0.6,0.13,2.6,0.085,6.0,24.0,0.9984,3.31,0.59,9.2
6,6,8.9,0.75,0.14,2.5,0.086,9.0,30.0,0.99824,3.34,0.64,10.5
7,7,6.2,0.785,0.0,2.1,0.06,6.0,13.0,0.99664,3.59,0.61,10.0
8,8,9.0,0.36,0.52,2.1,0.111,5.0,10.0,0.99568,3.31,0.62,11.3
9,9,8.6,0.635,0.68,1.8,0.403,19.0,56.0,0.99632,3.02,1.15,9.3
10,10,7.0,0.57,0.02,2.0,0.072,17.0,26.0,0.99575,3.36,0.61,10.2
15,15,6.9,0.54,0.3,2.2,0.088,9.0,105.0,0.99725,3.25,1.18,10.5


In [139]:
# Target rows of the chosen wines only. The row position becomes an explicit
# "test set id" column; rows stay in test-set order, not in the order in which
# they are listed in chosen_ones.
y_test_chosen_ones = (
    y_test.reset_index()
    .loc[lambda frame: frame["index"].isin(chosen_ones["test set id"])]
    .rename(columns={"index": "test set id"})
)
y_test_chosen_ones

Unnamed: 0,test set id,is_good
0,0,1
3,3,1
4,4,0
5,5,1
6,6,0
7,7,0
8,8,1
9,9,0
10,10,0
15,15,1


In [121]:
# Write the selection to CSV files under data/: the basic table, the table
# with feature values, and the matching feature/target subsets.
# File names are kept exactly as they are (including their spelling) because
# other code may read these paths.
for frame, destination in (
    (chosen_ones, "data/chosen_obserwations_basic.csv"),
    (chosen_ones_all_details, "data/chosen_obserwations_full.csv"),
    (X_test_chosen_ones, "data/X_test_chosen_ones.csv"),
    (y_test_chosen_ones, "data/y_test_chosen_ones.csv"),
):
    frame.to_csv(destination)