In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import SVG
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import warnings
warnings.simplefilter('ignore')

In [None]:
wine = pd.read_csv('winequality-red.csv')
wine.head()

In [None]:
sns.countplot(x='quality', data=wine)

In [None]:
corr_matrix = wine.corr()
plt.figure(figsize=(7, 6))
sns.heatmap(corr_matrix, annot=True)

In [None]:
corr_matrix = wine.corr()
corr_with_target = abs(corr_matrix['quality']).sort_values(ascending=False)
best_features = corr_with_target[1:6].index.tolist()
print("Best features based on correlation analysis: ", best_features)

In [None]:
wine.columns

In [None]:
wine.quality.unique()

In [None]:
# Predictor columns used by the decision-tree models.
feature_cols = [
    'alcohol',
    'volatile acidity',
    'sulphates',
    'citric acid',
    'total sulfur dioxide',
]

# Feature matrix and target (the raw quality score).
X = wine[feature_cols]
y = wine['quality']

#### Effect of depth on bias and variance

In [None]:
# Hold out 20% of the rows, then fit one tree per depth and record the
# accuracy on both the training and the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
max_depth_list = list(range(1, 11))
train_accuracies, test_accuracies = [], []
for depth in max_depth_list:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)
    train_accuracies.append(accuracy_score(y_train, clf.predict(X_train)))
    test_accuracies.append(accuracy_score(y_test, clf.predict(X_test)))

# Training vs. test accuracy as a function of tree depth.
fig, ax = plt.subplots()
ax.plot(max_depth_list, train_accuracies, label='Training accuracy')
ax.plot(max_depth_list, test_accuracies, label='Test accuracy')
ax.set_xlabel('Max depth')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

#### Approach 1: multi-class classification (raw quality scores)

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from graphviz import Source

# Depth-2 tree fitted on every row (used for inspection, not evaluation).
tree_depth_2 = DecisionTreeClassifier(random_state=1, max_depth=2).fit(X, y)

# Importance the fitted tree assigns to each predictor.
print("\n".join(
    f"{name}: {importance}"
    for name, importance in zip(feature_cols, tree_depth_2.feature_importances_)
))

# Export the tree as DOT text, render it to SVG and show it inline.
dot_text = export_graphviz(
    tree_depth_2,
    out_file=None,
    feature_names=feature_cols,
    class_names=['3', '4', '5', '6', '7', '8'],
    filled=True,
)
graph = Source(dot_text)
svg = SVG(graph.pipe(format='svg'))
display(svg)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Refit the depth-2 tree on the training rows only and predict the test rows.
tree_depth_2 = DecisionTreeClassifier(max_depth=2, random_state=1).fit(X_train, y_train)
y_pred = tree_depth_2.predict(X_test)

# Test-set accuracy, then the per-class precision/recall report.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Depth-5 tree fitted on every row (used for inspection, not evaluation).
tree_depth_5 = DecisionTreeClassifier(random_state=1, max_depth=5).fit(X, y)

# Importance the fitted tree assigns to each predictor.
print("\n".join(
    f"{name}: {importance}"
    for name, importance in zip(feature_cols, tree_depth_5.feature_importances_)
))

# Export the tree as DOT text, render it to SVG and show it inline.
dot_text5 = export_graphviz(
    tree_depth_5,
    out_file=None,
    feature_names=feature_cols,
    class_names=['3', '4', '5', '6', '7', '8'],
    filled=True,
)
graph5 = Source(dot_text5)
svg5 = SVG(graph5.pipe(format='svg'))
display(svg5)
# Optional PNG export:
# graph5.render('max depth 5 - Multi-class', format='png', view=True)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Refit the depth-5 tree on the training rows only and predict the test rows.
tree_depth_5 = DecisionTreeClassifier(max_depth=5, random_state=1).fit(X_train, y_train)
y_pred = tree_depth_5.predict(X_test)

# Test-set accuracy, then the per-class precision/recall report.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print(report)

#### Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate values for each decision-tree hyperparameter.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[1, 2, 3, 5],
    min_samples_leaf=[2, 5, 7, 10],
    max_leaf_nodes=[None, 5, 7, 10],
)
tree = DecisionTreeClassifier(random_state=1)

# 5-fold cross-validated search over every combination in the grid.
# NOTE(review): the search is fitted on all of X, y, including rows that
# later cells hold out as a test set.
grid_search = GridSearchCV(tree, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X, y)
print("Best parameters: ", grid_search.best_params_)

In [None]:
# Tree built from hard-coded hyperparameters and fitted on every row.
# NOTE(review): presumably these are the values reported by the grid search
# above; confirm against grid_search.best_params_.
best_params = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_leaf_nodes=None,
    min_samples_leaf=5,
    random_state=1,
)
best_params.fit(X, y)

# Export the tree as DOT text, render it to SVG and show it inline.
best_dot_text = export_graphviz(
    best_params,
    out_file=None,
    feature_names=feature_cols,
    class_names=['3', '4', '5', '6', '7', '8'],
    filled=True,
)
best_tree = SVG(Source(best_dot_text).pipe(format='svg'))
display(best_tree)
# Optional PNG export (needs the graphviz Source object, not the SVG):
# best_tree.format = 'png'
# best_tree.render('Tree using hyperparameters - Multi-class', view=True)

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Same hard-coded hyperparameters as the visualised tree, trained on the
# training rows only.
best_tree_metrics = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_leaf_nodes=None,
    min_samples_leaf=5,
    random_state=1,
)
best_tree_metrics.fit(X_train, y_train)

# Test-set accuracy, then the per-class precision/recall report.
y_pred = best_tree_metrics.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print(report)

#### Approach 2: binary classification (quality > 5 vs. quality <= 5)

In [None]:
df = pd.read_csv('winequality-red.csv')
df.head()

In [None]:
df['quality'] = df['quality'].apply(lambda x : 0 if x <=5 else 1)
df.head()

In [None]:
sns.countplot(x='quality', data=df)

In [None]:
corr_matrix = df.corr()
corr_with_target = abs(corr_matrix['quality']).sort_values(ascending=False)
best_features = corr_with_target[1:6].index.tolist()
print("Best features based on correlation analysis: ", best_features)

In [None]:
# Predictor columns used by the binary decision-tree models.
feature_cols = [
    'alcohol',
    'volatile acidity',
    'sulphates',
    'citric acid',
    'total sulfur dioxide',
]

# Feature matrix and target taken from the binary-labelled frame.
X = df[feature_cols]
y = df['quality']

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from graphviz import Source

# Depth-2 tree fitted on every row of the binary problem (inspection only).
tree_depth_2_binary = DecisionTreeClassifier(random_state=1, max_depth=2).fit(X, y)

# Importance the fitted tree assigns to each predictor.
print("\n".join(
    f"{name}: {importance}"
    for name, importance in zip(feature_cols, tree_depth_2_binary.feature_importances_)
))

# Export the tree as DOT text, render it to SVG and show it inline.
dot_text = export_graphviz(
    tree_depth_2_binary,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)
graph = Source(dot_text)
svg = SVG(graph.pipe(format='svg'))
display(svg)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Refit the binary depth-2 tree on the training rows only.
tree_depth_2_binary = DecisionTreeClassifier(max_depth=2, random_state=1).fit(X_train, y_train)

# Fraction of test rows classified correctly.
accuracy = accuracy_score(y_test, tree_depth_2_binary.predict(X_test))
print("Accuracy:", accuracy)

In [None]:
# Depth-5 tree fitted on every row of the binary problem (inspection only).
tree_depth_5_binary = DecisionTreeClassifier(random_state=1, max_depth=5).fit(X, y)

# Importance the fitted tree assigns to each predictor.
print("\n".join(
    f"{name}: {importance}"
    for name, importance in zip(feature_cols, tree_depth_5_binary.feature_importances_)
))

# Export the tree as DOT text, render it to SVG and show it inline.
dot_text5 = export_graphviz(
    tree_depth_5_binary,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)
graph5 = Source(dot_text5)
svg5 = SVG(graph5.pipe(format='svg'))
display(svg5)
# Optional PNG export:
# graph5.render('max depth 5 - Binary class', format='png', view=True)

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Train the binary decision tree with max_depth=5 on the training rows only.
tree_depth_5_binary = DecisionTreeClassifier(max_depth=5, random_state=1)
tree_depth_5_binary.fit(X_train, y_train)

# Predict with the model fitted in this cell. best_tree_metrics is the tree
# trained on the raw multi-class quality scores in Approach 1, so its
# predictions are not comparable with the 0/1 labels in y_test.
y_pred = tree_depth_5_binary.predict(X_test)

# Accuracy on the test set, followed by the per-class report (as printed
# for the multi-class trees).
accuracy = tree_depth_5_binary.score(X_test, y_test)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

#### Neural Networks

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [None]:
pip install networkx

In [None]:
# Every column except the target is a predictor for the neural networks.
X = df.drop(columns=['quality'])
y = df['quality']

# Fixed seed so the split, and every metric computed from it, is reproducible.
# This matches the random_state=1 used by the other splits in this notebook.
# test_size is left at the library default, as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# you may need to install networkx with pip
import networkx as nx
import colorsys

def show_ann(mlp):
    """Draw a fitted multi-layer perceptron as a layered directed graph.

    One node is drawn per neuron and one edge per connection weight. Each
    layer is laid out as a horizontal row, with the input layer on the top
    row and the output layer on the bottom row. Edge hue encodes the sign of
    the weight (hue 0 for negative weights, 0.65 otherwise) and edge width
    is twice the absolute weight.

    Parameters
    ----------
    mlp : object exposing ``coefs_``
        ``coefs_`` is read as a sequence of 2-D weight arrays in which
        ``coefs_[l][k][i]`` is the weight from neuron ``k`` of layer ``l``
        to neuron ``i`` of layer ``l + 1`` (e.g. a fitted ``MLPClassifier``).

    Notes
    -----
    Layer sizes are taken from the weight matrices themselves and hidden and
    output layers share one code path, so a model with a single weight
    matrix (no hidden layer) is drawn as well.
    """
    coefs = mlp.coefs_
    # One row of neurons for the input plus one per weight matrix.
    layers_n = len(coefs) + 1

    G = nx.DiGraph()
    pos = {}

    # Input neurons: only positions are registered here; the nodes themselves
    # are added to the graph when their outgoing edges are created.
    for i in range(len(coefs[0])):
        pos['Layer0_{}'.format(i)] = (i, layers_n - 1)

    # Each weight matrix connects layer `prev_layer` (its rows) to layer
    # `cur_layer` (its columns).
    for prev_layer, layer_weights in enumerate(coefs):
        cur_layer = prev_layer + 1
        prev_size = len(layer_weights)
        cur_size = len(layer_weights[0])
        for i in range(cur_size):
            target = 'Layer{}_{}'.format(cur_layer, i)
            pos[target] = (i, layers_n - 1 - cur_layer)
            for k in range(prev_size):
                source = 'Layer{}_{}'.format(prev_layer, k)
                G.add_edge(source, target, weight=layer_weights[k][i])

    edges = G.edges()
    # Full saturation and value; only the hue changes with the weight's sign.
    colors = [colorsys.hsv_to_rgb(0 if G[u][v]['weight'] < 0 else 0.65, 1, 1)
              for u, v in edges]
    weights = [abs(G[u][v]['weight']) * 2 for u, v in edges]

    nx.draw(G, pos, node_color='y', node_size=450, width=weights, edge_color=colors)
    

In [None]:
X.columns

In [None]:
df.quality.unique()

In [None]:
scaler = StandardScaler()

# Fit only to the training data
scaler.fit(X_train)

# Now apply the transformations to the data:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hidden-layer architectures (one tuple entry per hidden layer).
param_grid = {
    'hidden_layer_sizes': [
        (2, 2),
        (5, 5),
        (10, 10),
        (15, 15, 15),
        (30, 30),
        (30, 30, 15),
    ]
}

# 5-fold cross-validated search over the architectures.
mlp = MLPClassifier(max_iter=10000)
grid = GridSearchCV(mlp, param_grid, cv=5)
grid.fit(X_train, y_train)

# Indices of the three configurations with the highest mean CV score,
# best first.
results = grid.cv_results_
scores = results['mean_test_score']
best_scores_idx = scores.argsort()[-3:][::-1]
for rank, idx in enumerate(best_scores_idx[:3], start=1):
    print("Configuration ", rank, ":")
    print("Parameters:", results['params'][idx])
    print("")

# Train a fresh network with the top-ranked configuration.
best_config = results['params'][best_scores_idx[0]]
best_mlp = MLPClassifier(max_iter=10000, **best_config)
best_mlp.fit(X_train, y_train)

In [None]:
nn1 = MLPClassifier(hidden_layer_sizes=(2, 2),max_iter=500)
nn1.fit(X_train,y_train)
predictions = nn1.predict(X_test)
print(confusion_matrix(y_test, predictions))

In [None]:
print(classification_report(y_test,predictions))

In [None]:
print('This dataset has {} input nodes and {} output node(s)'.format(len(X.columns), len(y.unique())))
print('There are {} 2D arrays of coefficients, one for each layer'.format(len(nn1.coefs_)))
print('The layers have the following number of coefficients: {}')
for l in range(len(nn1.coefs_)):
    m = len(nn1.coefs_[l])
    n = len(nn1.coefs_[l][0])
    print('  {}: {}x{} ({} nodes feeding into a layer of {} nodes)'.format(l, m, n, m, n))
# Print the actual coefficients
# print(mlp.coefs_)
print()
print('There are {} 1D arrays of intercepts, one for each layer'.format(len(nn1.intercepts_)))
print('Each layer has {} intercepts, one for each node'.format([len(nn1.intercepts_[l]) for l,_ in enumerate(nn1.intercepts_)]))


In [None]:
show_ann(nn1)

In [None]:
nn2 = MLPClassifier(hidden_layer_sizes=(15, 15, 15),max_iter=10000)
nn2.fit(X_train,y_train)
predictions = nn2.predict(X_test)
print(confusion_matrix(y_test, predictions))

In [None]:
print(classification_report(y_test,predictions))

In [None]:
show_ann(nn2)

In [None]:
nn3 = MLPClassifier(hidden_layer_sizes=(30, 30, 15),max_iter=10000)
nn3.fit(X_train,y_train)
predictions = nn3.predict(X_test)
print(confusion_matrix(y_test, predictions))

In [None]:
print(classification_report(y_test,predictions))

In [None]:
show_ann(nn3)

In [None]:
nn4 = MLPClassifier(hidden_layer_sizes=(30, 30),max_iter=10000)
nn4.fit(X_train,y_train)
predictions = nn1.predict(X_test)
print(confusion_matrix(y_test, predictions))

In [None]:
print(classification_report(y_test,predictions))

In [None]:
show_ann(nn4)