In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the red-wine quality table from a CSV located next to the notebook.
df = pd.read_csv(filepath_or_buffer='./winequality-red.csv')

In [4]:
# Print column names, dtypes and non-null counts to spot missing values early.
df.info()

In [5]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
df.head()

In [6]:
from sklearn.preprocessing import StandardScaler
# Unfitted scaler. It is fitted later on the training split only and then
# reused to transform the test split, so no test-set statistics leak in.
sc = StandardScaler()

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Every column except the last is a
# feature; the last column is the target. The fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=1,
)

In [8]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits.
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

In [9]:
from sklearn.linear_model import LogisticRegression

# Despite the variable name, this is a classifier over the quality labels.
# It is trained on the standardized training features.
regressor = LogisticRegression().fit(X_train_scaled, y_train)

# Echo the fitted estimator as the cell output.
regressor

In [10]:
# Predicted quality label for each test wine. The model was fitted on
# standardized features, so the standardized test matrix is used here too.
y_pred = regressor.predict(X_test_scaled)

In [11]:
from sklearn.metrics import confusion_matrix

In [12]:
# Convert the target Series into a plain NumPy array (the row index is dropped).
# NOTE(review): this rebinds y_test, so every later cell sees the array version.
y_test = np.array(y_test)

In [13]:
# Mean accuracy of the logistic-regression classifier, as a percentage.
# The model was fitted on the standardized features, so it must be scored on
# the same standardized matrices. Scoring on the raw X_train / X_test feeds it
# inputs on a different scale and produces misleading accuracies.
train_score = regressor.score(X_train_scaled, y_train) * 100
test_score = regressor.score(X_test_scaled, y_test) * 100
print(train_score)
print(test_score)

In [15]:
from sklearn.ensemble import RandomForestRegressor

# Shallow forest (max_depth=3) with a fixed seed so repeated runs give the
# same trees.
randomforest_regressor = RandomForestRegressor(max_depth=3, random_state=1)

# Fit on the standardized matrix. The scoring and prediction cells feed this
# model X_train_scaled / X_test_scaled, and split thresholds learned on raw
# units would be meaningless when applied to standardized inputs.
randomforest_regressor.fit(X_train_scaled, y_train)


In [24]:
# R^2 of the random forest on each split, expressed as a percentage.
# NOTE(review): these calls pass the standardized matrices, so the numbers are
# only meaningful if the forest was fitted on standardized features as well.
randomforest_train_score = 100 * randomforest_regressor.score(X_train_scaled, y_train)
randomforest_test_score = 100 * randomforest_regressor.score(X_test_scaled, y_test)
print(randomforest_train_score, randomforest_test_score, sep='\n')

In [25]:
# Per-feature importances from the fitted forest, one value per input column.
importances = randomforest_regressor.feature_importances_
# Use only the feature columns (everything but the last, which is the target,
# matching the train/test split). Taking all of df.columns would include the
# target name and only line up with `importances` because zip() truncates.
labels = df.columns[:-1]

In [26]:
# Pair each label with its importance and rank from most to least important.
feature_df = pd.DataFrame(
    list(zip(labels, importances)),
    columns=['feature', 'importance'],
).sort_values(by='importance', ascending=False)

In [27]:
import seaborn as sns
import matplotlib.pyplot as plt

# Font sizes; the later prediction plot reuses the same two names.
axis_fs = 18
title_fs = 22

sns.set(style='whitegrid')

# Horizontal bar chart of the ranked feature importances.
ax = sns.barplot(data=feature_df, x='importance', y='feature')
ax.set_title('Random Forest\n feature importance', fontsize=title_fs)
ax.set_xlabel('Importance', fontsize=axis_fs)
ax.set_ylabel('Feature', fontsize=axis_fs)

plt.tight_layout()


In [30]:
# Seeded generator so the display jitter, and therefore the figure, is
# reproducible across runs instead of depending on the global NumPy RNG state.
jitter_rng = np.random.default_rng(1)
n_test = len(y_test)

# Gaussian jitter (sigma = 0.25) is added to both axes purely to reduce
# overplotting of the integer quality scores.
# NOTE: both arrays contain this display noise; do not reuse them for metrics.
# NOTE(review): predictions use X_test_scaled, so the forest is expected to
# have been fitted on standardized features.
y_pred_randomforest = randomforest_regressor.predict(X_test_scaled) + jitter_rng.normal(0, 0.25, n_test)
y_jitter_randomforest = y_test + jitter_rng.normal(0, 0.25, n_test)
res_df = pd.DataFrame(list(zip(y_jitter_randomforest, y_pred_randomforest)), columns=['True', 'Pred'])

# Draw on an explicit, fresh figure so the plot never lands on axes left over
# from an earlier cell when run outside the inline backend.
fig, ax = plt.subplots()
sns.scatterplot(x='True', y='Pred', data=res_df, ax=ax)
ax.set_aspect('equal')
ax.set_xlabel('True wine quality', fontsize=axis_fs)
ax.set_ylabel('predicted wine quality', fontsize=axis_fs)
ax.set_title('Residuals', fontsize=title_fs)

# Identity line: points on it are perfect predictions.
ax.plot([1, 10], [1, 10], 'black', linewidth=1)
ax.set_ylim((2.5, 8.5))
ax.set_xlim((2.5, 8.5))

plt.tight_layout()