In [None]:
import numpy as np
import scipy.stats as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import missingno
import datetime as dt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the red-wine quality table; the CSV is expected in the working directory.
csv_path = 'winequality-red.csv'
df = pd.read_csv(csv_path)

Exploratory Data Analysis (EDA)

In [None]:
# Summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# List the column labels of the dataset.
df.columns

In [None]:
# Share of each column dtype in the dataset, drawn as a pie chart.
data_counts = df.dtypes.value_counts()
# dtype objects make poor wedge labels; use their string names instead.
data_counts.index = data_counts.index.astype(str)

fig, ax = plt.subplots()
ax.set_title("Data types in dataset")
ax.pie(
    data_counts.values,
    labels=data_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive statistics for the columns of the dataset.
df.describe()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# A bare `df.loc[df.duplicated()]` in the middle of a cell is evaluated and
# thrown away (only the last expression of a cell is displayed), so it is not
# kept here.
df = df.drop_duplicates()

In [None]:
# Histogram of every column, laid out on a three-column grid.
# The number of rows is derived from the column count so the loop can never
# index past the grid; for 12 columns this is the same 4x3 / 12x10 figure.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(df.columns) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(12, 2.5 * n_grid_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
plt.subplots_adjust(hspace=0.5)

flat_axes = axes.ravel()
for ax, column in zip(flat_axes, df.columns):
    ax.hist(df[column], bins=10)
    ax.set_title(column)
    ax.set_xlabel('What')
    ax.set_ylabel('How many')

# Hide grid cells that have no column to show.
for ax in flat_axes[len(df.columns):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Frame summary again, now reflecting the de-duplicated rows.
df.info()

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Inspect the pH column.
df['pH']

In [None]:
# Distinct pH values present in the dataset.
df['pH'].unique()

In [None]:
# Frequency of each pH value, limited to the 30 most common ones.
pH=df['pH'].value_counts().head(30)

In [None]:
# Same computation as the preceding cell: the 30 most frequent pH values.
pH=df['pH'].value_counts().head(30)

In [None]:
# Bar chart of the most frequent pH values.
# seaborn draws numeric categories in ascending value order, whereas
# value_counts() is ordered by frequency. Relabelling the ticks with the
# frequency-ordered index therefore puts the wrong label under each bar, so the
# counts are sorted by value and that order is passed to seaborn explicitly.
ph_sorted = pH.sort_index()
plt.figure(figsize=(20,6))
plt.title("PH of wine",fontsize=15)
a=sns.barplot(x=ph_sorted.index, y=ph_sorted.values, order=ph_sorted.index)
a.tick_params(axis='x', rotation=90)  # rotate without overwriting the labels
a.set(xlabel='PH of wine', ylabel='Bottles of wine')
plt.tight_layout(pad=3)  # applied once the axes have content to lay out
plt.show()

In [None]:
# Distinct quality scores present in the dataset.
df['quality'].unique()

In [None]:
# Number of wines per quality score.
quality=df['quality'].value_counts()
# seaborn orders numeric categories by value while value_counts() orders by
# frequency; sort by value and pass the order explicitly so that every bar
# sits above its own quality score.
quality_sorted = quality.sort_index()
plt.figure(figsize=(20,6))
plt.title("Quality of wine",fontsize=15)
a=sns.barplot(x=quality_sorted.index, y=quality_sorted.values, order=quality_sorted.index)
a.set(xlabel='Quality of wine', ylabel='Bottles of wine')
plt.tight_layout(pad=3)  # applied once the axes have content to lay out
plt.show()

In [None]:
# Distinct alcohol percentage values present in the dataset.
df["alcohol"].unique()

In [None]:
# The 40 most frequent alcohol percentages.
alcohol=df['alcohol'].value_counts().head(40)
# seaborn orders numeric categories by value while value_counts() orders by
# frequency; sort by value and pass the order explicitly so the tick labels
# belong to the bars above them.
alcohol_sorted = alcohol.sort_index()
plt.figure(figsize=(20,6))
plt.title("percent of alcohol",fontsize=15)
a=sns.barplot(x=alcohol_sorted.index, y=alcohol_sorted.values, order=alcohol_sorted.index)
a.set(xlabel='percent of alcohol', ylabel='Bottles of wine')
# plt.set_cmap only changes the default colormap for image-like artists and
# never affected these bars, so it is not called here.
plt.tight_layout(pad=3)  # applied once the axes have content to lay out
plt.show()

In [None]:
# Distinct residual-sugar values present in the dataset.
df["residual sugar"].unique()

In [None]:
# The 40 most frequent residual-sugar values.
A111=df['residual sugar'].value_counts().head(40)
# seaborn orders numeric categories by value while value_counts() orders by
# frequency; sort by value and pass the order explicitly so the tick labels
# belong to the bars above them.
sugar_sorted = A111.sort_index()
plt.figure(figsize=(30,6))
plt.title("residual sugar in wine",fontsize=15)
a=sns.barplot(x=sugar_sorted.index, y=sugar_sorted.values, order=sugar_sorted.index)
a.set(xlabel='g of sugar', ylabel='Bottles of wine')
plt.tight_layout(pad=3)  # applied once the axes have content to lay out
plt.show()

In [None]:
# Distinct fixed-acidity values present in the dataset.
df["fixed acidity"].unique()

In [None]:
# The 40 most frequent fixed-acidity values.
A111=df['fixed acidity'].value_counts().head(40)
# seaborn orders numeric categories by value while value_counts() orders by
# frequency; sort by value and pass the order explicitly so the tick labels
# belong to the bars above them.
acidity_sorted = A111.sort_index()
plt.figure(figsize=(20,6))
plt.title("Wine acidity",fontsize=15)
a=sns.barplot(x=acidity_sorted.index, y=acidity_sorted.values, order=acidity_sorted.index)
a.set(xlabel='Acidity g/l', ylabel='Bottles of wine')
plt.tight_layout(pad=3)  # applied once the axes have content to lay out
plt.show()

In [None]:
# Distinct chloride values present in the dataset.
df["chlorides"].unique()

In [None]:
# The 20 most frequent chloride values.
A111=df['chlorides'].value_counts().head(20)
# seaborn orders numeric categories by value while value_counts() orders by
# frequency; sort by value and pass the order explicitly so the tick labels
# belong to the bars above them.
chlorides_sorted = A111.sort_index()
plt.figure(figsize=(25,6))
plt.title("Chlorides in wine ",fontsize=10)
a=sns.barplot(x=chlorides_sorted.index, y=chlorides_sorted.values, order=chlorides_sorted.index)
a.set(xlabel='Chlorides', ylabel='Bottles of wine')
plt.tight_layout(pad=3)  # applied once the axes have content to lay out
plt.show()

In [None]:
# Frame summary (columns, non-null counts, dtypes) for reference.
df.info()

In [None]:
# Distinct density values present in the dataset.
df["density"].unique()

In [None]:
# The 30 most frequent density values.
A111=df['density'].value_counts().head(30)
# seaborn orders numeric categories by value while value_counts() orders by
# frequency; sort by value and pass the order explicitly so the tick labels
# belong to the bars above them.
density_sorted = A111.sort_index()
plt.figure(figsize=(25,6))
plt.title("Wine density ",fontsize=10)
a=sns.barplot(x=density_sorted.index, y=density_sorted.values, order=density_sorted.index)
a.set(xlabel='Density', ylabel='Bottles of wine')
plt.tight_layout(pad=3)  # applied once the axes have content to lay out
plt.show()

In [None]:
# Distinct sulphate values present in the dataset.
df["sulphates"].unique()

In [None]:
# The 40 most frequent sulphate values.
A111=df['sulphates'].value_counts().head(40)
# seaborn orders numeric categories by value while value_counts() orders by
# frequency; sort by value and pass the order explicitly so the tick labels
# belong to the bars above them.
sulphates_sorted = A111.sort_index()
plt.figure(figsize=(25,6))
plt.title("Wine sulphates",fontsize=10)
a=sns.barplot(x=sulphates_sorted.index, y=sulphates_sorted.values, order=sulphates_sorted.index)
a.set(xlabel='Sulphates', ylabel='Bottles of wine')
plt.tight_layout(pad=3)  # applied once the axes have content to lay out
plt.show()

Machine Learning

In [None]:
# Positional hold-out split: the last 1000 rows are used for training and
# every row before them for testing. Both columns become (n, 1) arrays.
n_train_rows = 1000
citric_acid_col = df['citric acid'].values.reshape(-1, 1)
fixed_acidity_col = df["fixed acidity"].values.reshape(-1, 1)

# Feature: citric acid
X_train, X_test = citric_acid_col[-n_train_rows:], citric_acid_col[:-n_train_rows]
# Target: fixed acidity
y_train, y_test = fixed_acidity_col[-n_train_rows:], fixed_acidity_col[:-n_train_rows]


In [None]:
# Ordinary least squares: citric acid -> fixed acidity, evaluated on the test rows.
clf = LinearRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# y was a column vector, so coef_ is 2-D; [0][0] is the single slope.
slope = clf.coef_[0][0]
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)  # 1 would be a perfect prediction
print("Coefficients: ", slope)
print("Mean squared error: %.2f" % test_mse)
print("R squared: %.2f" % test_r2)

# Test points with the fitted line on top; tick marks are hidden.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color="black")
ax.plot(X_test, y_pred, color="blue", linewidth=3)
ax.set_xticks(())
ax.set_yticks(())

plt.show()

In [None]:
# Random 70/30 split of the raw columns; the four parts stay pandas Series.
feature_col, target_col = df['citric acid'], df["fixed acidity"]
X_train, X_test, y_train, y_test = train_test_split(
    feature_col,
    target_col,
    test_size=0.3,
    random_state=34,
)

In [None]:
# Ordinary least squares on the random split: citric acid -> fixed acidity.
# The Series are reshaped to (n, 1) columns because the estimator needs a 2-D
# feature matrix.
clf = LinearRegression().fit(X_train.values.reshape(-1, 1), y_train.values.reshape(-1, 1))
# Predictions for the held-out rows
y_pred = clf.predict(X_test.values.reshape(-1, 1))
# y was a column vector, so coef_ is 2-D; [0][0] is the single slope.
print("Coefficients: ", clf.coef_[0][0])
# Mean squared error on the test rows
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("R squared: %.2f" % r2_score(y_test, y_pred))

# Test points with the fitted line on top; tick marks are hidden.
plt.scatter(X_test, y_test, color="green")
plt.plot(X_test,y_pred, color="blue", linewidth=2)

plt.xticks(())
plt.yticks(())

# Render explicitly so the cell does not end on the repr of plt.yticks().
plt.show()

In [None]:
from sklearn.svm import LinearSVR
from sklearn.svm import SVR

In [None]:
# Fit a LinearSVR and an SVR (default kernel) on citric acid -> fixed acidity.
# The feature must be a 2-D column; the target must be 1-D, because these
# estimators warn about (and flatten) a column-vector y.
svr_features_train = X_train.values.reshape(-1, 1)
svr_target_train = y_train.values.ravel()
svm_linear_SVR = LinearSVR().fit(svr_features_train, svr_target_train)
svm_SVR = SVR().fit(svr_features_train, svr_target_train)

In [None]:
# Predict the held-out rows with both support-vector models.
svr_features_test = X_test.values.reshape(-1, 1)
y_pred_linear_SVR, y_pred_SVR = (
    fitted.predict(svr_features_test) for fitted in (svm_linear_SVR, svm_SVR)
)

In [None]:
# R^2 of each support-vector model on the test rows (1 is a perfect prediction).
from sklearn.metrics import mean_squared_error, r2_score
linear_svr_r2 = r2_score(y_test, y_pred_linear_SVR)
svr_r2 = r2_score(y_test, y_pred_SVR)
print("Linear SVR R squared: %.2f" % linear_svr_r2)
print("SVR R squared: %.2f" % svr_r2)

In [None]:
# Test points with both SVR fits on top.
# A line plot joins points in the order given; the test rows are in random
# order, so the curves are drawn through x sorted ascending (with predictions
# reordered the same way) instead of zig-zagging across the figure.
x_test_values = np.asarray(X_test).ravel()
x_order = np.argsort(x_test_values)

plt.scatter(x_test_values, np.asarray(y_test).ravel(), color="black")
plt.plot(x_test_values[x_order], np.asarray(y_pred_linear_SVR)[x_order], color="blue", linewidth=3)
plt.plot(x_test_values[x_order], np.asarray(y_pred_SVR)[x_order], color="red", linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
# Min-max scale two columns into [0, 1] on a copy of the frame, keeping the
# original columns alongside the new ones.
df_norm = df.copy()
for scaled_name, source_name in (("cit_acid", 'citric acid'), ("fix_acid", 'fixed acidity')):
    source_values = df_norm[source_name]
    lowest, highest = source_values.min(), source_values.max()
    df_norm[scaled_name] = (source_values - lowest) / (highest - lowest)

In [None]:
# Compare the min-max scaled citric acid column with the original values.
df_norm[["cit_acid","citric acid"]]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise feature and target for the SVR models.
# The scalers learn mean/std from the training rows only, so no information
# about the test rows leaks into the scaling. The row selection uses the same
# test_size/random_state as the split that is applied to X and y afterwards;
# for the same number of rows and the same seed train_test_split selects the
# same rows, so keep the two calls in sync.
raw_X = df['citric acid'].values.reshape(-1, 1)
raw_y = df['fixed acidity'].values.reshape(-1, 1)
scaler_fit_rows, _ = train_test_split(np.arange(len(df)), test_size=0.3, random_state=42)

sc_X = StandardScaler().fit(raw_X[scaler_fit_rows])
sc_y = StandardScaler().fit(raw_y[scaler_fit_rows])
# X and y still hold every row, scaled with the training statistics.
X = sc_X.transform(raw_X)
y = sc_y.transform(raw_y)

In [None]:
# Random 70/30 split of the standardized arrays X and y.
scaled_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = scaled_parts

In [None]:
# Refit both support-vector models on the standardized training arrays.
# The target is flattened to 1-D because these estimators warn about (and
# flatten) a column-vector y.
scaled_target_train = np.ravel(y_train)
svm_linear_SVR = LinearSVR().fit(X_train.reshape(-1, 1), scaled_target_train)
svm_SVR = SVR().fit(X_train, scaled_target_train)

In [None]:
# Predict the current test set with both support-vector models.
y_pred_linear_SVR, y_pred_SVR = (
    fitted.predict(X_test) for fitted in (svm_linear_SVR, svm_SVR)
)

In [None]:
from sklearn.model_selection import train_test_split
# Random 70/30 split of the raw (unscaled) columns as pandas Series.
# This rebinds X_train/X_test/y_train/y_test, replacing the arrays produced by
# the split of the standardized X and y.
feature_col, target_col = df['citric acid'], df["fixed acidity"]
X_train, X_test, y_train, y_test = train_test_split(
    feature_col,
    target_col,
    test_size=0.3,
    random_state=34,
)

In [None]:
# R^2 of both SVR models (1 is a perfect prediction).
# y_pred_linear_SVR / y_pred_SVR were computed on the test part of the
# standardized split (X, y, test_size=0.3, random_state=42), but y_test has
# since been rebound to raw values from a different split. Rebuild the matching
# target with the same arguments so predictions are scored against their own rows.
_, _, _, y_test_scaled = train_test_split(X, y, test_size=0.3, random_state=42)
y_true_scaled = np.ravel(y_test_scaled)
print("Linear SVR R squared: %.2f" % r2_score(y_true_scaled, y_pred_linear_SVR))
print("SVR R squared: %.2f" % r2_score(y_true_scaled, y_pred_SVR))

In [None]:
# x values and SVR predictions ordered by ascending x, for drawing the curve.
# The predictions belong to the test part of the standardized split
# (X, y, test_size=0.3, random_state=42), so that x is rebuilt here; the
# current X_test comes from a different split. Both arrays are reordered with
# one shared permutation, because sorting them independently would pair each x
# with some other row's prediction.
_, X_test_scaled, _, _ = train_test_split(X, y, test_size=0.3, random_state=42)
scaled_x = np.ravel(X_test_scaled)
ascending_x = np.argsort(scaled_x)

XtestSVR = scaled_x[ascending_x]
ypredSVR = np.asarray(y_pred_SVR)[ascending_x]

In [None]:
# Standardized test points with both SVR fits on top.
# The predictions were made on the test part of the standardized split
# (X, y, test_size=0.3, random_state=42); that set is rebuilt here so points and
# curves share the same rows and the same scale. Curves are drawn through x in
# ascending order, with predictions reordered by the same permutation.
_, X_test_scaled, _, y_test_scaled = train_test_split(X, y, test_size=0.3, random_state=42)
scaled_x = np.ravel(X_test_scaled)
ascending_x = np.argsort(scaled_x)

plt.scatter(scaled_x, np.ravel(y_test_scaled), color="black")
plt.plot(scaled_x[ascending_x], np.asarray(y_pred_linear_SVR)[ascending_x], color="blue", linewidth=3)
plt.plot(scaled_x[ascending_x], np.asarray(y_pred_SVR)[ascending_x], color="red", linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Three regression trees of increasing depth on the same single feature.
# The Series are reshaped to (n, 1) columns once and reused for every model.
tree_X_train = X_train.values.reshape(-1, 1)
tree_y_train = y_train.values.reshape(-1, 1)
tree_X_test = X_test.values.reshape(-1, 1)

regr_1 = DecisionTreeRegressor(max_depth=2).fit(tree_X_train, tree_y_train)
regr_2 = DecisionTreeRegressor(max_depth=5).fit(tree_X_train, tree_y_train)  # deeper trees overfit more easily
regr_3 = DecisionTreeRegressor(max_depth=8).fit(tree_X_train, tree_y_train)  # deeper trees overfit more easily

# Predictions for the held-out rows, in the original test order.
y_1, y_2, y_3 = (
    tree.predict(tree_X_test) for tree in (regr_1, regr_2, regr_3)
)

In [None]:
# Predictions ordered to match X_test.sort_values(), for drawing each tree's
# step function left to right.
# A tree's output is not monotonic in x, so sorting the predictions on their
# own would pair them with the wrong x values. Each model is instead evaluated
# on the sorted inputs, which also keeps this cell safe to re-run.
tree_X_sorted = X_test.sort_values().values.reshape(-1, 1)
y_1 = regr_1.predict(tree_X_sorted)
y_2 = regr_2.predict(tree_X_sorted)
y_3 = regr_3.predict(tree_X_sorted)

In [None]:
# Test data with the three decision-tree fits.
plt.figure()
# Scatter the rows as they are: sorting x and y separately would pair each x
# with an unrelated y and plot data that does not exist.
plt.scatter(X_test, y_test, s=20, edgecolor="black", c="darkorange", label="data")

# Draw each fit left to right by evaluating the model on x in ascending order,
# which guarantees every prediction is paired with its own x.
tree_x_sorted = X_test.sort_values()
tree_x_column = tree_x_sorted.values.reshape(-1, 1)
for tree_model, line_color, line_label in (
    (regr_1, "cornflowerblue", "max_depth=2"),
    (regr_2, "yellowgreen", "max_depth=5"),
    (regr_3, "red", "max_depth=8"),
):
    plt.plot(tree_x_sorted, tree_model.predict(tree_x_column), color=line_color, label=line_label, linewidth=2)

plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()

In [None]:
# Feature importances of the depth-2 tree (one entry, as it was fitted on a single feature).
regr_1.feature_importances_

In [None]:
# Test-set score (R^2 for scikit-learn regressors) of the shallowest tree.
regr_1.score(X_test.values.reshape(-1, 1), y_test.values.reshape(-1, 1)) # max_depth = 2

In [None]:
# Test-set score (R^2 for scikit-learn regressors) of the medium-depth tree.
regr_2.score(X_test.values.reshape(-1, 1), y_test.values.reshape(-1, 1)) # max_depth = 5

In [None]:
# Test-set score (R^2 for scikit-learn regressors) of the deepest tree.
regr_3.score(X_test.values.reshape(-1, 1), y_test.values.reshape(-1, 1)) # max_depth = 8