## **_Importing Libraries_**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Location of the red-wine quality CSV (Colab upload directory).
DATA_PATH = '/content/datasets_4458_8204_winequality-red.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
dataset.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [None]:
# Distinct values taken by the target column.
quality_scores = dataset['quality']
quality_scores.unique()

array([5, 6, 7, 4, 8, 3])

In [None]:
# Correlation of every column with the target, strongest positive first.
quality_corr = dataset.corr()['quality']
quality_corr.sort_values(ascending=False)

quality                 1.000000
alcohol                 0.476166
sulphates               0.251397
citric acid             0.226373
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64

## **_Separating Features and Target_**

In [None]:
# Feature matrix: every column except the last one.
feature_frame = dataset.iloc[:, :-1]
# Target vector: the last column of the frame.
target_series = dataset.iloc[:, -1]
x, y = feature_frame.to_numpy(), target_series.to_numpy()

## **_Split to Train and Test Data_**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# identical on every run, so the scores computed later can be reproduced after
# a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## **_Standardising Data_**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Turn the 1-D target arrays into single-column 2-D arrays.
t_train = y_train.reshape(-1, 1)
t_test = y_test.reshape(-1, 1)

In [None]:
# Two separate scaler objects, named for the features (x_sc) and the target (y_sc).
x_sc, y_sc = StandardScaler(), StandardScaler()

In [None]:
x_std_train = x_sc.fit_transform(x_train)
y_std_train = x_sc.fit_transform(t_train)

In [None]:
x_std_test = x_sc.transform(x_test)
y_std_test = x_sc.transform(t_test)

## **_Train on Different Algorithms_**

We train five regression models in total:

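* Random forest regression: the `RandomForestRegressor` class from `sklearn.ensemble`.
* Multiple linear regression: the `LinearRegression` class from `sklearn.linear_model`.
* Decision tree regression: the `DecisionTreeRegressor` class from `sklearn.tree`.
* Support vector regression: the `SVR` class from `sklearn.svm`.
* Polynomial regression: the `PolynomialFeatures` class from `sklearn.preprocessing`, combined with `LinearRegression`.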

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
# One estimator per approach compared in this notebook.
m_reg = LinearRegression()   # multiple linear regression on the original features
p_reg = LinearRegression()   # linear model fitted later on the polynomial features
# The tree-based models are randomized; a fixed random_state makes their scores
# reproducible from one run to the next.
d_reg = DecisionTreeRegressor(random_state=42)
r_reg = RandomForestRegressor(n_estimators=500, random_state=42)
s_reg = SVR()                # support vector regression with default settings

In [None]:
# Expand the training features with polynomial terms up to degree 2; only the
# transformed array is kept under the name x_poly.
x_poly = PolynomialFeatures(degree=2).fit_transform(x_train)

In [None]:
# Train every model on the training split. The unscaled x_train / y_train
# arrays are used here, not the standardized copies.
m_reg.fit(x_train, y_train)
p_reg.fit(x_poly, y_train)  # same target, polynomial feature matrix
d_reg.fit(x_train, y_train)
r_reg.fit(x_train, y_train)
# Last expression of the cell: its return value is what the notebook displays.
s_reg.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:
# Degree-2 polynomial expansion of the test features, used as input to p_reg.
# NOTE(review): a fresh transformer is fitted on x_test here; presumably the
# output columns depend only on the number of input features, so they line up
# with x_poly — confirm.
temp = PolynomialFeatures(degree=2).fit_transform(x_test)

In [None]:
# Predictions on the held-out features for the models trained on x_train.
m_pred, d_pred, r_pred, s_pred = (
    reg.predict(x_test) for reg in (m_reg, d_reg, r_reg, s_reg)
)
# The polynomial model needs the expanded test features instead.
p_pred = p_reg.predict(temp)

## **_Result_**

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R^2 on the test split for each model, in the order:
# linear, polynomial, decision tree, random forest, SVR.
m, p, d, r, s = (
    r2_score(y_test, pred)
    for pred in (m_pred, p_pred, d_pred, r_pred, s_pred)
)

In [None]:
# Scores in the order: linear, polynomial, decision tree, random forest, SVR.
scores = (m, p, d, r, s)
print(*scores)

0.4072028494905534 0.3891135182494534 0.2484131349318427 0.5528399125923502 0.19722683495512128


**_Conclusion_**  

Comparison of the regression models:

* Random forest achieves the highest R² score on the test set among the five models.
* We can therefore consider random forest the best of the algorithms tried for this dataset.