In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from math import sqrt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
data = pd.read_csv('D:\ML\data\data.csv', delimiter=',')

In [3]:
y = data["quality"]
X = data.drop(["quality"], axis=1)

In [4]:
y

0       5.0
1       5.0
2       5.0
3       6.0
4       5.0
       ... 
5315    6.0
5316    5.0
5317    6.0
5318    7.0
5319    6.0
Name: quality, Length: 5320, dtype: float64

In [5]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,kind_red,kind_white
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,1.0,0.0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,1.0,0.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,1.0,0.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,1.0,0.0
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,0.0,1.0
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0.0,1.0
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,0.0,1.0
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,0.0,1.0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)

In [7]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3724, 13), (3724,), (1596, 13), (1596,))

In [8]:
model = LinearRegression().fit(X_train, y_train)

In [9]:
for i in zip(data.columns, model.coef_):
    print(i)

('fixed acidity', 0.09151225722245249)
('volatile acidity', -1.3555520352392887)
('citric acid', 0.1089106036981969)
('residual sugar', 0.05952088349664775)
('chlorides', -0.8273261927115626)
('free sulfur dioxide', 0.006056940093291607)
('total sulfur dioxide', -0.0014979662908620739)
('density', -119.16562556886215)
('pH', 0.7029643371960033)
('sulphates', 0.7217877055853993)
('alcohol', 0.19467573466724583)
('quality', 0.18154849312794308)
('kind_red', -0.18154849313315471)


In [10]:
y_pred = model.predict(X_test)
y_pred

array([5.41039835, 5.54063671, 5.19250413, ..., 6.31545475, 5.68675665,
       6.27825687])

In [11]:
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {model.score(X_test, y_test)}')

MAE: 0.5901124734420327
MSE: 0.5962812451979581
RMSE: 0.7721924923216738
MAPE: 0.32483063966778014
R^2: 0.30357939379198295


In [12]:
len(model.coef_)
model.coef_

array([ 9.15122572e-02, -1.35555204e+00,  1.08910604e-01,  5.95208835e-02,
       -8.27326193e-01,  6.05694009e-03, -1.49796629e-03, -1.19165626e+02,
        7.02964337e-01,  7.21787706e-01,  1.94675735e-01,  1.81548493e-01,
       -1.81548493e-01])

In [13]:
parameters = {'alpha': np.arange(0, 1, 0.1)}
ridge_optimal = GridSearchCV(Ridge(), parameters).fit(X_train, y_train)
ridge_optimal.best_params_

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


{'alpha': 0.0}

In [14]:
ridge = Ridge(alpha=0.0).fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {ridge.score(X_test, y_test)}')
ridge.coef_

MAE: 0.5900481183427319
MSE: 0.5965236016084676
RMSE: 0.772349403837711
MAPE: 0.3248365353417137
R^2: 0.3032963360911334


array([ 9.28558692e-02, -1.36235768e+00,  8.66572440e-02,  6.00037804e-02,
       -8.26476776e-01,  6.51211471e-03, -1.59514038e-03, -1.18798443e+02,
        7.04970236e-01,  7.23900008e-01,  1.94454709e-01, -8.34757155e+12,
       -8.34757155e+12])

In [15]:
parameters = {'alpha': np.arange(0, 1, 0.1)}
lasso_optimal = GridSearchCV(Lasso(), parameters).fit(X_train, y_train)
lasso_optimal.best_params_

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  self.best_estimator_.fit(X, y, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'alpha': 0.0}

In [16]:
lasso = Lasso(alpha=0.0).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print(f'MAE: {mean_absolute_error(y_test, y_pred)}')
print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'RMSE: {sqrt(mean_squared_error(y_test, y_pred))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y_test, y_pred))}')
print(f'R^2: {lasso.score(X_test, y_test)}')
lasso.coef_

  lasso = Lasso(alpha=0.0).fit(X_train, y_train)
  model = cd_fast.enet_coordinate_descent(


MAE: 0.5901124734420331
MSE: 0.5962812451979586
RMSE: 0.772192492321674
MAPE: 0.32483063966778003
R^2: 0.3035793937919824


  model = cd_fast.enet_coordinate_descent(


array([ 9.15122572e-02, -1.35555204e+00,  1.08910604e-01,  5.95208835e-02,
       -8.27326193e-01,  6.05694009e-03, -1.49796629e-03, -1.19165626e+02,
        7.02964337e-01,  7.21787706e-01,  1.94675735e-01,  3.63096986e-01,
        5.72292836e-14])

In [17]:
from sklearn.preprocessing import PolynomialFeatures
y = y_train
X = X_train
model_1 = PolynomialFeatures(5)
X_polynom = model_1.fit_transform(X)
model_2 = LinearRegression().fit(X_polynom, y)
y_pred_polynom = model_2.predict(X_polynom)
X_polynom

array([[1.  , 6.5 , 0.2 , ..., 0.  , 0.  , 1.  ],
       [1.  , 8.7 , 0.45, ..., 0.  , 0.  , 1.  ],
       [1.  , 6.9 , 0.23, ..., 0.  , 0.  , 1.  ],
       ...,
       [1.  , 5.7 , 0.28, ..., 0.  , 0.  , 1.  ],
       [1.  , 6.2 , 0.52, ..., 0.  , 0.  , 0.  ],
       [1.  , 6.6 , 0.25, ..., 0.  , 0.  , 1.  ]])

In [21]:
print(f'MAE: {mean_absolute_error(y, y_pred_polynom)}')
print(f'MSE: {mean_squared_error(y, y_pred_polynom)}')
print(f'RMSE: {sqrt(mean_squared_error(y, y_pred_polynom))}')
print(f'MAPE: {sqrt(mean_absolute_percentage_error(y, y_pred_polynom))}')
print(f'R^2: {model_2.score(X_polynom, y)}')
model_2.coef_

MAE: 0.28758292456458323
MSE: 0.1378080875677415
RMSE: 0.3712251171024685
MAPE: 0.22538010012001733
R^2: 0.8129368631018509


array([-3.04804522, -5.30999409,  3.04248893, ...,  0.        ,
        0.        , -1.75624701])

In [22]:
import sys
sys.path.append('D:\ML\myLib')
from Class1 import RidgeRegression
from metrics1 import R2

model_test = RidgeRegression()
model_test.fit(X_train, y_train)
y_pred = model_test.predict(X_test)

print(f'MSE: {mean_squared_error(y_test, y_pred)}')
print(f'R2: {R2(y_test, y_pred)}')

MSE: 0.5981637852164381
R2: 0.30138070035421527


In [21]:
import sys
sys.path.append('D:\ML\myLib')
import metrics1

print(f'MAE: {metrics1.MAE(y_test, y_pred)}')
print(f'MSE: {metrics1.MSE(y_test, y_pred)}')
print(f'RMSE: {metrics1.RMSE(y_test, y_pred)}')
print(f'MAPE: {metrics1.MAPE(y_test, y_pred)}')
print(f'R^2: {metrics1.R2(y_test, y_pred)}')

MAE: 0.5758200336687653
MSE: 0.5641509600867284
RMSE: 0.7510998336351355
MAPE: 0.10630421725322
R^2: 0.31490458487385653
