In [31]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression , Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PolynomialFeatures
# from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline

In [32]:
# fetch dataset 
concrete_compressive_strength = fetch_ucirepo(id=165) 

In [33]:
# data (as pandas dataframes) 
X = concrete_compressive_strength.data.features 
y = concrete_compressive_strength.data.targets 

In [34]:
lr = LinearRegression()
lr.fit(X,y)

In [35]:
lr.intercept_

array([-23.33121358])

In [36]:
lr.coef_

array([[ 0.11980433,  0.10386581,  0.08793432, -0.14991842,  0.2922246 ,
         0.01808621,  0.02019035,  0.11422207]])

In [37]:
ridge = Ridge()
ridge.fit(X,y)

In [38]:
ridge.intercept_

array([-23.32957301])

In [39]:
ridge.coef_

array([[ 0.11980439,  0.10386586,  0.08793476, -0.14992243,  0.2922038 ,
         0.01808557,  0.0201901 ,  0.11422205]])

### Train Test Split using Linear Regression

In [40]:
X_train , X_test , y_train , y_test = train_test_split(X,y,random_state=24,test_size=0.3)

In [41]:
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
r2_score(y_test,y_pred)

0.5771752777048791

### Train Test Split using ridge

In [42]:
X_train , X_test , y_train , y_test = train_test_split(X,y,random_state=24,test_size=0.3)

In [43]:
ridge.fit(X_train,y_train)
y_pred = ridge.predict(X_test)
r2_score(y_test,y_pred)

0.5771749099675626

### Train Test Split using ridge for degree 3

In [44]:
poly = PolynomialFeatures(degree=3,include_bias=False).set_output(transform='pandas')
X_poly_trn = poly.fit_transform(X_train)
X_poly_tst = poly.transform(X_test)
ridge.fit(X_poly_trn,y_train)
y_pred = ridge.predict(X_poly_tst)
r2_score(y_test,y_pred)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


0.8695065063191985

### Filtered the extracted features based on coefficient values

In [45]:
df_coef = pd.DataFrame({'col_names':list(X_poly_trn.columns),'coef':list(ridge.coef_[0])})

In [46]:
df_coef.shape

(164, 2)

In [47]:
df_coef[df_coef['coef']>0.0001]

Unnamed: 0,col_names,coef
1,Blast Furnace Slag,0.636144
5,Coarse Aggregate,2.612272
6,Fine Aggregate,0.909094
7,Age,0.123618
8,Cement^2,0.010445
9,Cement Blast Furnace Slag,0.031224
10,Cement Fly Ash,0.008666
14,Cement Fine Aggregate,0.023399
16,Blast Furnace Slag^2,0.004927
17,Blast Furnace Slag Fly Ash,0.015132


#### Considering different values of alpha

In [52]:
# Without alpha it is a linear regression
ridge = Ridge(alpha=0.02)
ridge.fit(X_train,y_train)
y_pred = ridge.predict(X_test)
r2_score(y_test,y_pred)

0.577175270349799

In [53]:
ridge = Ridge(alpha=0.01)
ridge.fit(X_train,y_train)
y_pred = ridge.predict(X_test)
r2_score(y_test,y_pred)

0.5771752740273375

In [54]:
ridge = Ridge(alpha=0.03)
ridge.fit(X_train,y_train)
y_pred = ridge.predict(X_test)
r2_score(y_test,y_pred)

0.5771752666722643

In [55]:
ridge = Ridge(alpha=0.22)
ridge.fit(X_train,y_train)
y_pred = ridge.predict(X_test)
r2_score(y_test,y_pred)

0.5771751967997472

#### Tuning for alpha = [0.01,0.1,0.3,0.6,1,1.5,2,4,10]

In [85]:
alphas = [0.01,0.1,0.3,0.6,1,1.5,2,4,10]
scores = []

In [86]:
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train,y_train)
    y_pred = ridge.predict(X_test)
    scores.append(r2_score(y_test,y_pred))

In [87]:
print(scores)

[0.5771752740273375, 0.5771752409296147, 0.5771751673801079, 0.5771750570584018, 0.5771749099675626, 0.5771747261116779, 0.5771745422643093, 0.5771738069600134, 0.5771716018651836]


In [88]:
np.max(scores)

0.5771752740273375

In [89]:
i_max = np.argmax(scores)
print(i_max)
print("Best alpha: ",alphas[i_max])
print("Best Score",scores[i_max])

0
Best alpha:  0.01
Best Score 0.5771752740273375


In [93]:
alphas = np.linspace(0.0001,10,20)
scores = []
for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(X_train,y_train)
    y_pred = ridge.predict(X_test)
    scores.append(r2_score(y_test,y_pred))
print(scores)
np.max(scores)
i_max = np.argmax(scores)
print(i_max)
print("Best alpha: ",alphas[i_max])
print("Best Score",scores[i_max])

[0.5771752776681036, 0.5771750841198391, 0.5771748905810096, 0.5771746970516156, 0.577174503531658, 0.5771743100211375, 0.5771741165200551, 0.5771739230284114, 0.5771737295462072, 0.5771735360734433, 0.5771733426101202, 0.5771731491562393, 0.5771729557118006, 0.577172762276805, 0.5771725688512538, 0.577172375435147, 0.577172182028486, 0.5771719886312712, 0.5771717952435036, 0.5771716018651836]
0
Best alpha:  0.0001
Best Score 0.5771752776681036
