## The dataset used here can be found at http://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
# Load the data; 'strength' is the target, every other column is a feature.
demo = pd.read_csv("concrete.csv")
y = demo['strength']
X = demo.drop(columns='strength')

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.20, random_state=10
)

# Baseline: plain linear regression on the untransformed features.
lr = LinearRegression().fit(Xtrain, ytrain)

# Report R2 on the rows the model was fit on, then on the held-out rows.
for split_name, features, target in (
    ("Training R2", Xtrain, ytrain),
    ("Testing R2", Xtest, ytest),
):
    print(split_name)
    print(lr.score(features, target))

Training R2
0.6374354064424276
Testing R2
0.5022375995887227


### To reduce the dependence of the score on one particular train/test split (the choice of random_state), we use cross_val_score

In [3]:
# 10-fold cross-validation of the baseline model, using the training split
# only; scoresdt holds one score per fold.
scoresdt = cross_val_score(estimator=lr, X=Xtrain, y=ytrain, cv=10)

In [4]:
# Display the individual fold scores computed by cross_val_score.
scoresdt

array([0.52571613, 0.71904056, 0.62260097, 0.64177363, 0.71449256,
       0.70070694, 0.58346583, 0.52009653, 0.65254622, 0.50414244])

In [5]:
# Mean of the fold scores: the typical score to expect on unseen data
# (a low mean points towards underfitting, i.e. high bias).
cv_mean = np.mean(scoresdt)
# Standard deviation of the fold scores: how much the score moves with the
# particular split (a large spread points towards high variance).
cv_std = np.std(scoresdt)
print(cv_mean)
print(cv_std)

0.618458179800325
0.07771466904055849


# Creating Polynomial Features and using PCA to improve the performance of the model

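## Polynomial features let a linear model capture non-linear relationships between the features and the target
## And PCA replaces the correlated polynomial terms with uncorrelated components, which removes multicollinearity and discards low-variance noise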

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

# Reload and re-split the data so this cell can be run on its own; the seed
# matches the baseline split, so the scores stay comparable.
demo = pd.read_csv("concrete.csv")
X = demo.drop('strength', axis=1)
y = demo['strength']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.20, random_state=10)

# Preprocessing chain: standardise -> degree-3 polynomial expansion -> PCA.
sc = StandardScaler()
poly = PolynomialFeatures(degree=3)
# PCA's default svd_solver='auto' can select the randomized solver for a
# matrix of this size, which is stochastic; seeding it keeps the components
# (and therefore the scores below) identical from run to run.
pca = PCA(n_components=128, random_state=10)

# Each transformer is fit on the training split only and then applied,
# unchanged, to the test split.
scaledXtrain = sc.fit_transform(Xtrain)
scaledXtest = sc.transform(Xtest)
polyscaledXtrain = poly.fit_transform(scaledXtrain)
polyscaledXtest = poly.transform(scaledXtest)
pcaXtrain = pca.fit_transform(polyscaledXtrain)
pcaXtest = pca.transform(polyscaledXtest)

# NOTE: the transformers above were fit on the whole training split before
# cross-validation, so every validation fold has already influenced the
# scaling and the PCA basis. The two transformed scores below are therefore
# slightly optimistic; cross-validating a Pipeline that contains these steps
# refits them inside each fold and avoids the leak.
lr = LinearRegression()
scores = cross_val_score(lr, Xtrain, ytrain, cv=10)             # raw features
scoresdt = cross_val_score(lr, polyscaledXtrain, ytrain, cv=10)  # scaled + polynomial
lrpca = LinearRegression()
scoresdtpca = cross_val_score(lrpca, pcaXtrain, ytrain, cv=10)   # scaled + polynomial + PCA

In [15]:
# Print the mean cross-validated score of each feature set, in the order the
# transformations were applied.
for label, fold_scores in (
    ('Score Before Extracting polynomial features', scores),
    ("Score After Extracting polynomial features", scoresdt),
    ("Score After PCA", scoresdtpca),
):
    print(label)
    print(np.mean(fold_scores))

Score Before Extracting polynomial features
0.618458179800325
Score After Extracting polynomial features
0.8332261105080546
Score After PCA
0.8514289844107281


## Here we can see how much the performance of a simple linear regression model improves just by adding polynomial features and PCA

## Creating a pipeline for the same

In [23]:
#With Pipeline

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
# Reload and re-split the data so this cell can be run on its own; the seed
# matches the earlier splits, so the scores stay comparable.
demo = pd.read_csv("concrete.csv")
X = demo.drop('strength', axis=1)
y = demo['strength']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.20, random_state=10)

# Chain preprocessing and model into one estimator. cross_val_score fits a
# fresh copy of the whole pipeline on each fold's training part, so the
# scaler, polynomial expansion and PCA never see the validation fold.
pipe = Pipeline([  # steps is documented as a list of (name, estimator) tuples
    ("scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=3)),
    # Seeded: the default solver choice for this matrix size can be the
    # randomized SVD, which would otherwise vary between runs.
    ("pca", PCA(n_components=128, random_state=10)),
    # Named after the estimator it holds (previously mislabelled "rf").
    ("lr", LinearRegression()),
])
scoresdtpipe = cross_val_score(pipe, Xtrain, ytrain, cv=10)
print("After PCA Avg Cross Val R2")
print(np.mean(scoresdtpipe))

After PCA Avg Cross Val R2
0.8516050336241344


In [24]:
# Fit the full pipeline on the training split, then score it once on the
# held-out test split, which was not used for fitting or cross-validation.
print("Final model R2 on test data after fitting only on train data")
test_r2 = pipe.fit(Xtrain, ytrain).score(Xtest, ytest)
print(test_r2)

Final model R2 on test data after fitting only on train data
0.8915097013812546


## Here we used only linear regression to show how these techniques improve the performance

## The model performance may improve further if we move to more complex models