In [1]:
# we need imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [2]:
# we need to do some plots 
import matplotlib.pyplot as plt

In [3]:
# we also need some regression models and a function for generating K-Fold cross-validation splits
from sklearn.linear_model import LinearRegression,Ridge, Lasso, ElasticNet
from sklearn.model_selection import KFold

In [5]:
# read the concrete data set from its CSV file into a DataFrame; this is the
# real data we want to analyze
data = pd.read_csv(filepath_or_buffer='concrete.csv')

In [6]:
# show the loaded DataFrame so we can inspect its columns and typical values
data 

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [7]:
# build the model inputs as NumPy arrays: every column from 'cement' through
# 'age' is a feature, and 'strength' is the target
x = data.loc[:, 'cement':'age'].to_numpy()
y = data['strength'].to_numpy()

In [8]:
# think of the pipeline (the order of application) for our research process

In [9]:
# create the splitter object that will generate the K-Fold train/test indices
kf = KFold(
    n_splits=10,       # number of folds
    shuffle=True,      # shuffle the rows before they are split into folds
    random_state=123,  # fixed seed for the shuffling
)

In [10]:
# here I would like to create a pipeline that combines multiple procedures,
# such as first scaling and then generating polynomial features

In [11]:
# the two preprocessing steps used by the pipeline: a standard scaler and a
# polynomial feature expansion (degree=1 here)
scale, poly = StandardScaler(), PolynomialFeatures(degree=1)

In [12]:
# import pipeline
from sklearn.pipeline import Pipeline

In [13]:
# define your pipeline: scale first, then build the polynomial features.
# Each step is a (name, transformer) tuple, which is the form the Pipeline
# API documents for its `steps` argument.
pipe = Pipeline(steps=[('Scaler', scale), ('Polynomial Features', poly)])

In [15]:
# run the actual cross-validation: for every fold, refit the preprocessing
# pipeline and the regressor on the training part, then record the R2 score
# on both the training part and the held-out part
score_train = []
score_test = []
# to use plain least squares instead, swap in:
#model = LinearRegression(fit_intercept=False)
model = Ridge(alpha=0.01,fit_intercept=False)

for idxtrain, idxtest in kf.split(x):
  # slice features and targets with the indices of the current fold
  xtrain, xtest = x[idxtrain], x[idxtest]
  ytrain, ytest = y[idxtrain], y[idxtest]
  # the pipeline is fitted on the training part only and then re-used,
  # unchanged, to transform the held-out part
  xpolytrain = pipe.fit_transform(xtrain)
  xpolytest = pipe.transform(xtest)
  model.fit(xpolytrain, ytrain)
  score_train.append(model.score(xpolytrain, ytrain))
  score_test.append(model.score(xpolytest, ytest))

In [16]:
# for the linear model: average the per-fold R2 scores
# (internal = training folds, external = held-out folds)
print(f'The internal validation R2 score is : {np.mean(score_train)}')
print(f'The external validation R2 score is : {np.mean(score_test)}')

The internal validation R2 score is : 0.6159992959916887
The external validation R2 score is : 0.5958566080071821


In [15]:
# for the quadratic model: average the per-fold R2 scores
# NOTE(review): this cell only prints the current score lists; re-run the
# cross-validation cell with the quadratic features before reading the result
print(f'The internal validation R2 score is : {np.mean(score_train)}')
print(f'The external validation R2 score is : {np.mean(score_test)}')

The internal validation R2 score is : 0.6159992933217394
The external validation R2 score is : 0.5958577924284152


In [16]:
# for the cubic model: average the per-fold R2 scores
# NOTE(review): this cell only prints the current score lists; re-run the
# cross-validation cell with the cubic features before reading the result
print(f'The internal validation R2 score is : {np.mean(score_train)}')
print(f'The external validation R2 score is : {np.mean(score_test)}')

The internal validation R2 score is : 0.6159992933217394
The external validation R2 score is : 0.5958577924284152


In [17]:
# for the quartic model: average the per-fold R2 scores
# NOTE(review): this cell only prints the current score lists; re-run the
# cross-validation cell with the quartic features before reading the result
print(f'The internal validation R2 score is : {np.mean(score_train)}')
print(f'The external validation R2 score is : {np.mean(score_test)}')

The internal validation R2 score is : 0.6159992933217394
The external validation R2 score is : 0.5958577924284152


In [18]:
# for the linear model with Ridge penalty: average the per-fold R2 scores
# NOTE(review): this cell only prints the current score lists; make sure the
# cross-validation cell was last run with the Ridge model
print(f'The internal validation R2 score is : {np.mean(score_train)}')
print(f'The external validation R2 score is : {np.mean(score_test)}')

The internal validation R2 score is : 0.6159992933217394
The external validation R2 score is : 0.5958577924284152
