# A simple regression problem using a UCI repository dataset (Concrete Compressive Strength)

### Importing libraries and dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [1]:
# Load the Concrete Compressive Strength dataset through the `ucimlrepo`
# helper (snippet adapted from the dataset's page on the UCI website).
from ucimlrepo import fetch_ucirepo

# 165 is the repository id used for this dataset.
concrete_compressive_strength = fetch_ucirepo(id=165)

# Features and target, both exposed as pandas DataFrames.
X, y = (
    concrete_compressive_strength.data.features,
    concrete_compressive_strength.data.targets,
)

In [5]:
# Peek at the first two rows of the feature table.
X.head(2)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28


In [6]:
# Peek at the first two rows of the target table.
y.head(2)

Unnamed: 0,Concrete compressive strength
0,79.99
1,61.89


### According to the UCI website the dataset has no missing values, so the only preprocessing step needed is feature scaling.

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [33]:
from sklearn.preprocessing import StandardScaler

# One scaler for the features and one for the target. Each is fitted on the
# training split only and then applied to both splits, so no statistics from
# the test split are used during fitting.
sc_x = StandardScaler().fit(x_train)
sc_y = StandardScaler().fit(y_train)

x_train, x_test = sc_x.transform(x_train), sc_x.transform(x_test)

# NOTE: the target is standardised as well, so any metric computed from
# y_train / y_test after this cell is expressed in standardised units.
y_train, y_test = sc_y.transform(y_train), sc_y.transform(y_test)

### Training and testing multiple models

In [81]:
from sklearn.metrics import mean_absolute_error

# Collects one mean absolute error per model, keyed by model name, so the
# models can be compared at the end of the notebook.
mean_absolute_errors = dict()

#### Linear regression

In [90]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares on the scaled training data.
regressor = LinearRegression().fit(x_train, y_train)
y_pred = regressor.predict(x_test)

# Store the test-split MAE for the final comparison, then display it.
error = mean_absolute_error(y_test, y_pred)
mean_absolute_errors['Linear Regression'] = error
error

0.468109844485127

#### Polynomial regression

In [89]:
# Polynomial regression: expand the features into polynomial terms of
# degree 3, then fit a plain linear model on the expanded matrix.
# (LinearRegression is imported in the linear-regression cell.)
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree=3)
x_poly = poly_reg.fit_transform(x_train)

regressor = LinearRegression().fit(x_poly, y_train)

# The test features go through the same, already fitted, expansion.
y_pred = regressor.predict(poly_reg.transform(x_test))

# Store the test-split MAE for the final comparison, then display it.
error = mean_absolute_error(y_test, y_pred)
mean_absolute_errors['Polynomial Regression'] = error
error

0.2866942167963159

#### Support vector machines

In [88]:
from sklearn.svm import SVR

# Support vector regression with an RBF kernel.
regressor = SVR(kernel = 'rbf')

# SVR.fit expects a 1-D target of shape (n_samples,). After scaling, y_train
# is a single-column 2-D array, which made fit() emit a
# DataConversionWarning. Flatten it explicitly; the fitted model is unchanged.
regressor.fit(x_train, np.ravel(y_train))

y_pred = regressor.predict(x_test)

# Store the test-split MAE for the final comparison, then display it.
error = mean_absolute_error(y_test, y_pred)

mean_absolute_errors['SVM'] = error

error

  y = column_or_1d(y, warn=True)


0.2864562684533619

#### KNN

In [87]:
from sklearn.neighbors import KNeighborsRegressor

# Nearest-neighbour regression using the 2 closest training points;
# Minkowski distance with p=2 is the Euclidean distance.
regressor = KNeighborsRegressor(
    n_neighbors=2,
    metric='minkowski',
    p=2,
).fit(x_train, y_train)
y_pred = regressor.predict(x_test)

# Store the test-split MAE for the final comparison, then display it.
error = mean_absolute_error(y_test, y_pred)
mean_absolute_errors['KNN'] = error
error

0.3788846140455968

#### Random forest

In [86]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 10 trees; the fixed seed makes the fit reproducible.
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)

# fit() expects a 1-D target of shape (n_samples,) for a single output.
# After scaling, y_train is a single-column 2-D array, which made fit() emit
# a DataConversionWarning. Flatten it explicitly; the fitted model is unchanged.
regressor.fit(x_train, np.ravel(y_train))

y_pred = regressor.predict(x_test)

# Store the test-split MAE for the final comparison, then display it.
error = mean_absolute_error(y_test, y_pred)

mean_absolute_errors['Random forest'] = error

error

  return fit_method(estimator, *args, **kwargs)


0.20205370676653947

#### Let us evaluate

In [93]:
# Print every recorded MAE, each followed by a blank separator line.
for model_name, mae in mean_absolute_errors.items():
    print(model_name, ':', mae, end='\n\n')

Linear Regression : 0.468109844485127

Polynomial Regression : 0.2866942167963159

SVM : 0.2864562684533619

Random forest : 0.20205370676653947

KNN : 0.3788846140455968



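#### Random forest is the best model for this problem (lowest MAE, about 0.202), followed by SVM (about 0.2865), with polynomial regression a tiny bit behind (about 0.2867). Note that these errors are measured on the standardized target.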