In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Read the cement slump measurements from the Colab sample-data folder,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/cement_slump.csv')
df.head(n=5)

Unnamed: 0,Cement,Slag,Fly ash,Water,SP,Coarse Aggr.,Fine Aggr.,SLUMP(cm),FLOW(cm),Compressive Strength (28-day)(Mpa)
0,273.0,82.0,105.0,210.0,9.0,904.0,680.0,23.0,62.0,34.99
1,163.0,149.0,191.0,180.0,12.0,843.0,746.0,0.0,20.0,41.14
2,162.0,148.0,191.0,179.0,16.0,840.0,743.0,1.0,20.0,41.81
3,162.0,148.0,190.0,179.0,19.0,838.0,741.0,3.0,21.5,42.08
4,154.0,112.0,144.0,220.0,10.0,923.0,658.0,20.0,64.0,26.82


In this dataset, the model should use all the other columns as features to predict Compressive Strength (28-day)(Mpa), which is a continuous target, so this is a regression task.

In [3]:
# Target: the 28-day compressive strength column.
y = df['Compressive Strength (28-day)(Mpa)']
# Features: every remaining column of the frame.
X = df.drop(columns=['Compressive Strength (28-day)(Mpa)'])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The shuffle is seeded so that the
# split, and therefore every metric computed from it, is reproducible on a
# fresh Restart & Run All instead of changing on each execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

When we use an SVM for a regression task, it looks closely at the features, and when an ML algorithm looks closely at the features, it's better to scale them. Scaling won't hurt at all and will almost always help.

In [5]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only and reuse those statistics for
# the test split, so nothing about the test data leaks into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
from sklearn.svm import SVR, LinearSVR

LinearSVR is a faster implementation of support vector regression, but the caveat is that it only supports a linear kernel; SVR supports other kernels as well. Compared with a classifier, SVR has one extra parameter, **epsilon**, which sets how much error is tolerated (without penalty) for each training instance. epsilon=0 means we tolerate 0 error, which can cause overfitting.

In [11]:
# Baseline: an SVR with scikit-learn's default hyperparameters, trained on
# the scaled training split. `fit` returns the estimator itself, so it can be
# chained; the bare name on the last line displays the fitted model.
model = SVR().fit(X_train, y_train)
model

In [12]:
# Baseline predictions for the held-out (scaled) test split.
predictions = model.predict(X=X_test)

In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Mean absolute error of the baseline model on the test split
# (expressed in the same units as the target).
mean_absolute_error(y_true=y_test, y_pred=predictions)

4.8763039753905515

In [14]:
# Root mean squared error of the baseline model on the test split.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))

6.029425110638286

Let's perform grid search.

In [16]:
# Search space for the SVR grid search.
# Note: `degree` only affects the 'poly' kernel and `gamma` is not used by the
# 'linear' kernel, so some combinations in this grid are redundant.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 0.5, 1],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
    degree=[2, 3, 4],
    epsilon=[0, 0.001, 0.1, 0.5, 1, 2],
)

In [17]:
from sklearn.model_selection import GridSearchCV

# Start from a fresh, unfitted SVR and cross-validate every combination in
# `param_grid` on the training split (default CV and scoring settings).
model = SVR()
grid_model = GridSearchCV(estimator=model, param_grid=param_grid)
grid_model.fit(X=X_train, y=y_train)

In [18]:
# Predictions from the grid search's selected model on the held-out test split
# (this overwrites the baseline `predictions`).
predictions = grid_model.predict(X=X_test)

In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Mean absolute error of the tuned model on the test split.
mean_absolute_error(y_true=y_test, y_pred=predictions)

2.315691167571919

In [20]:
# Root mean squared error of the tuned model on the test split.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))

3.0284846241971914

Woah! Good performance!

In [22]:
# Show the hyperparameter combination the grid search selected.
# NOTE(review): if a selected value sits at the edge of its list in
# `param_grid`, consider widening that range and re-running the search.
grid_model.best_params_

{'C': 1, 'degree': 2, 'epsilon': 2, 'gamma': 'scale', 'kernel': 'linear'}