In [10]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
import pandas as pd
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression
from math import sqrt

In [4]:
af = pd.read_csv("dataset/steel.txt", delimiter='\t',encoding='latin1', na_values="n/a")
af.columns = ['normalising_temperature', 'tempering_temperature', 'sample_id', 'percent_silicon', 'percent_chromium', 'manufacture_year', 'percent_copper', 'percent_nickel', 'percent_sulphur', 'percent_carbon', 'percent_manganese', 'tensile_strength']
df = af.drop('sample_id', axis = 1)
df.head()

Unnamed: 0,normalising_temperature,tempering_temperature,percent_silicon,percent_chromium,manufacture_year,percent_copper,percent_nickel,percent_sulphur,percent_carbon,percent_manganese,tensile_strength
0,178.5,950,0.153,1.212726,2010,0.942,0.887,0.0,1.92,0.0,140.035334
1,178.5,375,0.153,1.621165,1992,0.942,0.887,0.0,1.92,0.0,42.21765
2,178.5,900,0.153,0.809989,1991,0.942,0.887,0.0,1.92,0.0,95.015309
3,189.525,900,0.1624,1.036229,2009,0.849,0.9382,0.0,2.035,0.0,113.266773
4,189.525,950,0.1624,0.935452,2004,0.849,0.9382,0.0,2.035,0.0,160.774427


In [5]:
scaler = StandardScaler() 
scaler.fit(df.drop('tensile_strength', axis = 1))
scaledfeatures = scaler.transform(df.drop('tensile_strength', axis = 1))
df_features = pd.DataFrame(scaledfeatures, columns = df.columns[:-1])
df_features.head()

Unnamed: 0,normalising_temperature,tempering_temperature,percent_silicon,percent_chromium,manufacture_year,percent_copper,percent_nickel,percent_sulphur,percent_carbon,percent_manganese
0,-1.855504,0.762359,1.108055,-0.251285,1.439492,2.318687,-1.155916,-1.073563,0.555803,-0.928403
1,-1.855504,-1.462678,1.108055,1.31993,-1.366628,2.318687,-1.155916,-1.073563,0.555803,-0.928403
2,-1.855504,0.568878,1.108055,-1.800567,-1.522523,2.318687,-1.155916,-1.073563,0.555803,-0.928403
3,-1.78737,0.568878,1.22473,-0.930247,1.283596,0.956205,-0.465711,-1.073563,1.176183,-0.928403
4,-1.78737,0.762359,1.22473,-1.317926,0.504119,0.956205,-0.465711,-1.073563,1.176183,-0.928403


In [4]:
# df_features = pd.DataFrame(df)
# df_features.head()

In [6]:
X = np.array(df_features) 
y = np.array(df['tensile_strength'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.33)

In [11]:
svr = SVR(gamma='auto')
svr.fit(X, y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [12]:
y_test_pred = svr.predict(X_test)
y_train_pred = svr.predict(X_train)

In [13]:
print("MSE train: {0:.4f}, test: {1:.4f}".\
      format(sqrt(mean_squared_error(y_train, y_train_pred)), 
             sqrt(mean_squared_error(y_test, y_test_pred))))
print("R^2 train: {0:.4f}, test: {1:.4f}".\
      format(r2_score(y_train, y_train_pred),
             r2_score(y_test, y_test_pred)))

MSE train: 77.5960, test: 83.2784
R^2 train: 0.2486, test: 0.2067


In [14]:
svr = SVR(kernel='poly', C=1e3, degree=2, gamma='auto')
svr.fit(X_train, y_train)

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='auto',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [13]:
y_train_pred = svr.predict(X_train)
y_test_pred = svr.predict(X_test)

In [15]:
print("MSE train: {0:.4f}, test: {1:.4f}".\
      format(sqrt(mean_squared_error(y_train, y_train_pred)), 
             sqrt(mean_squared_error(y_test, y_test_pred))))
print("R^2 train: {0:.4f}, test: {1:.4f}".\
      format(r2_score(y_train, y_train_pred),
             r2_score(y_test, y_test_pred)))

MSE train: 77.5960, test: 83.2784
R^2 train: 0.2486, test: 0.2067


In [16]:
svr = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr.fit(X_train, y_train)

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [17]:
y_train_pred = svr.predict(X_train)
y_test_pred = svr.predict(X_test)

In [19]:
print("MSE train: {0:.4f}, test: {1:.4f}".\
      format(sqrt(mean_squared_error(y_train, y_train_pred)), 
             sqrt(mean_squared_error(y_test, y_test_pred))))
print("R^2 train: {0:.4f}, test: {1:.4f}".\
      format(r2_score(y_train, y_train_pred),
             r2_score(y_test, y_test_pred)))

MSE train: 11.9971, test: 36.5521
R^2 train: 0.9820, test: 0.8472


In [20]:
X_new = SelectKBest(f_regression, k=8).fit_transform(X, y)

In [21]:
X_new.shape

(552, 8)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, random_state = 42, test_size = 0.33)

In [23]:
svr = SVR(gamma='auto')
svr.fit(X_new, y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [24]:
y_test_pred = svr.predict(X_test)
y_train_pred = svr.predict(X_train)

In [25]:
print("MSE train: {0:.4f}, test: {1:.4f}".\
      format(sqrt(mean_squared_error(y_train, y_train_pred)), 
             sqrt(mean_squared_error(y_test, y_test_pred))))
print("R^2 train: {0:.4f}, test: {1:.4f}".\
      format(r2_score(y_train, y_train_pred),
             r2_score(y_test, y_test_pred)))

MSE train: 74.9116, test: 81.2562
R^2 train: 0.2997, test: 0.2448


In [26]:
svr = SVR(kernel='poly', C=1e3, degree=2, gamma='auto')
svr.fit(X_train, y_train)

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=2, epsilon=0.1, gamma='auto',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [27]:
y_train_pred = svr.predict(X_train)
y_test_pred = svr.predict(X_test)

In [28]:
print("MSE train: {0:.4f}, test: {1:.4f}".\
      format(sqrt(mean_squared_error(y_train, y_train_pred)), 
             sqrt(mean_squared_error(y_test, y_test_pred))))
print("R^2 train: {0:.4f}, test: {1:.4f}".\
      format(r2_score(y_train, y_train_pred),
             r2_score(y_test, y_test_pred)))

MSE train: 60.4617, test: 74.2645
R^2 train: 0.5438, test: 0.3692


In [29]:
svr = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr.fit(X_train, y_train)

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [30]:
y_train_pred = svr.predict(X_train)
y_test_pred = svr.predict(X_test)

In [31]:
print("MSE train: {0:.4f}, test: {1:.4f}".\
      format(sqrt(mean_squared_error(y_train, y_train_pred)), 
             sqrt(mean_squared_error(y_test, y_test_pred))))
print("R^2 train: {0:.4f}, test: {1:.4f}".\
      format(r2_score(y_train, y_train_pred),
             r2_score(y_test, y_test_pred)))

MSE train: 17.6205, test: 35.0022
R^2 train: 0.9613, test: 0.8599


In [74]:
parameters = [{'kernel': ['rbf'], 'gamma': [0.1,0.01,0.001],'C': [1, 10, 100, 1000, 10000, 1e3, 1e4, 1e5, 1e6], 'degree':[1,2,3,4], 
              'epsilon':[0.1, 0.01,0.001]}]
print("Tuning hyper-parameters")
from sklearn.metrics import make_scorer
scorer = make_scorer(mean_squared_error, greater_is_better=False)
svr = GridSearchCV(SVR(), parameters, cv = 10,scoring=scorer)
#svr = GridSearchCV(SVR(epsilon = 0.01), parameters, cv = 10)

Tuning hyper-parameters


In [75]:
svr.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000, 10000, 1000.0, 10000.0,
                                100000.0, 1000000.0],
                          'degree': [1, 2, 3, 4], 'epsilon': [0.1, 0.01, 0.001],
                          'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(mean_squared_error, greater_is_better=False),
             verbose=0)

In [76]:
svr.best_estimator_

SVR(C=1000, cache_size=200, coef0=0.0, degree=1, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [77]:
model = SVR(C=1000, cache_size=200, coef0=0.0, degree=1, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
model.fit(X_train, y_train)

SVR(C=1000, cache_size=200, coef0=0.0, degree=1, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [78]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [79]:
print("MSE train: {0:.4f}, test: {1:.4f}".\
      format(sqrt(mean_squared_error(y_train, y_train_pred)), 
             sqrt(mean_squared_error(y_test, y_test_pred))))
print("R^2 train: {0:.4f}, test: {1:.4f}".\
      format(r2_score(y_train, y_train_pred),
             r2_score(y_test, y_test_pred)))

MSE train: 17.6205, test: 35.0022
R^2 train: 0.9613, test: 0.8599
