In [1]:
# Dataset source
# https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise

In [2]:
# Problem Statement: Predict the scaled sound pressure level of airfoils at various wind tunnel speeds and angles of attack

In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5), "This notebook requires Python 3.5 or newer"

# Scikit-Learn ≥0.20 is required.
# Compare the numeric (major, minor) components instead of the raw version
# string: string comparison is lexicographic, so "0.9" >= "0.20" would pass
# even though 0.9 is older than 0.20.
import re
import sklearn
_sklearn_version = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert _sklearn_version >= (0, 20), "This notebook requires scikit-learn 0.20 or newer"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [4]:
# Load the tab-separated data file. It is read with header=None, so the
# column names are supplied explicitly.

import pandas as pd
column_names = [
    'frequency',
    'angle_of_attack',
    'chord_length',
    'free_stream_velocity',
    'suction_side_thickness',
    'scaled_sound_level_dbs',
]
afsn_df = pd.read_csv('airfoil_self_noise.dat', sep='\t', header=None, names=column_names)
print(afsn_df.shape)
afsn_df.head()

(1503, 6)


Unnamed: 0,frequency,angle_of_attack,chord_length,free_stream_velocity,suction_side_thickness,scaled_sound_level_dbs
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [5]:
# Seed NumPy's global random number generator so repeated runs give the same output

np.random.seed(seed=2)

In [6]:
# Separate the feature columns from the target column, both as NumPy arrays

target_column = 'scaled_sound_level_dbs'
X = afsn_df.drop(columns=[target_column]).values
y = afsn_df[target_column].values
print("X shape: ", X.shape, "y shape: ", y.shape)
print("Sample X values: ", X[:5], "\n", "Sample y values: ", y[:5])

X shape:  (1503, 5) y shape:  (1503,)
Sample X values:  [[8.00000e+02 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [1.00000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [1.25000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [1.60000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [2.00000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]] 
 Sample y values:  [126.201 125.201 125.951 127.591 127.461]


In [7]:
# Hold out 5% of the samples as a test set; random_state is fixed so the split is repeatable

from sklearn.model_selection import train_test_split
split_arrays = train_test_split(X, y, random_state=2, test_size=0.05)
X_train, X_test, y_train, y_test = split_arrays

print(
    " X_train shape: ", X_train.shape, "\n",
    "y_train shape: ", y_train.shape, "\n",
    "X_test shape: ", X_test.shape, "\n",
    "y_test shape: ", y_test.shape, "\n",
)

 X_train shape:  (1427, 5) 
 y_train shape:  (1427,) 
 X_test shape:  (76, 5) 
 y_test shape:  (76,) 



In [8]:
# Standardize the features: fit the scaler on the training set only, then
# apply the same fitted transform to both the training and test sets

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Model 1
# Baseline: scikit-learn LinearSVR with only random_state set, all other parameters left at defaults

from sklearn.svm import LinearSVR
lin_svr = LinearSVR(random_state=2).fit(X_train_scaled, y_train)
lin_svr

LinearSVR(random_state=2)

In [10]:
# R^2 of the linear model on the train and test sets

lin_train_r2 = lin_svr.score(X_train_scaled, y_train)
lin_test_r2 = lin_svr.score(X_test_scaled, y_test)
print("Train set R^2 score: ", lin_train_r2)
print("Test set R^2 score: ", lin_test_r2)

Train set R^2 score:  0.48567750973311197
Test set R^2 score:  0.6137735821843726


In [11]:
# Mean squared error of the linear model on the train and test sets

from sklearn.metrics import mean_squared_error
lin_train_pred = lin_svr.predict(X_train_scaled)
lin_test_pred = lin_svr.predict(X_test_scaled)
print("Train set mse: ", mean_squared_error(y_train, lin_train_pred))
print("Test set mse: ", mean_squared_error(y_test, lin_test_pred))

Train set mse:  24.393124439638314
Test set mse:  19.2285052002043


In [12]:
# Mean absolute error of the linear model on the train and test sets

from sklearn.metrics import mean_absolute_error
lin_train_mae = mean_absolute_error(y_train, lin_svr.predict(X_train_scaled))
lin_test_mae = mean_absolute_error(y_test, lin_svr.predict(X_test_scaled))
print("Train set mae: ", lin_train_mae)
print("Test set mae: ", lin_test_mae)

Train set mae:  3.7023926821890822
Test set mae:  3.3649034711503267


In [13]:
# We will try to increase the R^2 score by using nonlinear kernels

In [14]:
# Model 2
# Sklearn SVR model with rbf kernel, tuned with a randomized hyperparameter search

from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# gamma is sampled from a reciprocal (log-uniform) distribution over [0.001, 1.0].
# NOTE: scipy's uniform(loc, scale) samples from [loc, loc + scale], so C is
# drawn from [1, 11], not [1, 10].
param_distributions = {"gamma": reciprocal(0.001, 1.0), "C": uniform(1, 10)}
# n_jobs=-1 uses every available core instead of a hard-coded worker count,
# consistent with the polynomial-kernel search further down.
rbf_rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=30, n_jobs=-1, verbose=10, cv=3, random_state=2)
rbf_rnd_search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


RandomizedSearchCV(cv=3, estimator=SVR(), n_iter=30, n_jobs=6,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe0ef4c0fd0>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe0ef4c0cd0>},
                   random_state=2, verbose=10)

In [15]:
# Show the SVR configuration the RBF randomized search selected as best
rbf_rnd_search_cv.best_estimator_

SVR(C=10.938520114212729, gamma=0.8160954191881514)

In [16]:
# R^2 of the best RBF-kernel model on the train and test sets

best_rbf_svr = rbf_rnd_search_cv.best_estimator_
print("Train set R^2 score: ", best_rbf_svr.score(X_train_scaled, y_train))
print("Test set R^2 score: ", best_rbf_svr.score(X_test_scaled, y_test))

Train set R^2 score:  0.8958179979402592
Test set R^2 score:  0.845125971846538


In [17]:
# Mean squared error of the best RBF-kernel model on the train and test sets

rbf_train_pred = rbf_rnd_search_cv.best_estimator_.predict(X_train_scaled)
rbf_test_pred = rbf_rnd_search_cv.best_estimator_.predict(X_test_scaled)
print("Train set mse: ", mean_squared_error(y_train, rbf_train_pred))
print("Test set mse: ", mean_squared_error(y_test, rbf_test_pred))

Train set mse:  4.9411110513467715
Test set mse:  7.710492908714065


In [18]:
# Mean absolute error of the best RBF-kernel model on the train and test sets

from sklearn.metrics import mean_absolute_error
rbf_model = rbf_rnd_search_cv.best_estimator_
rbf_train_mae = mean_absolute_error(y_train, rbf_model.predict(X_train_scaled))
rbf_test_mae = mean_absolute_error(y_test, rbf_model.predict(X_test_scaled))
print("Train set mae: ", rbf_train_mae)
print("Test set mae: ", rbf_test_mae)

Train set mae:  1.3568760878776225
Test set mae:  1.814675185862608


In [19]:
# Model 3
# Degree-3 polynomial-kernel SVR (coef0=1), tuned with a randomized search over gamma and C

from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

# NOTE: scipy's uniform(loc, scale) samples from [loc, loc + scale], i.e. C is in [1, 11]
poly_param_distributions = {
    "gamma": reciprocal(0.001, 1.0),
    "C": uniform(1, 10),
}
poly_svr = SVR(kernel='poly', degree=3, coef0=1)
poly_rnd_search_cv = RandomizedSearchCV(
    estimator=poly_svr,
    param_distributions=poly_param_distributions,
    n_iter=30,
    n_jobs=-1,
    verbose=5,
    cv=3,
    random_state=2,
)
poly_rnd_search_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


RandomizedSearchCV(cv=3, estimator=SVR(coef0=1, kernel='poly'), n_iter=30,
                   n_jobs=-1,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe0ef845d50>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe0ef8459d0>},
                   random_state=2, verbose=5)

In [20]:
# Show the SVR configuration the polynomial-kernel randomized search selected as best
poly_rnd_search_cv.best_estimator_

SVR(C=2.844398656469153, coef0=1, gamma=0.22698933025637727, kernel='poly')

In [21]:
# R^2 of the best polynomial-kernel model on the train and test sets

best_poly_svr = poly_rnd_search_cv.best_estimator_
print("Train set R^2 score: ", best_poly_svr.score(X_train_scaled, y_train))
print("Test set R^2 score: ", best_poly_svr.score(X_test_scaled, y_test))

Train set R^2 score:  0.7363365357927671
Test set R^2 score:  0.713315782443209


In [22]:
# Mean squared error of the best polynomial-kernel model on the train and test sets

poly_train_pred = poly_rnd_search_cv.best_estimator_.predict(X_train_scaled)
poly_test_pred = poly_rnd_search_cv.best_estimator_.predict(X_test_scaled)
print("Train set mse: ", mean_squared_error(y_train, poly_train_pred))
print("Test set mse: ", mean_squared_error(y_test, poly_test_pred))

Train set mse:  12.50494741004955
Test set mse:  14.272739289260011


In [23]:
# Mean absolute error of the best polynomial-kernel model on the train and test sets

from sklearn.metrics import mean_absolute_error
poly_model = poly_rnd_search_cv.best_estimator_
poly_train_mae = mean_absolute_error(y_train, poly_model.predict(X_train_scaled))
poly_test_mae = mean_absolute_error(y_test, poly_model.predict(X_test_scaled))
print("Train set mae: ", poly_train_mae)
print("Test set mae: ", poly_test_mae)

Train set mae:  2.5421754063224133
Test set mae:  2.792188086733045


In [24]:
# It turns out that the RBF-kernel model performs better than both the linear SVR and the polynomial-kernel SVR for the hyperparameter ranges searched here