In [1]:
# Datset source
# https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise

In [2]:
# Problem Statement: Predict the scaled sound pressure level of airfoils at various wind tunnel speeds and angles of attack

In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import time
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

In [4]:
# Read the dataset

import pandas as pd
afsn_df = pd.read_csv('airfoil_self_noise.dat', sep='\t', header=None,
                 names=['frequency','angle_of_attack','chord_length','free_stream_velocity','suction_side_thickness','scaled_sound_level_dbs'])
print(afsn_df.shape)
afsn_df.head()

(1503, 6)


Unnamed: 0,frequency,angle_of_attack,chord_length,free_stream_velocity,suction_side_thickness,scaled_sound_level_dbs
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [5]:
# To make this notebook's output identical at every run

np.random.seed(2)

In [6]:
# Split the dataframe into features and labels

X = afsn_df.drop(['scaled_sound_level_dbs'], axis=1).values
y = afsn_df.loc[:, 'scaled_sound_level_dbs'].values
print("X shape: ", X.shape, "y shape: ", y.shape)
print("Sample X values: ", X[:5], "\n", "Sample y values: ", y[:5])

X shape:  (1503, 5) y shape:  (1503,)
Sample X values:  [[8.00000e+02 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [1.00000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [1.25000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [1.60000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [2.00000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]] 
 Sample y values:  [126.201 125.201 125.951 127.591 127.461]


In [7]:
# Split the dataset into train and test sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=2)

print(" X_train shape: ", X_train.shape,"\n", "y_train shape: ", y_train.shape,"\n",
        "X_test shape: ", X_test.shape,"\n", "y_test shape: ", y_test.shape,"\n")

 X_train shape:  (1427, 5) 
 y_train shape:  (1427,) 
 X_test shape:  (76, 5) 
 y_test shape:  (76,) 



In [8]:
start_time = time.time()

xgbr = XGBRegressor(objective='reg:squarederror')

param_grid = {
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'min_child_weight': np.arange(0.0001, 0.5, 0.001),
        'gamma': np.arange(0.0,40.0,0.005),
        'learning_rate': np.arange(0.0005,0.3,0.0005),
        'subsample': np.arange(0.01,1.0,0.01),
        'colsample_bylevel': np.round(np.arange(0.1,1.0,0.01)),
        'colsample_bytree': np.arange(0.1,1.0,0.01)
}

kfold = KFold(n_splits=10, shuffle=True, random_state=2)
xgbr_search = RandomizedSearchCV(xgbr, param_grid, n_iter = 100, n_jobs=10, cv=kfold, verbose=1, random_state=2)
xgbr_result = xgbr_search.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (xgbr_result.best_score_, xgbr_result.best_params_))
print("Execution time in seconds: ", time.time()-start_time)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best: 0.944991 using {'subsample': 0.63, 'min_child_weight': 0.1361, 'max_depth': 12, 'learning_rate': 0.2965, 'gamma': 3.685, 'colsample_bytree': 0.9799999999999995, 'colsample_bylevel': 1.0}
Execution time in seconds:  13.023132562637329


In [9]:
# R^2 values for train and test sets

print("Train set R^2 score: ", xgbr_search.best_estimator_.score(X_train, y_train))
print("Test set R^2 score: ", xgbr_search.best_estimator_.score(X_test, y_test))

Train set R^2 score:  0.9900027021716362
Test set R^2 score:  0.9268948829982065
