In [1]:
# Dataset source
# https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise

In [2]:
# Problem Statement: Predict the scaled sound pressure level of airfoils at various wind tunnel speeds and angles of attack

In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
import datetime

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [4]:
# Load the airfoil self-noise data into a DataFrame.
# The file is parsed as tab-separated with no header row, so the column
# names are supplied explicitly (the last one is the prediction target).

import pandas as pd

column_names = [
    'frequency',
    'angle_of_attack',
    'chord_length',
    'free_stream_velocity',
    'suction_side_thickness',
    'scaled_sound_level_dbs',
]
afsn_df = pd.read_csv(
    'airfoil_self_noise.dat',
    sep='\t',
    header=None,
    names=column_names,
)
print(afsn_df.shape)
afsn_df.head()

(1503, 6)


Unnamed: 0,frequency,angle_of_attack,chord_length,free_stream_velocity,suction_side_thickness,scaled_sound_level_dbs
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [5]:
# Seed NumPy's global random number generator so repeated runs of the
# notebook draw the same random numbers.

RANDOM_SEED = 2
np.random.seed(RANDOM_SEED)

In [6]:
# Separate the DataFrame into a feature matrix X (every column except the
# target) and a label vector y (the target column), both as NumPy arrays.

target_column = 'scaled_sound_level_dbs'
X = afsn_df.drop(columns=[target_column]).values
y = afsn_df[target_column].values
print("X shape: ", X.shape, "y shape: ", y.shape)
print("Sample X values: ", X[:5], "\n", "Sample y values: ", y[:5])

X shape:  (1503, 5) y shape:  (1503,)
Sample X values:  [[8.00000e+02 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [1.00000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [1.25000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [1.60000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]
 [2.00000e+03 0.00000e+00 3.04800e-01 7.13000e+01 2.66337e-03]] 
 Sample y values:  [126.201 125.201 125.951 127.591 127.461]


In [7]:
# Hold out 5% of the samples as a test set; random_state=2 makes the
# split repeatable across runs.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=2,
)

# Report the shapes of the four resulting arrays, one per line
print(
    " X_train shape: ", X_train.shape, "\n",
    "y_train shape: ", y_train.shape, "\n",
    "X_test shape: ", X_test.shape, "\n",
    "y_test shape: ", y_test.shape, "\n",
)

 X_train shape:  (1427, 5) 
 y_train shape:  (1427,) 
 X_test shape:  (76, 5) 
 y_test shape:  (76,) 



In [8]:
# Model 1: scikit-learn DecisionTreeRegressor
# Tree depth is capped at 10 and random_state is fixed at 2.

from sklearn.tree import DecisionTreeRegressor

dt_reg = DecisionTreeRegressor(random_state=2, max_depth=10)
# Fit on the training split; fit() is the last expression so the fitted
# estimator is displayed as the cell output.
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=10, random_state=2)

In [9]:
# Coefficient of determination (R^2) of the fitted tree on each split

train_r2 = dt_reg.score(X_train, y_train)
test_r2 = dt_reg.score(X_test, y_test)
print("Train set R^2 score: ", train_r2)
print("Test set R^2 score: ", test_r2)

Train set R^2 score:  0.9529943806438242
Test set R^2 score:  0.8324028465244899


In [10]:
# Mean squared error of the tree's predictions on each split

from sklearn.metrics import mean_squared_error

train_mse = mean_squared_error(y_train, dt_reg.predict(X_train))
test_mse = mean_squared_error(y_test, dt_reg.predict(X_test))
print("Train set mse: ", train_mse)
print("Test set mse: ", test_mse)

Train set mse:  2.229367651650768
Test set mse:  8.343921048615776


In [11]:
# Mean absolute error of the tree's predictions on each split

from sklearn.metrics import mean_absolute_error

train_mae = mean_absolute_error(y_train, dt_reg.predict(X_train))
test_mae = mean_absolute_error(y_test, dt_reg.predict(X_test))
print("Train set mae: ", train_mae)
print("Test set mae: ", test_mae)

Train set mae:  0.924139583369395
Test set mae:  2.1977629504267644


In [12]:
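# The Decision Tree Regressor with max depth 10 achieves a good R^2 score on the test set (about 0.83), so no more complex models will be developed.
# Note: the train R^2 (about 0.95) is noticeably higher than the test R^2, which indicates some overfitting.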