''' 
ABOUT THE DATASET:

NASA data set, obtained from a series of aerodynamic and acoustic tests of two and three-dimensional airfoil blade 
sections conducted in an anechoic wind tunnel. The data set comprises different size NACA 0012 airfoils at various wind 
tunnel speeds and angles of attack. The span of the airfoil and the observer position were the same in all of the experiments. 
The data set has 6 attributes as given below. 
Now we will read a dataset for Machine Learning using pandas
filename = "airfoil_self_noise.dat.txt"
This is tab separated file with no headers
Use appropriate command to read this data file and store the data into dataframe.
After that convert it to numpy
Then use numpy to split the data into training and test set.
The training data should have 80% of data, and the test data should ahve 20% data.

Details of Data set:
Attribute Information:

This problem has the following inputs:
(Attributes 1 to 5 form X_data) 
1. Frequency, in Hertzs. 
2. Angle of attack, in degrees. 
3. Chord length, in meters. 
4. Free-stream velocity, in meters per second. 
5. Suction side displacement thickness, in meters. 

The only output is (and Y_data): 
6. Scaled sound pressure level, in decibels. 
'''

HERE,AIM IS TO UNDERSTAND HOW TO WRITE A NATIVE CODE OF LINEAR REGRESSION IN PYTHON.

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import math

In [3]:
'''
Read and normalise the data
'''
def readdata(filename, cols, delim = "\t"):
    df = pd.read_csv(filename,sep='\t',header=(0),names=cols)
    print('===================Original Data==============================================')
    print(df.head())
    x=df.drop(["ScaledSoundPressureLevel"],axis=1)
    
    y=df["ScaledSoundPressureLevel"]
    print(",................X--NEW..........................")
    print(x.head())
    print(",............................................")
    
    
    #scale the data
    scaler = MinMaxScaler() 
    scaled_values = scaler.fit_transform(x) 
   
    print('==========================After scaling=======================================')
    print(scaled_values) # it is a numpy array, alternatively df.to_numpy()
    X_data=scaled_values  #taking X all cols exept target col
    Y_data=y.to_numpy()
    
    
    num_training = int(0.8 * X_data.shape[0]) # 80% of the X data, taking the index 
    
    #x_data_train_1=0.80*X_data

# split the actual data
    x_data_train, x_data_test = X_data[:num_training], X_data[num_training:]
    y_data_train, y_data_test = Y_data[:num_training], Y_data[num_training:]
    print('==========================Train Test Split=======================================')
    print('x_data_train',x_data_train.shape)
    print('y_data_train',y_data_train.shape)
    print('x_data_test',x_data_test.shape)
    print('y_data_test',y_data_test.shape)
    print('=====================')
    
    return x_data_train, y_data_train, x_data_test, y_data_test



In [4]:
cols = ['Frequency', 'AngleOfAttack', 'ChordLength', 'FreeStreamVelocity', 'SuctionSideDisplacementThickness', 'ScaledSoundPressureLevel']
x_data_train, y_data_train, x_data_test, y_data_test = readdata('airfoil_self_noise.dat.txt', cols)

   Frequency  AngleOfAttack  ChordLength  FreeStreamVelocity  \
0       1000            0.0       0.3048                71.3   
1       1250            0.0       0.3048                71.3   
2       1600            0.0       0.3048                71.3   
3       2000            0.0       0.3048                71.3   
4       2500            0.0       0.3048                71.3   

   SuctionSideDisplacementThickness  ScaledSoundPressureLevel  
0                          0.002663                   125.201  
1                          0.002663                   125.951  
2                          0.002663                   127.591  
3                          0.002663                   127.461  
4                          0.002663                   125.571  
,................X--NEW..........................
   Frequency  AngleOfAttack  ChordLength  FreeStreamVelocity  \
0       1000            0.0       0.3048                71.3   
1       1250            0.0       0.3048             

In [5]:
'''
Write a function to train the data
''' 
def train( x_data_train, y_data_train, l_rate, iterations):
    w=np.zeros(5)
    b =np.zeros(1)
    m=len(x_data_train)
    print(x_data_train.shape)
    for i in range(iterations):
        
        y_train_pred = np.dot(x_data_train,w)+b
        #print(y_train_pred.shape)
        
        #("Calculating cost")
        cost=np.sum(np.power((y_train_pred-y_data_train),2))
        dw=(1/m)*np.dot(x_data_train.T,(y_train_pred-y_data_train))
        db=(1/m)*np.sum(y_train_pred-y_data_train)
        
        #Updating Weights
        
        w=w-l_rate*dw
 
        #Updating Bias
    
        b=b-l_rate*db 
    
        
    return w, b,y_train_pred

In [10]:
import time #to check time it is taking to train for 1000 iterations
t1=time.time()

w, b,y_train_pred=train( x_data_train, y_data_train, l_rate=0.01, iterations=1000)
t2=time.time()
print(t2-t1,"ms")

(1201, 5)
0.13463878631591797 ms


In [11]:
'''
function for prediction
'''
def classify(y_data_test,x_data_test, W, b):
    print(w.shape)
    print(b.shape)
    print(x_data_test.shape)
    y_pred_test = np.dot(w,x_data_test.T)+b 
    return y_pred_test

In [13]:
y_pred_test=classify(y_data_test,x_data_test, w, b)

(5,)
(1,)
(301, 5)


In [16]:
#TRAIN ACCURACY
def LinRegaccuracyTrain(y_data_train, y_train_pred):

    
    total_error = 0
    for i in range(0, len(y_data_train)):
        errSet= abs((y_train_pred[i] - y_data_train[i]) / y_data_train[i])
        total_error += errSet
    total_error = (total_error / len(y_data_train))
    accuracyScore = 1 - total_error
    return accuracyScore

In [17]:
#test Accuracy
LinRegaccuracyTrain(y_data_train, y_train_pred)

0.9189641251926304

In [18]:
def NumpyAcc(y_data_train, y_train_pred): #commomn function to find accuracy
    acc=1-np.sum(np.abs(y_data_train-y_train_pred)/y_data_train)/len(y_data_train)
    return acc

In [19]:
NumpyAcc(y_data_train, y_train_pred)

0.9189641251926304

In [20]:
NumpyAcc(y_data_test, y_pred_test)

0.910061572870106

In [77]:
print("MSE train: ", mean_squared_error(y_data_train, y_train_pred))
print("MSE test: ", mean_squared_error(y_data_test, y_pred_test))

print("RMSE train: ", math.sqrt(mean_squared_error(y_data_train, y_train_pred)))
print("RMSE test: ", math.sqrt(mean_squared_error(y_data_test, y_pred_test)))


MSE train:  34.908859724038535
MSE test:  42.90191663778914
RMSE train:  5.908372002848038
RMSE test:  6.5499554683821435
