In [1]:
import numpy as np
import pandas as pd
from numpy import genfromtxt
import matplotlib.pyplot as plt
from sklearn import preprocessing

In [2]:
# load dataset
data = pd.read_csv('Ozone Level Detection.data',
                   sep=";|:|,",
                   header=None,
                   engine='python')

# change missing value to np.nan
data[data == '?'] = np.nan

#data.isnull().sum()
np_data = data.to_numpy()
np_data.shape
#data

(2536, 74)

## رفع مشکل داده های گم‌شده



برای رفع داده های گم شده از منبع زیر استفاده شده است.
https://www.analyticsvidhya.com/blog/2021/10/handling-missing-value/


Missing values are imputed using the k-Nearest Neighbors approach where a Euclidean distance is used to find the nearest neighbors.

من از نزدیک ترین همسایگی استفاده کردم. البته چون داده های مان مربوط به اوزون است و به احتمال زیاد داده های یک روز نزدیک به روز قبل است بنابراین می توان از مقادیر روز قبل و بعد نیز استفاده کرد.


In [3]:
X = data.iloc[:,1:74]

In [4]:
from sklearn.impute import KNNImputer
impute_knn = KNNImputer(n_neighbors=5)
FixedInputData = impute_knn.fit_transform(X)
FixedInputData = pd.DataFrame(FixedInputData)
#FixedInputData.insert(0,-1, np_data[:,0]) Adding date as property
FixedInputData = FixedInputData.to_numpy()



## implementing KFold



ورودی این تابع تعداد تقسیم، آرایه و اینکه رندوم بشود یا نه را می گیرد و خروجی آن آرایه ای از کلاس فولد می دهد که شامل داده ی آموزش و تست است. بدیهی است که تعداد این آرایه برابر تعداد تقسیم که در ورودی گرفته است، می باشد.

In [5]:
class Fold:
    def __init__(self,X_Train,Y_Train,X_Test,Y_Test):
        self.X_Train = X_Train
        self.Y_Train = Y_Train
        self.X_Test = X_Test
        self.Y_Test = Y_Test


def KFold(NumberOfSplits, InputData, Shuffle=False):
    if(Shuffle):
         np.random.shuffle(InputData)
    output = []

    SplitedArray = np.array_split(InputData, NumberOfSplits)
    for i in range(NumberOfSplits):
        test = SplitedArray[i]
        train = np.concatenate( SplitedArray[0:i] +  SplitedArray[i + 1 :], axis=0 )
        X_Train = preprocessing.normalize(train[:,:-1])
        Y_Train = train[:, -1]

        X_Test = preprocessing.normalize(test[:,:-1])
        Y_Test = test[:,-1]
        output.append(Fold(X_Train,Y_Train,X_Test,Y_Test))
    
    return output
 

    

In [6]:
kfold_Out = KFold(3,FixedInputData)
kfold_Out[0].X_Train.shape[0]

1690

## پیادهسازی مدل رگرسیون لاجیستیک

In [7]:
def sigmoid(x):
    return (1/(1 +  np.e ** (-x)))

In [8]:
def CrossEntropyLossFunction(M, Y_Hat, Y, Regularization,W):
    return (-1/M) * (Y.dot( np.log(Y_Hat).T) + (1-Y).dot( np.log(1 - Y_Hat).T)) + Regularization/2*np.sum(np.power(W, 2))


In [9]:
def CalculateYHat(W,X,b):
    return np.dot(W , np.transpose(X)) + b

In [10]:
def Accuracy(W,b,X,Y):
    Z = CalculateYHat(W,X,b)
    A = sigmoid(Z)
    Y_hat = np.round(A)
    return (Y_hat == Y).sum() / Y.shape[1]
    

In [11]:
def TrainAndTest(NumberOfFolds, LearningRate, NumberOfEpoch, Regularization):
    
    nr_split = NumberOfFolds
    kfold_Out = KFold(nr_split,FixedInputData,True)


    learningRate = LearningRate
    nr_epoch = NumberOfEpoch
    regularization = Regularization

    TestLossArray = np.zeros(nr_split)
    TestAccuracyArray = np.zeros(nr_split)
    TrainAccuracyArray = np.zeros(nr_split)
    
    for i in range (nr_split): #TODO
        # for each split do the training
        X = kfold_Out[i].X_Train
        Y = np.reshape( kfold_Out[i].Y_Train,(1, kfold_Out[i].Y_Train.shape[0]))
        #print("shape: ",X.shape)
        M = X.shape[0]
        W = np.zeros((1 ,X.shape[1])) # 1 * 72
        b = np.zeros((1,1)) # 1 * 1

        for j in range(nr_epoch): # traning for nr_epoch
                
            Z = CalculateYHat(W,X,b)  # W => 1 * 72 ||||| X => M * 72 
            
            A = sigmoid(Z) # Y_hat
            #print(A)
            
            
            # dL/dZ
            dZ = A - Y 
            
            
            # Calculate derivitives
            dW = (1/M) * (np.dot(X.T,dZ.T)) + regularization * (W.T) # dL/dW
            db = (1/M) * np.sum(dZ) # dL/db

            # update the weights and bias
            W = W - learningRate * np.transpose(dW)
            
            b = b - learningRate * np.transpose(db)

        # Calculate loss for current split
        X_test = kfold_Out[i].X_Test
        Y_test = np.reshape( kfold_Out[i].Y_Test,(1, kfold_Out[i].Y_Test.shape[0]))
        Z = CalculateYHat(W,X_test,b)
        A = sigmoid(Z)
        
        Y_hat = np.round(A)
        TestLossArray[i] = CrossEntropyLossFunction(M,A, Y_test,Regularization, W)
        TestAccuracyArray[i] =(Y_hat == Y_test).sum() / Y_test.shape[1]


        TrainAccuracyArray[i] = Accuracy(W,b,X,Y)
        #print((Y_hat == Y_test).sum() / Y_test.shape[1])

    print ("Average Of Train Accuracy: ", np.mean(TrainAccuracyArray))
    print ("Average Of Test Accuracy: ", np.mean(TestAccuracyArray))
    print("average of Loss: ", np.mean(TestLossArray))
    print("standard deviation of Loss: ", np.std(TestLossArray))
    

## ارزیابی مدل رگرسیون


In [12]:
Test_Vars = [[3,0.001, 500, 0.1],\
             [3,0.01, 300, 0.01],\
             [3,0.0001, 500, 0.6],\
             [3,0.001, 500, 1],\
             [3,0.001, 500, 10],\
             [3,0.001, 500, 0],\
             [6,0.001, 500, 0]]

for i in range(len(Test_Vars)):
    print()
    print("Number of folds: ",Test_Vars[i][0],"  Learning rate: ", Test_Vars[i][1],"   Number of Epochs: ", Test_Vars[i][2], "   Regularization: ",Test_Vars[i][3]  )
    TrainAndTest(Test_Vars[i][0], Test_Vars[i][1], Test_Vars[i][2], Test_Vars[i][3])
    print()


Number of folds:  3   Learning rate:  0.001    Number of Epochs:  500    Regularization:  0.1
Average Of Train Accuracy:  0.9712149364835532
Average Of Test Accuracy:  0.9712162118054097
average of Loss:  0.26202669220131364
standard deviation of Loss:  0.0006876800959503983


Number of folds:  3   Learning rate:  0.01    Number of Epochs:  300    Regularization:  0.01
Average Of Train Accuracy:  0.9712142366420672
Average Of Test Accuracy:  0.9712134140939378
average of Loss:  0.12106968693258004
standard deviation of Loss:  0.0019722958028713525


Number of folds:  3   Learning rate:  0.0001    Number of Epochs:  500    Regularization:  0.6
Average Of Train Accuracy:  0.9712155196847915
Average Of Test Accuracy:  0.9712185432316365
average of Loss:  0.3359824592045387
standard deviation of Loss:  0.00037001671946424125


Number of folds:  3   Learning rate:  0.001    Number of Epochs:  500    Regularization:  1
Average Of Train Accuracy:  0.9712155196847915
Average Of Test Accuracy:


همانطور که در بالا قابل مشاهده است، هایپرپارامتر های من تعداد فولد ها، ضریب یادگیری، تعداد ایپاک ها، و ضریب رگولاریزیشن است. متاسفانه به دلیل اینکه داده ها مناسب نمی باشند نمیتوان نتیجه ی متمایز کننده ای از تغییر این متغیر ها بدست آورد.
البته میانگین لاس برای حالت آخر از همه بهتر بوده

اورفیت زمانی اتفاق می افتد که دقت داده ترین بالا باشد ولی دقت داده تست پایین باشد و آندر فیت زمانی اتفاق می افتد که مدل ما ساده باشد و بنابراین دقت داده های ترین پایین باشد. در اینجا میبینیم که دقت داده های تست و ترین نزدیک به هم و بالا هستند بنابراین مدل ما نه آندر فیت است و نه اورفیت

## Neural  Netwrok

In [13]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

یکی از ورودی های کلاس فید فورواردمون آرایه ای است که تعداد آن نشان دهنده ی تعداد لایه ها و مقدار قرار گرفته در هر لایه نشان دهنده تعداد نورون ها در آن لایه است

In [14]:
class Feedforward(torch.nn.Module):
        def __init__(self, X, layers):
            super(Feedforward, self).__init__()
            self.num_layers = len(layers)
            num_feature = X.shape[1]
            self.fcs = nn.ModuleList()

            if(self.num_layers == 0):
                self.fcs.append(nn.Linear(num_feature,1)) # connect input to output   
            else:
                #First layer
                self.fcs.append(nn.Linear(num_feature,layers[0]))

                #Loop through all other layers except the last one
                #print(self.num_layers)
                for i in range(1,self.num_layers):
                    self.fcs.append(nn.Linear(layers[i-1],layers[i]))

                #Last layer
                
                self.fcs.append(nn.Linear(layers[-1],1))
            
        def forward(self, x):
            if(self.num_layers == 0):
                return torch.sigmoid(self.fcs[-1](x))
            else:

                # first layer
                hidden = self.fcs[0](x)
                relu_ = F.relu(hidden)

                # every other layers except the last one
                for i in range(1, self.num_layers):
                    
                    hidden = self.fcs[i](relu_)
                    relu_ = F.relu(hidden)

                # last layer usess sigmoid
                output = self.fcs[-1](relu_)
                output = torch.sigmoid(output)
                return output


In [15]:
def NueralTrainAndTest(NumberOfFolds, LearningRate, NumberOfEpoch,Layers):
    nr_split = NumberOfFolds
    kfold_Out = KFold(nr_split,FixedInputData,True)

    nr_epoch = NumberOfEpoch
    layers = Layers

    TestLossArray = np.zeros(nr_split)
    TestAccuracyArray = np.zeros(nr_split)
    TrainAccuracyArray = np.zeros(nr_split)


    for i in range (nr_split): #TODO
        # for each split do the training
        X = torch.from_numpy(kfold_Out[i].X_Train).float()
        Y = torch.from_numpy( kfold_Out[i].Y_Train).float()
        M = X.shape[0]

        model = Feedforward(X,layers)
        criterion = torch.nn.BCELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr = LearningRate)

        model.train()
        for epoch in range(nr_epoch):
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(X)
            # Compute Loss
            loss = criterion(y_pred.squeeze(), Y)

            # Backward pass
            loss.backward()
            optimizer.step()
            

        
        
        X_test = torch.from_numpy(kfold_Out[i].X_Test).float()
        Y_test = torch.from_numpy(kfold_Out[i].Y_Test).float()
        model.eval()
        y_pred = model(X_test)
        after_train = criterion(y_pred.squeeze(), Y_test) 
        Y_test = Y_test.detach().numpy()
        y_pred = np.round(y_pred.detach().numpy()[:,0])
        #print((y_pred == Y_test).sum() / Y_test.shape[0])

        TestLossArray[i] = after_train.item()
        TestAccuracyArray[i] = (y_pred == Y_test).sum() / Y_test.shape[0]


        #TrainAccuracyArray[i] = Accuracy(W,b,X,Y)
        
    #print ("Average Of Train Accuracy: ", np.mean(TrainAccuracyArray))
    print ("Average Of Test Accuracy: ", np.mean(TestAccuracyArray))
    print("average of Loss: ", np.mean(TestLossArray))
    print("standard deviation of Loss: ", np.std(TestLossArray))


## ارزیابی مدل شبکه عصبی

In [17]:
Test_Vars = [[3,0.01, 500, [2,3]],\
             [4,0.01, 500, [2,3]],\
             [5,0.01, 500, [2,3]],\
             [6,0.01, 500, [2,3]],\
             [7,0.01, 500, [2,3]]]

for i in range(len(Test_Vars)):
    print()
    print("Number of folds: ",Test_Vars[i][0],"  Learning rate: ", Test_Vars[i][1],"   Number of Epochs: ", Test_Vars[i][2], "   Layers: ",Test_Vars[i][3]  )
    NueralTrainAndTest(Test_Vars[i][0], Test_Vars[i][1], Test_Vars[i][2], Test_Vars[i][3])
    print()


Number of folds:  3   Learning rate:  0.01    Number of Epochs:  500    Layers:  [2, 3]
Average Of Test Accuracy:  0.9712138803791831
average of Loss:  0.19726609190305075
standard deviation of Loss:  0.06572045533373759


Number of folds:  4   Learning rate:  0.01    Number of Epochs:  500    Layers:  [2, 3]
Average Of Test Accuracy:  0.9712145110410095
average of Loss:  0.1499730609357357
standard deviation of Loss:  0.005558535432413567


Number of folds:  5   Learning rate:  0.01    Number of Epochs:  500    Layers:  [2, 3]
Average Of Test Accuracy:  0.971215580300983
average of Loss:  0.2376216620206833
standard deviation of Loss:  0.04527516095910024


Number of folds:  6   Learning rate:  0.01    Number of Epochs:  500    Layers:  [2, 3]
Average Of Test Accuracy:  0.9712147864310815
average of Loss:  0.19616582865516344
standard deviation of Loss:  0.055179416928565025


Number of folds:  7   Learning rate:  0.01    Number of Epochs:  500    Layers:  [2, 3]
Average Of Test Accu

آزمون بالا با تعداد فولد های متفاوت اتفاق افتاده است. متاسفانه به دلیل خوب نبودن داده ها تمایز خاصی دیده نمیشود. فقط در حالت دوم میانگین لاس از همه کمتر بوده است

In [18]:
Test_Vars = [[3,0.001, 500, []],\
             [3,0.01, 300, [2,3]],\
             [3,0.001, 500, [10,20,33]],\
             [3,0.001, 500,[120,3,20]],\
             [6,0.001, 500, [1]]]

for i in range(len(Test_Vars)):
    print()
    print("Number of folds: ",Test_Vars[i][0],"  Learning rate: ", Test_Vars[i][1],"   Number of Epochs: ", Test_Vars[i][2], "   Layers: ",Test_Vars[i][3]  )
    NueralTrainAndTest(Test_Vars[i][0], Test_Vars[i][1], Test_Vars[i][2], Test_Vars[i][3])
    print()


Number of folds:  3   Learning rate:  0.001    Number of Epochs:  500    Layers:  []
Average Of Test Accuracy:  0.9712143466644285
average of Loss:  0.5061007738113403
standard deviation of Loss:  0.016959712085407554


Number of folds:  3   Learning rate:  0.01    Number of Epochs:  300    Layers:  [2, 3]
Average Of Test Accuracy:  0.9712143466644285
average of Loss:  0.3275444010893504
standard deviation of Loss:  0.06732790700040253


Number of folds:  3   Learning rate:  0.001    Number of Epochs:  500    Layers:  [10, 20, 33]
Average Of Test Accuracy:  0.9712148129496739
average of Loss:  0.5772304137547811
standard deviation of Loss:  0.03435666092191804


Number of folds:  3   Learning rate:  0.001    Number of Epochs:  500    Layers:  [120, 3, 20]
Average Of Test Accuracy:  0.9712138803791831
average of Loss:  0.4645797610282898
standard deviation of Loss:  0.004656840521461322


Number of folds:  6   Learning rate:  0.001    Number of Epochs:  500    Layers:  [1]
Average Of T

در اینجا نیز همانند قسمت لاجیستیک تغییر هایپر پارامتر ها ما را به نتیجه ی متمایزی نمیرساند. هایپر پارامتر های ما در این تست تعداد فولد، ضریب یادگیری، تعداد ایپاک ها و تعداد لایه و نورون موجود در هر لایه است.


البته حالت آخر نتیجه ی خوبی را ارایه نداده ولی حالت های دیگر همگی نتیجه ی خوبی داشتند. مقدار لاس در حالت دوم از همه کمتر بود

دوباره در اینجا نیز از آنجایی که نتیجه ی داده تست و داده ترین نزدیک هم هستند و هردو دقت بالایی دارند بنابراین نه اورفیت اتفاق افتاده و نه آندر فیت