In [79]:
#Estimate Coefficients for linear regression 
def StochaticGradientDescentL(Dataset,LearningRate,iterations):
    """Fit linear-regression coefficients with stochastic gradient descent.

    Every row of ``Dataset`` is read as feature values followed by the target
    in the last position.  The coefficients are updated after each row, and
    the summed squared error of each pass is printed.

    Args:
        Dataset: indexable collection of rows (features..., target).
        LearningRate: step size applied to every per-row update.
        iterations: number of full passes over ``Dataset``.

    Returns:
        A list ``[intercept, w_1, ..., w_n]`` with one weight per feature.
    """
    feature_count = len(Dataset[0]) - 1
    Coeff = [0.0] * (feature_count + 1)
    for epoch in range(iterations):
        squared_error_total = 0
        for sample in Dataset:
            # Residual is measured with the coefficients as they stand
            # before this row's update.
            residual = Predict(sample, Coeff) - sample[-1]
            squared_error_total += residual**2
            step = LearningRate * residual
            Coeff[0] = Coeff[0] - step
            for idx in range(feature_count):
                Coeff[idx + 1] = Coeff[idx + 1] - step * sample[idx]
        print('Iteration:=%d, Learning rate=%.3f, Cost function=%.3f' % (epoch, LearningRate, squared_error_total))
    return Coeff
    

In [29]:
#Estimate coefficients for ridge regression
def StochaticGradientDescentR(Dataset,LearningRate,iterations,lamda):
    """Fit ridge-regression coefficients with stochastic gradient descent.

    Every row of ``Dataset`` is read as feature values followed by the target
    in the last position.  Each per-row update adds ``lamda * coefficient`` to
    the gradient of every coefficient, the intercept included.  After each
    pass the printed cost is the summed squared error plus ``lamda`` times the
    sum of squared coefficients.

    Args:
        Dataset: indexable collection of rows (features..., target).
        LearningRate: step size applied to every per-row update.
        iterations: number of full passes over ``Dataset``.
        lamda: regularisation strength.

    Returns:
        A list ``[intercept, w_1, ..., w_n]`` with one weight per feature.
    """
    feature_count = len(Dataset[0]) - 1
    Coeff = [0.0] * (feature_count + 1)
    for epoch in range(iterations):
        cost = 0
        for sample in Dataset:
            residual = Predict(sample, Coeff) - sample[-1]
            cost += residual**2

            # The intercept is shrunk together with the feature weights.
            Coeff[0] = Coeff[0] - LearningRate * (residual + lamda * Coeff[0])
            for idx in range(feature_count):
                gradient = residual * sample[idx] + lamda * Coeff[idx + 1]
                Coeff[idx + 1] = Coeff[idx + 1] - LearningRate * gradient
        # Penalty term, evaluated on the coefficients at the end of the pass.
        for weight in Coeff:
            cost += lamda * weight**2
        print('Iteration:=%d, Learning rate=%.3f, Cost function=%.3f' % (epoch, LearningRate, cost))
    return Coeff

In [1]:
#Linear Regression With Stochastic Gradient Descent
from random import seed
from random import randrange
from csv import reader
import numpy as np
from math import sqrt
 
#Read from file
def loadFile(filename):
    """Load a CSV file into a 2-D float array.

    Empty lines are skipped, the categorical labels ``"YES"``, ``"NO"`` and
    ``"MDT"`` are replaced by the codes 1, 2 and 3, and the first non-empty
    row (the header) is dropped before the remaining cells are converted to
    floats.

    Args:
        filename: path of the CSV file to read.

    Returns:
        ``numpy.ndarray`` of dtype ``float`` with one row per data record.

    Raises:
        ValueError: if a remaining cell cannot be parsed as a float.
    """
    label_codes = {"NO": 2, "YES": 1, "MDT": 3}

    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation requires for files passed to reader().
    with open(filename, 'r', newline='') as file:
        rows = [row for row in reader(file) if row]

    dataset = np.array(rows)
    for label, code in label_codes.items():
        np.place(dataset, dataset == label, code)

    # Drop the header row.
    dataset = np.delete(dataset, 0, 0)
    # The `np.float` alias was removed in NumPy 1.24; the builtin `float`
    # selects the same float64 dtype on every NumPy version.
    return dataset.astype(float)

def GetMinAndMax(dataset):
    """Return ``[min, max]`` for every column of a 2-D array.

    Args:
        dataset: array supporting ``dataset[:, i]`` column indexing.

    Returns:
        A list with one ``[min, max]`` pair per column, in column order.
    """
    column_count = len(dataset[0])
    columns = (dataset[:, idx] for idx in range(column_count))
    return [[min(values), max(values)] for values in columns]
 
#Normalize every column of the data set to the range 0-1
def normalize_dataset(dataset, MinMaxList):
    """Rescale every column of ``dataset`` to the range 0-1, in place.

    Each value becomes ``(value - min) / (max - min)`` using the pair stored
    for its column in ``MinMaxList``.  A constant column (``max == min``) has
    no spread to rescale, so its values are set to 0.0 instead of dividing
    by zero.

    Args:
        dataset: mutable rows of numeric values; modified in place.
        MinMaxList: one ``[min, max]`` pair per column.

    Returns:
        The same ``dataset`` object, after normalisation.
    """
    for row in dataset:
        for col in range(len(row)):
            low, high = MinMaxList[col][0], MinMaxList[col][1]
            span = high - low
            if span == 0:
                # Constant column: avoid ZeroDivisionError / NaN.
                row[col] = 0.0
            else:
                row[col] = (row[col] - low) / span
    return dataset

#Output prediction
def Predict(rowInput,Coefficients):
    """Evaluate the linear model for one row.

    ``Coefficients[0]`` is the intercept; ``Coefficients[i]`` multiplies
    ``rowInput[i - 1]``.  Entries of ``rowInput`` beyond the number of
    weights are not read.

    Returns:
        The intercept plus the weighted sum of the row's leading values.
    """
    output = Coefficients[0]
    for position, weight in enumerate(Coefficients[1:]):
        output += weight * rowInput[position]
    return output
 
# Estimate linear regression coefficients using stochastic gradient descent
def StochaticGradientDescentL(Dataset,LearningRate,iterations,*,verbose=False):
    """Fit linear-regression coefficients with stochastic gradient descent.

    Every row of ``Dataset`` is read as feature values followed by the target
    in the last position.  The coefficients are updated after each row.

    Args:
        Dataset: indexable collection of rows (features..., target).
        LearningRate: step size applied to every per-row update.
        iterations: number of full passes over ``Dataset``.
        verbose: when true, print the summed squared error after each pass.
            Defaults to False, which matches the previous silent behaviour
            (the log line used to be commented out).

    Returns:
        A list ``[intercept, w_1, ..., w_n]`` with one weight per feature.
    """
    feature_count = len(Dataset[0]) - 1
    Coeff = [0.0] * (feature_count + 1)
    for epoch in range(iterations):
        cost = 0
        for sample in Dataset:
            # Residual is measured with the coefficients as they stand
            # before this row's update.
            residual = Predict(sample, Coeff) - sample[-1]
            cost += residual**2
            step = LearningRate * residual
            Coeff[0] = Coeff[0] - step
            for idx in range(feature_count):
                Coeff[idx + 1] = Coeff[idx + 1] - step * sample[idx]
        if verbose:
            print('Iteration:=%d, Learning rate=%.3f, Cost function=%.3f' % (epoch, LearningRate, cost))
    return Coeff

#Train the model on the training data, then use the fitted coefficients to predict the test data
def linearRegressionTesting(train, test, learning_rate, iterations):
    """Fit on ``train`` and return class predictions for ``test``.

    The coefficients come from ``StochaticGradientDescentL``; the raw
    predictions for every test row are then passed through
    ``regressionToClassification``.

    Returns:
        A list with one class value per row of ``test``.
    """
    weights = StochaticGradientDescentL(train, learning_rate, iterations)
    raw_outputs = [Predict(sample, weights) for sample in test]
    return regressionToClassification(raw_outputs)
 
def regressionToClassification(predictions):
    """Snap continuous predictions to the class values 0, 0.5 and 1, in place.

    Values below 0.25 become 0, values in ``[0.25, 0.75)`` become 0.5, and
    everything else becomes 1.  The lower boundary is inclusive: the earlier
    strict ``> 0.25`` test let a prediction of exactly 0.25 fall through to
    the ``else`` branch and be classified as 1.

    Args:
        predictions: mutable sequence of numeric predictions; modified in
            place.

    Returns:
        The same ``predictions`` object, holding the class values.
    """
    for i, value in enumerate(predictions):
        if value < 0.25:
            predictions[i] = 0
        elif value < 0.75:
            predictions[i] = 0.5
        else:
            predictions[i] = 1
    return predictions
    

def CalculateAccuracy(predictions,realdata):
    """Return the percentage of rows whose last value equals its prediction.

    Args:
        predictions: sequence of predicted class values, aligned with
            ``realdata`` by index.
        realdata: sequence of rows whose last element is the true class.

    Returns:
        Accuracy as a float between 0 and 100.  An empty ``realdata`` yields
        0.0 rather than dividing by zero.
    """
    total = len(realdata)
    if total == 0:
        return 0.0
    correct = sum(
        1 for i, row in enumerate(realdata) if row[-1] == predictions[i]
    )
    return (correct / total) * 100
            
# Read the data set and preprocess it
filename = 'Train.csv'
dataset = loadFile(filename)

# Normalize the data set. normalize_dataset rewrites the rows of `dataset`
# in place and returns the same object, so `Normalized_dataset` and `dataset`
# refer to one array at this point.
minmax = GetMinAndMax(dataset)
Normalized_dataset=normalize_dataset(dataset, minmax)

# Hold-out evaluation: the first 500 rows train the model, the remaining
# rows are predicted and scored.
learningRate = 0.01
Iterations = 250
predictions=linearRegressionTesting(Normalized_dataset[:500,:],Normalized_dataset[500:,:],learningRate,Iterations)
# NOTE(review): this reload rebinds `dataset` to a fresh, un-normalized copy
# of the file; none of the remaining statements in this cell read it.
# Confirm whether later cells depend on it before removing.
filename = 'Train.csv'
dataset = loadFile(filename)

# Score the predictions against the last column of the held-out rows.
Accuracy=CalculateAccuracy(predictions,Normalized_dataset[500:,:])
print(Accuracy)

94.6236559139785


In [2]:
def cross_validation(data_array,k_folds,iterations,LearningRate):
    """Estimate classification accuracy with shuffled k-fold cross-validation.

    The rows are shuffled, split into ``k_folds`` nearly equal folds, and each
    fold in turn is held out while a model is fitted on the others with
    ``StochaticGradientDescentL``.  The running list of fold accuracies is
    printed after every fold.

    Changes from the earlier version:
      * ``iterations`` and ``LearningRate`` are now passed to the fit; they
        were ignored in favour of hard-coded 250 / 0.01.
      * The shuffle works on a copy, so the caller's array is no longer
        reordered in place.
      * The training folds are joined by list slicing.  ``np.delete`` on a
        list of folds with unequal lengths has to build a ragged array,
        which raises ValueError on NumPy >= 1.24.

    Args:
        data_array: 2-D ``numpy.ndarray``; each row holds the features
            followed by the class in the last column.
        k_folds: number of folds; must be at least 2.
        iterations: number of passes for each fit.
        LearningRate: step size for each fit.

    Returns:
        The mean of the per-fold accuracies (percent).

    Raises:
        ValueError: if ``k_folds`` is smaller than 2, since no training data
            would remain.
    """
    if k_folds < 2:
        raise ValueError("k_folds must be at least 2")

    # Fancy indexing returns a new array, leaving data_array untouched.
    shuffled = data_array[np.random.permutation(data_array.shape[0])]
    folds = np.array_split(shuffled, k_folds, axis=0)

    AccuracyList = []
    for i, x_test in enumerate(folds):
        x_train = np.concatenate(folds[:i] + folds[i + 1:], axis=0)

        Coeff = StochaticGradientDescentL(x_train, LearningRate, iterations)
        pred_test = [Predict(row, Coeff) for row in x_test]

        PredictionsTestSet = regressionToClassification(pred_test)
        AccuracyList.append(CalculateAccuracy(PredictionsTestSet, x_test))
        print(AccuracyList)

    return np.sum(AccuracyList)/(k_folds)

# Run 5-fold cross-validation on the normalized data and report the mean accuracy.
AccuracyTesting = cross_validation(
    Normalized_dataset,
    k_folds=5,
    iterations=250,
    LearningRate=0.01,
)
print("Avg Accuracy:", AccuracyTesting)

[89.85507246376811]
[89.85507246376811, 92.7007299270073]
[89.85507246376811, 92.7007299270073, 93.43065693430657]
[89.85507246376811, 92.7007299270073, 93.43065693430657, 93.43065693430657]
[89.85507246376811, 92.7007299270073, 93.43065693430657, 93.43065693430657, 92.7007299270073]
Avg Accuracy: 92.42356923727917
