In [19]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math

In [20]:
df = pd.read_csv("Gotem_Pumpkins.csv")
train = df.copy(deep = True)

train = train.drop(train.columns[0], axis = 1)
#Aspect ration is just Major Axis length divided by Minor Axis length and hence does not provide any new information; so we drop it.
train = train.drop(columns = ["Aspect_Ration"])
train["Class"] = train["Class"].apply(lambda x: 1 if x=="Ürgüp Sivrisi" else 0)

#normalizing some features so all our values fall between 0 and 1
std_features = ['Area','Perimeter','Major_Axis_Length','Minor_Axis_Length','Convex_Area','Equiv_Diameter']
for i in std_features:
   train[i] = (train[i] - train[i].min())/(train[i].max() - train[i].min())

X_train = train.copy(deep = True)
X_train = X_train.drop(columns = ["Class"])
y_train = train["Class"].copy(deep = True)

y_train.head()

0    1
1    1
2    1
3    0
4    1
Name: Class, dtype: int64

In [21]:
#Cleaning up data in test file as well
tdf = pd.read_csv("Freyja_Pumpkins.csv")
test = tdf.copy(deep = True)

test = test.drop(test.columns[0], axis = 1)
test = test.drop(columns = ["Aspect_Ration"])
test["Class"] = test["Class"].apply(lambda x: 1 if x=="Ürgüp Sivrisi" else 0)

std_features = ['Area','Perimeter','Major_Axis_Length','Minor_Axis_Length','Convex_Area','Equiv_Diameter']
for i in std_features:
   test[i] = (test[i] - test[i].min())/(test[i].max() - test[i].min())

X_test = test.copy(deep = True)
X_test = X_test.drop(columns = ["Class"])
y_test = test["Class"].copy(deep = True)

X_test.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Compactness
0,0.145556,0.11533,0.1446,0.434095,0.145624,0.181331,0.8125,0.9921,0.7301,0.8571,0.7623
1,0.297777,0.286532,0.299781,0.528611,0.299258,0.352803,0.836,0.9909,0.7132,0.8342,0.7397
2,0.298614,0.322175,0.386747,0.438886,0.298707,0.353704,0.8742,0.9925,0.7079,0.8008,0.6949
3,0.416324,0.443632,0.487234,0.519257,0.421149,0.476438,0.8781,0.988,0.7495,0.7909,0.6905
4,0.409459,0.395315,0.357026,0.651413,0.412636,0.469483,0.8232,0.9896,0.7093,0.8283,0.7523


In [22]:
def sigmoid(z):

    g = 1/(1+np.exp(-z))
   
    return g

In [31]:
w = np.zeros(X_train.shape[1])
b = 0

#cost based on log piecewise function

def cost(x,y,weight,bias): 
    cost_sum = 0
    m = x.shape[0]
    for i in range(m):
        f_wb_i = sigmoid(np.dot(weight,x.iloc[i]) + bias)
        cost = -y[i]*math.log(f_wb_i) - (1-y[i])*math.log(1-f_wb_i)
        cost_sum += cost

    return cost_sum/m

cost(X_train,y_train,w,b)

0.6931471805599112

In [32]:
def compute_gradient(x,y,w,b): #calculate partial derivative terms in gradient descent algorithm
    dj_dw_i = np.zeros(x.shape[1])
    dj_dw = np.zeros(x.shape[1])
    dj_db_i = 0
    dj_db = 0
    m = x.shape[0]
    for i in range(m):
        dj_dw_i = (sigmoid(np.dot(w,x.iloc[i]) + b) - y.iloc[i])*x.iloc[i]
        dj_dw += dj_dw_i
        dj_db_i = sigmoid(np.dot(w,x.iloc[i]) + b) - y.iloc[i]
        dj_db += dj_db_i
    dj_dw = dj_dw/m
    dj_db = dj_db/m

    return dj_dw,dj_db

In [60]:
def gradient_descent(x,y,w,b,alpha,iterations):
    init_cost = cost(x,y,np.zeros(x.shape[1]),0)
    J_hist = [init_cost]
    for i in range(iterations):
        dj_dw,dj_db = compute_gradient(x,y,w,b)
        w = w-alpha*dj_dw
        b = b-alpha*dj_db
        J_hist.append(cost(x,y,w,b))
            

    return w,b,J_hist

iterations = 1000
w_final,b_final,J_hist = gradient_descent(X_train,y_train,w,b,1.6,iterations) #learning rate taken through trial and error

In [63]:
print(b_final)

0.7091802392450598


In [77]:
m = X_test.shape[0]
pred_list = []
predsum = 0
for i in range(m):
    pred_i = sigmoid(np.dot(w_final,X_test.iloc[i]) + b_final)
    if pred_i>=0.5:
        pred_list.append(1)
    else:
        pred_list.append(0)
    if pred_list[i] == y_test[i]:
        predsum+=1

print("Accuracy % is :",100*predsum/m)


Accuracy % is : 86.4
