In [1]:
import copy
import math

import numpy as np

In [7]:
# data reader from csv file
def load_data(path, dataType, skip_header):
    """Read a comma-separated file into a NumPy array.

    Args:
      path : source to read; handed unchanged to np.genfromtxt
      dataType : dtype specification applied to the parsed columns
      skip_header : number of lines to skip at the beginning of the file
    Returns:
      the array built by np.genfromtxt
    """
    return np.genfromtxt(
        path,
        dtype=dataType,
        delimiter=',',
        skip_header=skip_header,
    )

In [40]:
# Structured dtype describing the columns of the data file: one (field, type)
# pair per column. 'U<n>' is a unicode string holding at most n characters.
dataType = [
    ("name", "U40"),
    ("year", int),
    ("selling_price", int),
    ("km_driven", int),
    ("fuel", "U15"),
    ("seller_type", "U16"),
    ("transmission", "U15"),
    ("owner", "U20"),
]

In [41]:
# Parse car_data.csv into a structured array whose fields follow dataType.
# skip_header=True is forwarded to np.genfromtxt as the count of lines to skip.
data = load_data(
    path='car_data.csv',
    dataType=dataType,
    skip_header=True,
)

In [42]:
# Sanity check: print the number of entries in every column, one per line
# (the name column is not that valuable, though).
print(
    *(
        len(data[field])
        for field in (
            'name', 'year', 'selling_price', 'km_driven',
            'fuel', 'seller_type', 'transmission', 'owner',
        )
    ),
    sep='\n',
)


4340
4340
4340
4340
4340
4340
4340
4340


In [49]:
# The categorical features must be encoded before they can be used as model
# inputs; these lists enumerate the labels of each categorical column.
seller_type_categories = ["Individual", "Dealer", "Trustmark Dealer"]
fuel_categories = ["Petrol", "Diesel", "CNG", "Electric", "LPG"]
transmission_categories = ["Manual", "Automatic"]
# "Test Drive Car" is included so this list matches the labels that
# owner_mapping encodes; it was previously missing here.
owner_categories = [
    "First Owner",
    "Second Owner",
    "Third Owner",
    "Fourth & Above Owner",
    "Test Drive Car",
]

In [52]:
# Integer labels are used for the different categories: within each mapping
# the labels are numbered consecutively, starting at 1, in the order listed.
seller_type_mapping = {
    label: code
    for code, label in enumerate(
        ("Individual", "Dealer", "Trustmark Dealer"), start=1
    )
}

fuel_mapping = {
    label: code
    for code, label in enumerate(
        (
            "Petrol",
            "Diesel",
            "CNG",       # Compressed Natural Gas
            "Electric",
            "LPG",       # Liquefied Petroleum Gas
        ),
        start=1,
    )
}

transmission_mapping = {
    label: code
    for code, label in enumerate(("Manual", "Automatic"), start=1)
}

owner_mapping = {
    label: code
    for code, label in enumerate(
        (
            "First Owner",
            "Second Owner",
            "Third Owner",
            "Fourth & Above Owner",
            "Test Drive Car",
        ),
        start=1,
    )
}

In [55]:
# The six arrays below are the model features.
# Numeric features: age is measured relative to 2023, km_driven is copied as-is.
km_driven = np.array(data['km_driven'])
age = np.array([2023 - model_year for model_year in data['year']])
# Categorical features: every label is replaced by its integer code from the
# matching *_mapping dict (a label missing from the dict raises KeyError).
owner = np.array([owner_mapping[label] for label in data['owner']])
transmission = np.array([transmission_mapping[label] for label in data['transmission']])
fuel = np.array([fuel_mapping[label] for label in data['fuel']])
seller_type = np.array([seller_type_mapping[label] for label in data['seller_type']])


In [58]:
# Feature matrix X: the six feature arrays placed side by side as columns,
# in the order listed here.
X = np.column_stack([
    seller_type,
    fuel,
    transmission,
    owner,
    age,
    km_driven,
])
# Target vector Y: a copy of the selling_price column.
Y = np.array(data['selling_price'])

In [61]:
# function to calculate the cost - square error cost
def compute_cost_matrix(X, y, w, b, verbose=False):
    """
    Squared-error cost of the linear model X @ w + b against the targets y.

    Args:
      X : 2-D array of examples, one row per example
      y : target value of each example
      w : weights of the model
      b : bias of the model
      verbose : accepted for compatibility; it currently has no effect
    Returns:
      total_cost : sum of squared residuals divided by twice the number of rows of X
    """
    # Unpacking two values also rejects an X that is not 2-D.
    num_examples, _ = X.shape

    residuals = (X @ w + b) - y
    half_mean = 1 / (2 * num_examples)

    return half_mean * np.sum(residuals ** 2)

In [62]:
# function to compute the gradients
def compute_gradient_matrix(X, y, w, b):
    """
    Gradient of the squared-error cost of the linear model X @ w + b.

    Args:
      X : 2-D array of examples, one row per example
      y : target value of each example
      w : weights of the model
      b : bias of the model
    Returns:
      dj_db : gradient with respect to b (returned first)
      dj_dw : gradient with respect to w
    """
    # Unpacking two values also rejects an X that is not 2-D.
    num_examples, _ = X.shape
    inv_m = 1 / num_examples

    residuals = (X @ w + b) - y
    grad_b = inv_m * np.sum(residuals)
    grad_w = inv_m * (X.T @ residuals)

    return grad_b, grad_w

In [None]:
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters): 
    """
    Performs batch gradient descent to learn w and b. Updates w and b by taking
    num_iters gradient steps with learning rate alpha.

    Requires the standard-library modules `copy` and `math` to be imported at
    module level.

    Args:
      X : (array_like Shape (m,n)    matrix of examples 
      y : (array_like Shape (m,))    target value of each example
      w_in : (array_like Shape (n,)) Initial values of parameters of the model
      b_in : (scalar)                Initial value of parameter of the model
      cost_function: function to compute cost, called as cost_function(X, y, w, b)
      gradient_function: function to compute the gradient, called as
          gradient_function(X, y, w, b) and returning (dj_db, dj_dw)
      alpha : (float) Learning rate
      num_iters : (int) number of iterations to run gradient descent
    Returns
      w : (array_like Shape (n,)) Updated values of parameters of the model after
          running gradient descent
      b : (scalar)                Updated value of parameter of the model after
          running gradient descent
      hist : (dict) lists recorded at each save interval under the keys
          "cost", "params" ([w, b]), "grads" ([dj_dw, dj_db]) and "iter"
    """

    # History of values at each save interval, primarily for graphing later
    hist = {"cost": [], "params": [], "grads": [], "iter": []}

    w = copy.deepcopy(w_in)  # avoid modifying the caller's w within the function
    b = b_in

    # Record at most ~10000 history entries to prevent resource exhaustion on
    # long runs, and print the cost about 10 times (or every iteration if < 10).
    save_interval = math.ceil(num_iters / 10000)
    print_interval = math.ceil(num_iters / 10)

    for i in range(num_iters):

        # Calculate the gradient and update the parameters
        dj_db, dj_dw = gradient_function(X, y, w, b)

        # Update parameters using w, b, alpha and gradient
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        save_now = i % save_interval == 0
        print_now = i % print_interval == 0

        # Evaluate the cost once per iteration, and only when it is needed
        if save_now or print_now:
            cst = cost_function(X, y, w, b)

        # Save cost J, w, b at each save interval for graphing
        if save_now:
            hist["cost"].append(cst)
            hist["params"].append([w, b])
            hist["grads"].append([dj_dw, dj_db])
            hist["iter"].append(i)

        if print_now:
            print(f"Iteration {i:9d}, Cost: {cst:0.5e}")

    return w, b, hist  # return w, b and history for graphing