# Linear Regression From Scratch
a try to code algorithm without using ai libraries

In [83]:
import csv
import numpy as np

### Gradiant Decent Method

![Gradiant Decent](img/gd.png)

## 1. Load Dataste

In [84]:
def load_csv(filename):
  with open(filename, 'r') as file: # open file
    reader = csv.reader(file) # create a csv reader for opened file
    columns = next(reader) # get columns (header section)
    dataset = list(reader) # read rest of data
  return columns, np.array(dataset)

In [85]:
columns, dataset = load_csv('../datasets/housedata/data.csv')

In [86]:
columns

['date',
 'price',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'street',
 'city',
 'statezip',
 'country']

In [87]:
dataset.shape[0] # records

4600

In [88]:
dict(zip(columns, dataset[0])) # first record

{'date': '2014-05-02 00:00:00',
 'price': '313000.0',
 'bedrooms': '3.0',
 'bathrooms': '1.5',
 'sqft_living': '1340',
 'sqft_lot': '7912',
 'floors': '1.5',
 'waterfront': '0',
 'view': '0',
 'condition': '3',
 'sqft_above': '1340',
 'sqft_basement': '0',
 'yr_built': '1955',
 'yr_renovated': '2005',
 'street': '18810 Densmore Ave N',
 'city': 'Shoreline',
 'statezip': 'WA 98133',
 'country': 'USA'}

## 2. Select Numeric Data Only
we can also create labels or encodings for non-numerical data

In [89]:
# input_columns = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated']
# input_columns_indices = []

# for i in range(len(columns)):
#   if columns[i] in input_columns:
#     input_columns_indices.append(i)

# dict(zip(input_columns_indices, input_columns))

In [90]:
# output_column = 'price'
# output_column_index = -1

# for i in range(len(columns)):
#   if columns[i] == output_column:
#     output_column_index = i
#     break

# print(output_column_index, output_column)

In [91]:
# inputs = []
# outputs = []

# for i in range(dataset.shape[0]):
#   inputs.append(dataset[i][[input_columns_indices]])
#   outputs.append(dataset[i][output_column_index])

## 2. Or - Let us do `Label-Encoding`
Convert non-numerical data to numerical data

In [92]:
def label_encode(dataset, columns, non_numerical_columns):
  encoded_data = dataset.copy() # make a copy of dataset to not change original data
  label_encodings = {} # encodings map

  for col in non_numerical_columns:
    col_index = columns.index(col)
    uniqe_values = np.unique(dataset[:, col_index]) # get a list of unique values
    encoding = {value.lower().strip():idx for idx,value in enumerate(uniqe_values)}
    label_encodings[col] = encoding

    for row in encoded_data:
      row[col_index] = encoding[row[col_index].lower().strip()] # replace string with value from encoding map

  return encoded_data, label_encodings

In [93]:
encoded_data, encodings = label_encode(dataset, columns, ['street', 'city', 'statezip', 'country'])

In [94]:
columns

['date',
 'price',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'street',
 'city',
 'statezip',
 'country']

In [95]:
dict(zip(columns, encoded_data[0])) # encoded

{'date': '2014-05-02 00:00:00',
 'price': '313000.0',
 'bedrooms': '3.0',
 'bathrooms': '1.5',
 'sqft_living': '1340',
 'sqft_lot': '7912',
 'floors': '1.5',
 'waterfront': '0',
 'view': '0',
 'condition': '3',
 'sqft_above': '1340',
 'sqft_basement': '0',
 'yr_built': '1955',
 'yr_renovated': '2005',
 'street': '1522',
 'city': '36',
 'statezip': '62',
 'country': '0'}

In [96]:
dict(zip(columns, dataset[0])) # original

{'date': '2014-05-02 00:00:00',
 'price': '313000.0',
 'bedrooms': '3.0',
 'bathrooms': '1.5',
 'sqft_living': '1340',
 'sqft_lot': '7912',
 'floors': '1.5',
 'waterfront': '0',
 'view': '0',
 'condition': '3',
 'sqft_above': '1340',
 'sqft_basement': '0',
 'yr_built': '1955',
 'yr_renovated': '2005',
 'street': '18810 Densmore Ave N',
 'city': 'Shoreline',
 'statezip': 'WA 98133',
 'country': 'USA'}

### Handling date

Handling date is quite a complicated task. here are different appraoches we can use:
1. simply convert to timestamp (not much useful)
2. features-enginerring:
  - create features (like difference between date of record and last renovated, etc.)
  - split / create features (i.e., extract day, month, year, saperately)

In [97]:
encoded_data[:2, 0] # check out 2 date samples

array(['2014-05-02 00:00:00', '2014-05-02 00:00:00'], dtype='<U46')

In [98]:
from datetime import datetime

In [99]:
datetime.timestamp(
  datetime.strptime(encoded_data[0][0], '%Y-%m-%d %H:%M:%S')
) # timestamp

1398970800.0

In [100]:
# Encode all dates to timestamp
for row_idx in range(encoded_data.shape[0]):
  encoded_data[row_idx][0] = datetime.timestamp(
    datetime.strptime(encoded_data[row_idx][0], '%Y-%m-%d %H:%M:%S')
  )
  

In [101]:
encoded_data[0, :2] # check out 2 date samples

array(['1398970800.0', '313000.0'], dtype='<U46')

## 3. Normalization

In [102]:
encoded_data = np.array(encoded_data, 'float32') # convert all to float (32-bit)

####  Scaling `0.0` to `1.0`

In [103]:
encoded_data[:2] # 2 samples

array([[1.3989708e+09, 3.1300000e+05, 3.0000000e+00, 1.5000000e+00,
        1.3400000e+03, 7.9120000e+03, 1.5000000e+00, 0.0000000e+00,
        0.0000000e+00, 3.0000000e+00, 1.3400000e+03, 0.0000000e+00,
        1.9550000e+03, 2.0050000e+03, 1.5220000e+03, 3.6000000e+01,
        6.2000000e+01, 0.0000000e+00],
       [1.3989708e+09, 2.3840000e+06, 5.0000000e+00, 2.5000000e+00,
        3.6500000e+03, 9.0500000e+03, 2.0000000e+00, 0.0000000e+00,
        4.0000000e+00, 5.0000000e+00, 3.3700000e+03, 2.8000000e+02,
        1.9210000e+03, 0.0000000e+00, 3.8990000e+03, 3.5000000e+01,
        5.8000000e+01, 0.0000000e+00]], dtype=float32)

In [104]:
# functinon to do min max scaling
def min_max_normalize(data):
    min_val = np.min(data, axis=0)
    max_val = np.max(data, axis=0)
    return (data - min_val) / (max_val - min_val)

In [105]:
normalized_data = min_max_normalize(encoded_data)

  return (data - min_val) / (max_val - min_val)


In [106]:
normalized_data[:2] # check 2 samples

array([[0.        , 0.01177134, 0.33333334, 0.1875    , 0.07365224,
        0.00677546, 0.2       , 0.        , 0.        , 0.5       ,
        0.10730089, 0.        , 0.48245615, 0.99553126, 0.33642793,
        0.8372093 , 0.81578946,        nan],
       [0.        , 0.08965777, 0.5555556 , 0.3125    , 0.24905087,
        0.00783547, 0.4       , 0.        , 1.        , 1.        ,
        0.3318584 , 0.05809129, 0.18421052, 0.        , 0.86184794,
        0.81395346, 0.7631579 ,        nan]], dtype=float32)

In [107]:
columns

['date',
 'price',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'street',
 'city',
 'statezip',
 'country']

In [120]:
output_column = 1 # price
input_columns = list(range(len(columns)))
input_columns.remove(output_column)

print(input_columns, output_column)

[0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17] 1


In [130]:
input_columns = input_columns[:-1] # remove country

In [131]:
X = normalized_data[:, input_columns] # input
Y = normalized_data[:, output_column] # output

In [132]:
X[:3]

array([[0.        , 0.33333334, 0.1875    , 0.07365224, 0.00677546,
        0.2       , 0.        , 0.        , 0.5       , 0.10730089,
        0.        , 0.48245615, 0.99553126, 0.33642793, 0.8372093 ],
       [0.        , 0.5555556 , 0.3125    , 0.24905087, 0.00783547,
        0.4       , 0.        , 1.        , 1.        , 0.3318584 ,
        0.05809129, 0.18421052, 0.        , 0.86184794, 0.81395346],
       [0.        , 0.33333334, 0.25      , 0.11845102, 0.01053391,
        0.        , 0.        , 0.        , 0.75      , 0.17256637,
        0.        , 0.57894737, 0.        , 0.50641024, 0.41860464]],
      dtype=float32)

In [150]:
Y.shape

(4600,)

In [151]:
X.shape

(4600, 15)

In [175]:
# CPerform GD
def gradient_descent(X, Y, weights, learning_rate, iterations):
    number_of_samples = len(Y)

    for iteration_idx in range(iterations): # no of iterations to train
        avg_err = 0
        for sample_idx in range(number_of_samples): # take each sample
            predicted = np.dot(X[sample_idx], weights) # predict
            error = predicted - Y[sample_idx] # find error
            for column_idx in range(len(weights)):
                weights[column_idx] -= learning_rate * error * X[sample_idx][column_idx] # learn value
            avg_err += abs(error)
        avg_err /= number_of_samples
        # print(f"i={iteration_idx}, e={error}, w={weights}")
        print(f"{iteration_idx} = {error}")
    return weights

In [176]:
## trying to code myself

# def gradient_decent(X, Y, learning_rate, initial_weights, iterations):
#   for _ in range(len(iterations)):
#     for i in range(X.shape[0]): # for all inputs
#         predicted = np.dot(X[i], initial_weights)
#         error = predicted - Y[i]

#         for column_ind in range(len(initial_weights)):
#            initial_weights[column_ind] -= error * learning_rate * X[i][column_ind]

#   return initial_weights

In [177]:
# Train LR model, return weights
def train_linear_regression(X, Y, learning_rate, iterations):
    initial_weights = np.full(X.shape[1], 1.00, 'float32') # initially set all weights = 1
    
    trained_weights = gradient_descent(
      X,
      Y,
      initial_weights,
      learning_rate,
      iterations
    )

    return trained_weights


In [181]:
learning_rate = 0.001
iterations = 100

In [182]:
for i in range(len(X)):
  np.isnan(X[i][-1])
  

In [183]:
trained_weights = train_linear_regression(X, Y, learning_rate, iterations)

0 = 0.11020359396934509
1 = 0.05988784506917
2 = 0.02873467281460762
3 = 0.008485952392220497
4 = -0.00495092011988163
5 = -0.013907299377024174
6 = -0.019828569144010544
7 = -0.023658258840441704
8 = -0.026029285043478012
9 = -0.027375146746635437
10 = -0.027997145429253578
11 = -0.02810586243867874
12 = -0.027849756181240082
13 = -0.0273358765989542
14 = -0.026640689000487328
15 = -0.025820275768637657
16 = -0.02491563744843006
17 = -0.02395734004676342
18 = -0.022968139499425888
19 = -0.02196531742811203
20 = -0.020961584523320198
21 = -0.019966797903180122
22 = -0.01898852363228798
23 = -0.018032200634479523
24 = -0.017102012410759926
25 = -0.016200818121433258
26 = -0.015330865979194641
27 = -0.014493613503873348
28 = -0.01368978712707758
29 = -0.012919697910547256
30 = -0.012183439917862415
31 = -0.011480731889605522
32 = -0.010810963809490204
33 = -0.010173482820391655
34 = -0.00956725049763918
35 = -0.00899150688201189
36 = -0.008445076644420624
37 = -0.007926862686872482
38 = 

In [184]:
trained_weights

array([ 4.0292591e-03, -2.8634807e-02, -5.1967934e-02,  2.5493432e-02,
        5.5539250e-01,  2.5177561e-02,  6.6455990e-02, -1.3566415e-02,
        1.0148150e-02,  6.2921979e-02,  9.4818592e-02, -1.1575583e-03,
        1.3377463e-03,  4.8107261e-04,  5.0800806e-03], dtype=float32)

# Pending Study
- Different accuracy methods

In [188]:
def r_square_accuracy(X, Y, weights):
  total_error = 0.0
  total_variance = 0.0
  mean_y = np.mean(Y)

  for i in range(len(Y)):
      predicted = np.dot(X[i], weights)
      total_error += (predicted - Y[i]) ** 2
      total_variance += (Y[i] - mean_y) ** 2
  return 1 - (total_error / total_variance)

In [189]:
r_square_accuracy(X, Y, trained_weights)

-0.6895254599149807