# Linear Regression for Manas Task One

In [206]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import math

### Initial Loading

In [207]:
data = pd.read_csv("Train.csv")
test_data = pd.read_csv("Test.csv")
data.drop(columns=['Unnamed: 0'], inplace=True)
data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Tata Tiago 1.2 Revotron XZ WO Alloy,Hyderabad,2017,16500,Petrol,Manual,First,23.84 kmpl,1199 CC,84 bhp,5.0,5.0
1,Mahindra TUV 300 2015-2019 mHAWK100 T8 AMT,Kochi,2017,47357,Diesel,Automatic,First,18.49 kmpl,1493 CC,100 bhp,7.0,8.37
2,Skoda Rapid 2013-2016 1.6 MPI Ambition,Mumbai,2014,35000,Petrol,Manual,First,15.0 kmpl,1598 CC,103.52 bhp,5.0,4.5
3,Tata Indica V2 DLS BSII,Jaipur,2007,200000,Diesel,Manual,Second,17.2 kmpl,1396 CC,53.5 bhp,5.0,0.85
4,Tata Manza Club Class Quadrajet90 LX,Mumbai,2013,64000,Diesel,Manual,Second,21.02 kmpl,1248 CC,88.76 bhp,5.0,2.65


In [208]:
data.describe()

Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,3750.0,3750.0,3718.0,3750.0
mean,2013.313333,58128.811733,5.277838,9.498632
std,3.26425,40343.978048,0.806216,11.207836
min,1998.0,1000.0,2.0,0.44
25%,2011.0,34000.0,5.0,3.5
50%,2014.0,53000.0,5.0,5.665
75%,2016.0,73157.75,5.0,9.9
max,2019.0,775000.0,10.0,160.0


In [209]:
data.isnull().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               1
Engine               28
Power                28
Seats                32
Price                 0
dtype: int64

### Cleaning Up the Data

In [210]:
def processData(column, unit):
    return column.str.replace(" " + unit, "", regex=False).replace("null", None).astype(float)

In [211]:
data["Mileage"] = processData(data["Mileage"], "kmpl")
data["Power"] = processData(data["Power"], "bhp")
data["Engine"] = processData(data["Engine"], "CC")

In [212]:
X_train = data.drop(columns=['Price', 'Name'])
y_train = data['Price']

In [213]:
X_train = X_train.apply(pd.to_numeric, errors='coerce')

# print(X_train.isnull().sum())
X_train

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats
0,,2017,16500,,,,23.84,1199.0,84.00,5.0
1,,2017,47357,,,,18.49,1493.0,100.00,7.0
2,,2014,35000,,,,15.00,1598.0,103.52,5.0
3,,2007,200000,,,,17.20,1396.0,53.50,5.0
4,,2013,64000,,,,21.02,1248.0,88.76,5.0
...,...,...,...,...,...,...,...,...,...,...
3745,,2009,150000,,,,11.00,2993.0,235.00,5.0
3746,,2012,85000,,,,11.79,2179.0,120.00,7.0
3747,,2007,55100,,,,18.90,998.0,67.10,5.0
3748,,2014,77500,,,,20.45,1461.0,83.80,5.0


## Main Processing for LR

In [214]:
def normalise(x):
    return (x - x.mean()) / x.std()

In [215]:
class LinearRegression:
    def __init__(self, alpha=0.0001, n_iter=1000, rc=10):
        self.alpha = alpha
        self.n_iter = n_iter
        self.rc = rc
        self.w = None
        self.b = None
        self.cost = []

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.b = 0

        for _ in range(self.n_iter):
            y_pred = np.dot(X, self.w) + self.b
            dw  = (1/n_samples)*(  np.dot(X.T , y_pred  - y)) + (self.rc/n_samples)*(self.weights)
            db  = (1/n_samples)*np.sum(y_pred  - y )

            self.w -= self.alpha * dw
            self.b -= self.alpha * db

            cost = (1 / (2 * n_samples)) * np.sum((y_pred - y) ** 2) + \
                   (self.rc / (2 * n_samples)) * np.sum(self.w ** 2)
            self.costs.append(cost)
    
    def predict(self, X):
        predictions = np.dot(X, self.w) + self.b
        return predictions

In [216]:
def MSE(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

In [217]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [218]:
def R2(y_true, y_pred):
    tv = np.sum((y_true - np.mean(y_true))**2)
    pv = np.sum((y_true - np.mean(y_pred))**2)
    return 1 - (pv/tv)

### Preparing for Implementing the Algorithm

In [219]:
X_train = pd.DataFrame(normalise(X_train))
y_train = pd.DataFrame(y_train)

X_train = X_train.apply(pd.to_numeric, errors='coerce')
y_train = y_train.apply(pd.to_numeric, errors='coerce')

X_train = X_train.fillna(X_train.mean())
y_train = y_train.fillna(y_train.mean())

X_train = np.where(np.isnan(X_train), np.nanmean(X_train, axis=0), X_train)

  X_train = np.where(np.isnan(X_train), np.nanmean(X_train, axis=0), X_train)
