# Predict withLinear Regression

## Gradient Descent

In [3]:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import math

def predict_using_sklean():
    df = pd.read_csv("test_scores.csv")
    r = LinearRegression()
    r.fit(df[['math']],df.cs)
    return r.coef_, r.intercept_

def gradient_descent(x,y):
    m_curr = 0
    b_curr = 0
    iterations = 100
    n = len(x)
    learning_rate = 0.0002

    cost_previous = 0

    for i in range(iterations):
        y_predicted = m_curr * x + b_curr
        cost = (1/n)*sum([value**2 for value in (y-y_predicted)])
        md = -(2/n)*sum(x*(y-y_predicted))
        bd = -(2/n)*sum(y-y_predicted)
        m_curr = m_curr - learning_rate * md
        b_curr = b_curr - learning_rate * bd
        if math.isclose(cost, cost_previous, rel_tol=1e-20):
            break
        cost_previous = cost
        print ("m {}, b {}, cost {}, iteration {}".format(m_curr,b_curr,cost, i))

    return m_curr, b_curr

if __name__ == "__main__":
    df = pd.read_csv("test_scores.csv")
    x = np.array(df.math)
    y = np.array(df.cs)

    m, b = gradient_descent(x,y)
    print("Using gradient descent function: Coef {} Intercept {}".format(m, b))

    m_sklearn, b_sklearn = predict_using_sklean()
    print("Using sklearn: Coef {} Intercept {}".format(m_sklearn,b_sklearn))

m 1.9783600000000003, b 0.027960000000000002, cost 5199.1, iteration 0
m 0.20975041279999962, b 0.0030470367999999894, cost 4161.482445460163, iteration 1
m 1.7908456142986242, b 0.025401286955264, cost 3332.2237319269248, iteration 2
m 0.37738163667530467, b 0.005499731626422651, cost 2669.4843523161976, iteration 3
m 1.6409848166378898, b 0.023373894401807944, cost 2139.826383775145, iteration 4
m 0.5113514173939655, b 0.0074774305434828076, cost 1716.5264071567592, iteration 5
m 1.5212165764726306, b 0.021771129698498662, cost 1378.2272007804495, iteration 6
m 0.6184191426785134, b 0.009075514323270572, cost 1107.8601808918404, iteration 7
m 1.4254981563597626, b 0.020507724625171385, cost 891.7842215178443, iteration 8
m 0.7039868810749315, b 0.010370210797388455, cost 719.0974036421305, iteration 9
m 1.3490002310389348, b 0.01951553325074733, cost 581.0869686205, iteration 10
m 0.7723719384951477, b 0.01142244086408669, cost 470.7897237271261, iteration 11
m 1.2878632281408475, b 

In [44]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def sklearn_predict():
    r = LinearRegression()
    r.fit(df[["math"]],df.cs)
    return r.coef_ , r.intercept_

def gradient_predict(x,y):
    iteration = 100
    m = 0
    c = 0
    n = len(x)
    learning_rate = 0.0002
    cost = 0
    for i in range(iteration):
        y_predict = m * x + c
        mse = 1/n * sum([j**2 for j in y - y_predict])
        dm = -2/n * sum(x*y - y_predict)  #diff of mse with d/dm
        dc = -2/n * sum(y - y_predict)   #diff of mse with d/dc
        m = m -  learning_rate*dm     
        c = c - learning_rate*dc
        
        if math.isclose(mse, cost, rel_tol=1e-20):
            break
        cost = mse
        print("Iteration {}  Coef = {} , Itercept = {}, Cost = {}" .format(i,m,c,cost))
    return m,c

if __name__ == '__main__':
    df = pd.read_csv("test_scores.csv")
    x = np.array(df.math)
    y = np.array(df.cs)
    h,r = gradient_predict(x,y)
    print("Using gradient descent function: Coef {} Intercept {}".format(h, r))
    coef,inter = sklearn_predict()
    print("Using sklearn descent function: Coef {} Intercept {}" .format(coef,inter))

Iteration 0  Coef = 1.9783600000000003 , Itercept = 0.027960000000000002, Cost = 5199.1
Iteration 1  Coef = 3.903847036800001 , Itercept = 0.0030470367999999894, Cost = 4161.482445460163
Iteration 2  Coef = 5.777895025161985 , Itercept = -0.07330497483801604, Cost = 38730.45177403339
Iteration 3  Coef = 7.601898992079592 , Itercept = -0.19970100792040907, Cost = 106039.17707012122
Iteration 4  Coef = 9.377216131414393 , Itercept = -0.37478386858560764, Cost = 203400.27701581828
Iteration 5  Coef = 11.105166829930434 , Itercept = -0.5972331700695659, Cost = 328295.5123239282
Iteration 6  Coef = 12.78703566550272 , Itercept = -0.8657643344972793, Cost = 478366.0488089262
Iteration 7  Coef = 14.424072378254287 , Itercept = -1.1791276217457132, Cost = 651403.2600456047
Iteration 8  Coef = 16.017492815356032 , Itercept = -1.5361071846439696, Cost = 845340.0402405178
Iteration 9  Coef = 17.568479850203577 , Itercept = -1.9355201497964252, Cost = 1058242.5995267008
Iteration 10  Coef = 19.078

## Save and load traning model

In [47]:
import pandas as pd
from sklearn.linear_model import LinearRegression
df = pd.read_csv("test_scores.csv")

r = LinearRegression()
r.fit(df[["math"]],df.cs)
r.predict([[10]])



array([12.09258169])

In [48]:
import pickle

In [50]:
with open ('model_pickle','wb') as f:  # wb = write binary
    pickle.dump(r,f)

In [55]:
with open('model_pickle', 'rb') as f: #rb = read binary
    obj = pickle.load(f)

In [56]:
obj.predict([[90]])



array([93.51148072])

# # Dummy variable and hot Encoding

In [57]:
import pandas as pd
hr = pd.read_csv("homeprices.csv")
hr

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [60]:
dummy = pd.get_dummies(hr.town)
dummy

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [62]:
merge = pd.concat([hr,dummy],axis = 'columns')
merge

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [63]:
final = merge.drop(['town','west windsor'], axis = 'columns')
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [64]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [65]:
x = final.drop('price',axis = 'columns')
x

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [67]:
y = final.price
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [68]:
model.fit(x,y)

LinearRegression()

In [69]:
model.predict([[2800,1,0]])



array([565089.22812299])

In [72]:
model.score(x,y)

0.9573929037221873

## One Hot Encoding

In [73]:
hr

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [74]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [78]:
hr_le = hr
hr_le.town = le.fit_transform(hr_le.town)
hr_le

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [79]:
x = hr[['town','area']].values
x

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [82]:
y = hr_le.price
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [87]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = ct.fit_transform(x)

In [88]:
x

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [89]:
x = x[:,1:]
x

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [90]:
model.fit(x,y)

LinearRegression()

In [92]:
model.predict([[1,0,2800]])

array([590775.63964739])