In [1]:
#importing libraries
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#spliting our data into two training and testing
#mostly prople take 30% data for testing
#for large dataset(100K samples) it can be reduced to (20% to 10%)
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler

In [3]:
#loading our dataset
Data=pd.read_csv("50_Startups.csv")
Data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
Y=Data["Profit"].values

In [5]:
#We have already taken independed value(Y)
#so we are removing it from dataset 
data=Data.drop(["Profit"],axis=1)
data.head()
#These are our depended variable
#Now you can see the last column(state) they are not numeric
#So we need to convert it to numeric value

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [6]:
#pandas.get_dummies will change our value to numeric(0 or 1)
dummies=pd.get_dummies(data["State"])
dummies.head()

Unnamed: 0,California,Florida,New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [7]:
#adding our dummies variable to our dataset
data[["California","Florida","New York"]]=dummies
data.head()
DATA=data.drop(["State"],axis=1)
DATA.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,California,Florida,New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


In [8]:
x_train,x_test,y_train,y_test=train_test_split(DATA,Y,test_size=0.3) 
# 30% of our datasetset is for testing

In [9]:
#Now we performing Feature Scaling
#For dependent and independent variable 
#you need to initialize two standardscaler(one for each)

In [10]:
xs=StandardScaler()
ys=StandardScaler()
X=xs.fit_transform(x_train)
Y=ys.fit_transform(y_train.reshape(-1,1))


In [11]:
#we are using normal equation 
#so we need to add one column of ones at first to get intercept variable
X=np.c_[np.ones([len(X),1]),X]

In [13]:
#performing our calculation
theta=np.linalg.inv(X.T.dot(X)).dot(X.T.dot(Y))

In [14]:
theta

array([[-1.76941795e-16],
       [ 9.57979294e-01],
       [-5.82382735e-02],
       [ 3.64486404e-02],
       [ 2.81250000e-01],
       [ 3.90625000e-01],
       [ 2.81250000e-01]])

In [15]:
#prediction
#we need to scale our independed variable 
#so we use xs-(we initialize at starting) 
#important:should use xs.transform not xs.fit_transform
X=np.c_[np.ones([1,1]),xs.transform([[46426.07,157693.92,210797.67,1,0,0]])]

In [16]:
X

array([[ 1.        , -0.53405959,  1.23681729,  0.01542122,  1.38443731,
        -0.6770032 , -0.72231512]])

In [17]:
print((X.dot(theta)))

[[-0.66131856]]


In [18]:
#this is only for printing guys
ans=ys.inverse_transform(X.dot(theta))
for i in ans:
    for j in i:
        pass


In [19]:
print("Prediction profit for your company is {:.2f}".format(j))

Prediction profit for your company is 82227.37


In [20]:
Y_predict=np.c_[np.ones([len(x_test),1]),xs.transform(x_test)].dot(theta)
Y_predict
#this is scaled values we need to perform inverse transform

array([[ 0.09270973],
       [-1.63824358],
       [ 0.21459194],
       [ 2.02342159],
       [ 1.12230889],
       [-0.2541931 ],
       [-0.66131856],
       [ 0.98544783],
       [-0.10835055],
       [ 1.82854447],
       [ 0.89186226],
       [-0.28865319],
       [-0.68281844],
       [ 1.45476483],
       [-1.05095961]])

In [21]:
y_predict=ys.inverse_transform(Y_predict)

In [22]:
r2_score=np.sum((y_predict-np.mean(y_test)**2))/np.sum((y_test-np.mean(y_test)**2))

In [23]:
r2_score 

0.9999998628263665