In [93]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

import warnings
# Suppress every warning for the remainder of the session.
# NOTE(review): this blanket filter also hides any warnings emitted by the
# pandas / scikit-learn calls in later cells - consider narrowing it.
warnings.filterwarnings('ignore')

In [94]:
# Read the startup dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='50_Startups.csv')
df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [95]:
# Reload the CSV, keep the four selected columns, divide every value by
# 10,000 and round to whole numbers; then report the resulting shape.
df = (
    pd.read_csv('50_Startups.csv')[['R&D Spend','Administration','Marketing Spend','Profit']]
    .div(10000)
    .round()
)
df.shape

(50, 4)

In [96]:
# Show four random rows; the fixed random_state makes the displayed sample
# reproducible across kernel restarts.
df.sample(4, random_state=0)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
1,16.0,15.0,44.0,19.0
15,11.0,12.0,26.0,13.0
29,7.0,15.0,11.0,10.0
17,9.0,15.0,28.0,13.0


In [97]:
# Keep only the three spend columns (i.e. drop 'Profit').
# Selecting by name instead of "everything but the last column" makes this
# cell idempotent: re-running it no longer strips one more column each time.
df = df[['R&D Spend', 'Administration', 'Marketing Spend']]

In [98]:
# Adding missing values (one per column) to create the imputation exercise.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df.iloc[1,0] = np.nan
df.iloc[3,1] = np.nan
# NOTE(review): df has 50 rows here, so [-1,-1] blanks 'Marketing Spend' in
# the LAST row, while the later "Remove the col3 imputed value" cells operate
# on row 4. Confirm which row was intended.
df.iloc[-1,-1] = np.nan

In [99]:
# Step 1 (iteration 0) - fill each column's gaps with that column's own mean.
# The three columns are handled in one pass instead of one statement each.
df0 = df[['R&D Spend', 'Administration', 'Marketing Spend']].apply(
    lambda column: column.fillna(column.mean())
)

In [100]:
# Iteration 0: preview of the mean-imputed frame
df0.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,7.163265,15.0,44.0
2,15.0,10.0,41.0
3,14.0,12.163265,38.0
4,14.0,9.0,37.0


In [101]:
# Iteration 1, column 1: work on a copy of the mean-imputed frame and blank
# the 'R&D Spend' cell in row 1 so it can be re-estimated from the other columns.
df1 = df0.copy()

df1.iloc[1,0] = np.nan  # `np.NaN` alias was removed in NumPy 2.0

df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,,15.0,44.0
2,15.0,10.0,41.0
3,14.0,12.163265,38.0
4,14.0,9.0,37.0


In [102]:
# Training features: 'Administration' and 'Marketing Spend' from rows 0, 2, 3, 4.
# Row 1 is left out because its 'R&D Spend' is the value to be predicted.
X = df1.iloc[[0, 2, 3, 4], [1, 2]]
X

Unnamed: 0,Administration,Marketing Spend
0,14.0,47.0
2,10.0,41.0
3,12.163265,38.0
4,9.0,37.0


In [103]:
# Training target: column 0 ('R&D Spend') for the same four rows
y = df1.iloc[[0, 2, 3, 4]].iloc[:, 0]
y

0    17.0
2    15.0
3    14.0
4    14.0
Name: R&D Spend, dtype: float64

In [104]:
# Fit a linear model on the four training rows, then predict row 1's 'R&D Spend'.
lr = LinearRegression()
lr.fit(X,y)
# Pass row 1 as a one-row DataFrame rather than a reshaped ndarray so the
# feature names match those seen by fit(); this also removes the hard-coded
# reshape(1,2).
lr.predict(df1.iloc[[1],1:])

array([15.90294324])

In [105]:
# Store the fitted model's prediction directly instead of a literal copied by
# hand from the previous cell's printed output (8-decimal truncation, and
# silently stale if the data changes).
df1.iloc[1,0] = lr.predict(df1.iloc[[1],1:])[0]

In [106]:
# Preview after re-imputing 'R&D Spend' in row 1
df1.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,15.902943,15.0,44.0
2,15.0,10.0,41.0
3,14.0,12.163265,38.0
4,14.0,9.0,37.0


In [107]:
# Iteration 1, column 2: blank the 'Administration' cell in row 3 so it can be
# re-estimated from the other two columns.

df1.iloc[3,1] = np.nan  # `np.NaN` alias was removed in NumPy 2.0

df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,15.902943,15.0,44.0
2,15.0,10.0,41.0
3,14.0,,38.0
4,14.0,9.0,37.0


In [108]:
# Training features: 'R&D Spend' and 'Marketing Spend' from rows 0, 1, 2, 4.
# Row 3 is left out because its 'Administration' is the value to be predicted.
X = df1.iloc[[0, 1, 2, 4]].iloc[:, [0, 2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
0,17.0,47.0
1,15.902943,44.0
2,15.0,41.0
4,14.0,37.0


In [109]:
# Training target: column 1 ('Administration') for the same four rows
y = df1.iloc[[0, 1, 2, 4]].iloc[:, 1]
y

0    14.0
1    15.0
2    10.0
4     9.0
Name: Administration, dtype: float64

In [110]:
# Fit a linear model on the four training rows, then predict row 3's 'Administration'.
lr = LinearRegression()
lr.fit(X,y)
# One-row DataFrame (not a reshaped ndarray) keeps the feature names seen by
# fit() and avoids the hard-coded reshape(1,2).
lr.predict(df1.iloc[[3],[0,2]])

array([10.14874768])

In [111]:
# Store the fitted model's prediction rather than a literal hand-copied from
# the previous cell's output (truncated, and stale if the data changes).
df1.iloc[3,1] = lr.predict(df1.iloc[[3],[0,2]])[0]

In [112]:
# Preview after re-imputing 'Administration' in row 3
df1.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,15.902943,15.0,44.0
2,15.0,10.0,41.0
3,14.0,10.148748,38.0
4,14.0,9.0,37.0


In [113]:
# Iteration 1, column 3: blank the 'Marketing Spend' cell in row 4 so it can be
# re-estimated from the other two columns.
# NOTE(review): the "Adding missing values" cell blanked the LAST row of this
# column, not row 4, so the value removed here looks like an observed one
# rather than a mean-imputed one - confirm the intended row.
df1.iloc[4,-1] = np.nan  # `np.NaN` alias was removed in NumPy 2.0

df1.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,15.902943,15.0,44.0
2,15.0,10.0,41.0
3,14.0,10.148748,38.0
4,14.0,9.0,


In [114]:
# Training features: 'R&D Spend' and 'Administration' from the first four rows
# (0-3). Row 4 is left out because its 'Marketing Spend' is the value to predict.
X = df1.iloc[:4, :2]
X

Unnamed: 0,R&D Spend,Administration
0,17.0,14.0
1,15.902943,15.0
2,15.0,10.0
3,14.0,10.148748


In [115]:
# Training target: last column ('Marketing Spend') for rows 0-3
y = df1.iloc[:4].iloc[:, -1]
y

0    47.0
1    44.0
2    41.0
3    38.0
Name: Marketing Spend, dtype: float64

In [116]:
# Fit a linear model on rows 0-3, then predict row 4's 'Marketing Spend'.
lr = LinearRegression()
lr.fit(X,y)
# One-row DataFrame (not a reshaped ndarray) keeps the feature names seen by
# fit() and avoids the hard-coded reshape(1,2).
lr.predict(df1.iloc[[4],0:2])

array([37.96193327])

In [117]:
# Store the fitted model's prediction rather than a literal hand-copied from
# the previous cell's output (truncated, and stale if the data changes).
df1.iloc[4,-1] = lr.predict(df1.iloc[[4],0:2])[0]

In [118]:
# State of the frame once the first iteration has finished
df1.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,15.902943,15.0,44.0
2,15.0,10.0,41.0
3,14.0,10.148748,38.0
4,14.0,9.0,37.961933


In [119]:
# Element-wise change from iteration 0 to iteration 1
# (non-zero only in the cells that were overwritten)
df1.sub(df0)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,0.0
1,8.739678,0.0,0.0
2,0.0,0.0,0.0
3,0.0,-2.014518,0.0
4,0.0,0.0,0.961933
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [120]:
# Iteration 2, column 1: start from a copy of iteration 1's result and blank
# the 'R&D Spend' cell in row 1 again.
df2 = df1.copy()

df2.iloc[1,0] = np.nan  # `np.NaN` alias was removed in NumPy 2.0

df2.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,,15.0,44.0
2,15.0,10.0,41.0
3,14.0,10.148748,38.0
4,14.0,9.0,37.961933


In [121]:
# Re-estimate row 1's 'R&D Spend' from 'Administration' and 'Marketing Spend',
# training on rows 0, 2, 3, 4.
X = df2.iloc[[0,2,3,4],1:3]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
# One-row DataFrame (not a reshaped ndarray) keeps the feature names seen by
# fit() and avoids the hard-coded reshape(1,2).
lr.predict(df2.iloc[[1],1:])

array([15.9939676])

In [122]:
# Store the fitted model's prediction rather than a literal hand-copied from
# the previous cell's output (truncated, and stale if the data changes).
df2.iloc[1,0] = lr.predict(df2.iloc[[1],1:])[0]

In [123]:
# Iteration 2, column 2: blank row 3's 'Administration' and re-estimate it from
# 'R&D Spend' and 'Marketing Spend', training on rows 0, 1, 2, 4.
df2.iloc[3,1] = np.nan  # `np.NaN` alias was removed in NumPy 2.0
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
# One-row DataFrame (not a reshaped ndarray) keeps the feature names seen by
# fit() and avoids the hard-coded reshape(1,2).
lr.predict(df2.iloc[[3],[0,2]])

array([10.13176918])

In [124]:
# Store the fitted model's prediction rather than a literal hand-copied from
# the previous cell's output (truncated, and stale if the data changes).
df2.iloc[3,1] = lr.predict(df2.iloc[[3],[0,2]])[0]

In [125]:
# Iteration 2, column 3: blank row 4's 'Marketing Spend' and re-estimate it from
# 'R&D Spend' and 'Administration', training on rows 0-3.
df2.iloc[4,-1] = np.nan  # `np.NaN` alias was removed in NumPy 2.0

X = df2.iloc[0:4,0:2]
y = df2.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
# One-row DataFrame (not a reshaped ndarray) keeps the feature names seen by
# fit() and avoids the hard-coded reshape(1,2).
lr.predict(df2.iloc[[4],0:2])

array([37.99757961])

In [126]:

# Store the fitted model's prediction rather than a literal hand-copied from
# the previous cell's output (truncated, and stale if the data changes).
df2.iloc[4,-1] = lr.predict(df2.iloc[[4],0:2])[0]

In [127]:
# State of the frame once the second iteration has finished
df2.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,15.993968,15.0,44.0
2,15.0,10.0,41.0
3,14.0,10.131769,38.0
4,14.0,9.0,37.99758


In [128]:
# Element-wise change from iteration 1 to iteration 2
df2.sub(df1)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,0.0
1,0.091024,0.0,0.0
2,0.0,0.0,0.0
3,0.0,-0.016979,0.0
4,0.0,0.0,0.035646
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [130]:
# Iteration 3, column 1: start from a copy of iteration 2's result and blank
# the 'R&D Spend' cell in row 1 again.
df3 = df2.copy()

df3.iloc[1,0] = np.nan  # `np.NaN` alias was removed in NumPy 2.0

df3.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,,15.0,44.0
2,15.0,10.0,41.0
3,14.0,10.131769,38.0
4,14.0,9.0,37.99758


In [131]:
# Re-estimate row 1's 'R&D Spend' from 'Administration' and 'Marketing Spend',
# training on rows 0, 2, 3, 4.
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
# One-row DataFrame (not a reshaped ndarray) keeps the feature names seen by
# fit() and avoids the hard-coded reshape(1,2).
lr.predict(df3.iloc[[1],1:])

array([15.99960587])

In [132]:
# Store the fitted model's prediction rather than a literal hand-copied from
# the previous cell's output (truncated, and stale if the data changes).
df3.iloc[1,0] = lr.predict(df3.iloc[[1],1:])[0]

In [133]:
# Iteration 3, column 2: blank row 3's 'Administration' and re-estimate it from
# 'R&D Spend' and 'Marketing Spend', training on rows 0, 1, 2, 4.
df3.iloc[3,1] = np.nan  # `np.NaN` alias was removed in NumPy 2.0
X = df3.iloc[[0,1,2,4],[0,2]]
y = df3.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
# One-row DataFrame (not a reshaped ndarray) keeps the feature names seen by
# fit() and avoids the hard-coded reshape(1,2).
lr.predict(df3.iloc[[3],[0,2]])

array([10.12716602])

In [134]:
# Store the fitted model's prediction rather than a literal hand-copied from
# the previous cell's output (truncated, and stale if the data changes).
df3.iloc[3,1] = lr.predict(df3.iloc[[3],[0,2]])[0]

In [135]:
# Iteration 3, column 3: blank row 4's 'Marketing Spend' and re-estimate it from
# 'R&D Spend' and 'Administration', training on rows 0-3.
df3.iloc[4,-1] = np.nan  # `np.NaN` alias was removed in NumPy 2.0

X = df3.iloc[0:4,0:2]
y = df3.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
# One-row DataFrame (not a reshaped ndarray) keeps the feature names seen by
# fit() and avoids the hard-coded reshape(1,2).
lr.predict(df3.iloc[[4],0:2])

array([37.99984235])

In [None]:
# Store the fitted model's prediction rather than a literal hand-copied from
# the previous cell's output (truncated, and stale if the data changes).
df3.iloc[4,-1] = lr.predict(df3.iloc[[4],0:2])[0]

In [138]:
# State of the frame once the third iteration has finished
df3.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,17.0,14.0,47.0
1,15.999606,15.0,44.0
2,15.0,10.0,41.0
3,14.0,10.127166,38.0
4,14.0,9.0,37.999842


In [139]:
# Element-wise change from iteration 2 to iteration 3
df3.sub(df2)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,0.0,0.0,0.0
1,0.005638,0.0,0.0
2,0.0,0.0,0.0
3,0.0,-0.004603,0.0
4,0.0,0.0,0.002263
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0
