# Wraping method for filteration

# Read Dataset

In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('50_Startups.csv')

In [3]:
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [4]:
# Dependent Data
y = dataset['Profit']

# Feature Enginearing

In [5]:
X = dataset[ ['R&D Spend', 'Administration', 'Marketing Spend', 'State' ] ]

In [6]:
X.head(2)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California


In [7]:
X = pd.get_dummies(X , drop_first=True )

In [8]:
X.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


# Train test split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Creating model

In [11]:
from sklearn.linear_model import LinearRegression

In [12]:
model = LinearRegression()

In [13]:
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Prediction

In [14]:
y_pred = model.predict(X_test)

In [15]:
y_test

13    134307.35
39     81005.76
30     99937.59
45     64926.08
17    125370.37
48     35673.41
26    105733.54
25    107404.34
32     97427.84
19    122776.86
Name: Profit, dtype: float64

In [16]:
y_pred

array([126362.87908255,  84608.45383634,  99677.49425147,  46357.46068582,
       128750.48288504,  50912.4174188 , 109741.35032702, 100643.24281647,
        97599.27574594, 113097.42524432])

# Accuracy 

In [17]:
residual = 134307 - 126362

In [18]:
error = residual/134307 * 100

In [19]:
error

5.9155516838288404

In [20]:
accuracy = 100 - error

In [21]:
accuracy

94.08444831617116

In [22]:
model.coef_

array([ 8.05630064e-01, -6.87878823e-02,  2.98554429e-02,  9.38793006e+02,
        6.98775997e+00])

# Using Wrapping method for increasing the performance 

In [23]:
X.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida',
       'State_New York'],
      dtype='object')

In [24]:
# creating ols model
import statsmodels.api as sm

In [25]:
model_ols=sm.OLS(y,X).fit()

In [26]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared (uncentered):,0.988
Model:,OLS,Adj. R-squared (uncentered):,0.986
Method:,Least Squares,F-statistic:,727.1
Date:,"Tue, 21 Apr 2020",Prob (F-statistic):,7.87e-42
Time:,00:16:37,Log-Likelihood:,-545.15
No. Observations:,50,AIC:,1100.0
Df Residuals:,45,BIC:,1110.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
R&D Spend,0.7182,0.066,10.916,0.000,0.586,0.851
Administration,0.3113,0.035,8.885,0.000,0.241,0.382
Marketing Spend,0.0786,0.023,3.429,0.001,0.032,0.125
State_Florida,3464.4536,4905.406,0.706,0.484,-6415.541,1.33e+04
State_New York,5067.8937,4668.238,1.086,0.283,-4334.419,1.45e+04

0,1,2,3
Omnibus:,1.355,Durbin-Watson:,1.288
Prob(Omnibus):,0.508,Jarque-Bera (JB):,1.241
Skew:,-0.237,Prob(JB):,0.538
Kurtosis:,2.391,Cond. No.,828000.0


# Taking b for the analysis (putting on the start X0=1)

In [27]:
import numpy as np
ones=np.ones((50,1)) 
ones.shape

(50, 1)

In [28]:
X_new = np.append(arr=ones , values=X, axis=1)

In [29]:
X.head(2)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0


In [30]:
X_new=pd.DataFrame(X_new,columns=['ones','R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida',
       'State_New York'])
X_new

Unnamed: 0,ones,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,1.0,165349.2,136897.8,471784.1,0.0,1.0
1,1.0,162597.7,151377.59,443898.53,0.0,0.0
2,1.0,153441.51,101145.55,407934.54,1.0,0.0
3,1.0,144372.41,118671.85,383199.62,0.0,1.0
4,1.0,142107.34,91391.77,366168.42,1.0,0.0
5,1.0,131876.9,99814.71,362861.36,0.0,1.0
6,1.0,134615.46,147198.87,127716.82,0.0,0.0
7,1.0,130298.13,145530.06,323876.68,1.0,0.0
8,1.0,120542.52,148718.95,311613.29,0.0,1.0
9,1.0,123334.88,108679.17,304981.62,0.0,0.0


In [31]:
model_ols = sm.OLS(endog=y ,  exog=X_new ).fit()

In [32]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Tue, 21 Apr 2020",Prob (F-statistic):,1.34e-27
Time:,00:50:47,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ones,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
R&D Spend,0.8060,0.046,17.369,0.000,0.712,0.900
Administration,-0.0270,0.052,-0.517,0.608,-0.132,0.078
Marketing Spend,0.0270,0.017,1.574,0.123,-0.008,0.062
State_Florida,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
State_New York,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [33]:
X_new.columns

Index(['ones', 'R&D Spend', 'Administration', 'Marketing Spend',
       'State_Florida', 'State_New York'],
      dtype='object')

In [34]:
# Removing the state_new york variable because its p_value is more then significant value(.05) performance=94.5
X_new = X_new[['ones','R&D Spend', 'Administration', 'Marketing Spend', 'State_Florida']]

In [35]:
model_ols = sm.OLS(endog=y ,  exog=X_new ).fit()

In [36]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Tue, 21 Apr 2020",Prob (F-statistic):,8.49e-29
Time:,01:08:22,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ones,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
R&D Spend,0.8060,0.046,17.606,0.000,0.714,0.898
Administration,-0.0270,0.052,-0.523,0.604,-0.131,0.077
Marketing Spend,0.0270,0.017,1.592,0.118,-0.007,0.061
State_Florida,220.1585,2900.536,0.076,0.940,-5621.821,6062.138

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [37]:
X_new.columns

Index(['ones', 'R&D Spend', 'Administration', 'Marketing Spend',
       'State_Florida'],
      dtype='object')

In [38]:
# Removing State_florida value .940>.05 performance =94.6
X_new = X_new[['ones', 'R&D Spend', 'Administration', 'Marketing Spend']]

In [39]:
model_ols = sm.OLS(endog=y ,  exog=X_new ).fit()

In [40]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Tue, 21 Apr 2020",Prob (F-statistic):,4.53e-30
Time:,01:12:23,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ones,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
R&D Spend,0.8057,0.045,17.846,0.000,0.715,0.897
Administration,-0.0268,0.051,-0.526,0.602,-0.130,0.076
Marketing Spend,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [41]:
X_new.columns

Index(['ones', 'R&D Spend', 'Administration', 'Marketing Spend'], dtype='object')

In [42]:
# Removing Administrations .602>.05 performance=94.8
X_new = X_new[['ones', 'R&D Spend','Marketing Spend']]

In [43]:
model_ols = sm.OLS(endog=y ,  exog=X_new ).fit()

In [44]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 21 Apr 2020",Prob (F-statistic):,2.1600000000000003e-31
Time:,01:14:47,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ones,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
R&D Spend,0.7966,0.041,19.266,0.000,0.713,0.880
Marketing Spend,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [45]:
X_new.columns

Index(['ones', 'R&D Spend', 'Marketing Spend'], dtype='object')

In [46]:
# removing Marketing spend .06>.05  pperformance =94.8
X_new1= X_new[['ones', 'R&D Spend']]

In [47]:
model_ols = sm.OLS(endog=y ,  exog=X_new1).fit()

In [48]:
model_ols.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Tue, 21 Apr 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,01:18:13,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
ones,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
R&D Spend,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [63]:
# perforomance is decreses(94.5 ) so we will not remove previous variable
# finalize feature extraction after checking the performance is
# Independent Data
X_new = X_new[['R&D Spend','Marketing Spend']]

# Train_test_split

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.20, random_state=42)

# Creating model

In [66]:
from sklearn.linear_model import LinearRegression

In [67]:
model = LinearRegression()

In [60]:
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Prediction

In [61]:
X_train.shape

(40, 2)

In [62]:
y_train.shape

(40,)

In [57]:
y_pred=model.predict(X_test)

In [58]:
y_pred

array([127521.38604123,  82615.07411457,  97683.2462344 ,  46400.65677644,
       130782.53611917,  45967.0205249 , 109813.19061887, 101612.68921418,
        97023.64013854, 113241.36575804])

In [59]:
y_test

13    134307.35
39     81005.76
30     99937.59
45     64926.08
17    125370.37
48     35673.41
26    105733.54
25    107404.34
32     97427.84
19    122776.86
Name: Profit, dtype: float64

# Accuracy

In [60]:
accuracy=(127521/134307)*100
accuracy

94.94739663606512

In [61]:
model.coef_

array([0.78337431, 0.03923979])