In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

In [2]:
startups = pd.read_csv("https://raw.githubusercontent.com/krishnaik06/Multiple-Linear-Regression/master/50_Startups.csv")

In [3]:
startups2 = startups.rename(columns={'R&D Spend':'RyDSpend','Marketing Spend':'MarketingSpend'})
startups2.head()

Unnamed: 0,RyDSpend,Administration,MarketingSpend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


Hacemos las dummy de las cuidades y quitamos una que es redundante

In [4]:
dummy_state_tier = pd.get_dummies(startups2["State"], prefix = "State").iloc[:,1:]
dummy_state_tier.head()

Unnamed: 0,State_Florida,State_New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0


In [5]:
column_names = startups2.columns.values.tolist()
startups_new = startups2[column_names].join(dummy_state_tier)
startups_new.head()

Unnamed: 0,RyDSpend,Administration,MarketingSpend,State,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,New York,192261.83,0,1
1,162597.7,151377.59,443898.53,California,191792.06,0,0
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0
3,144372.41,118671.85,383199.62,New York,182901.99,0,1
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0


In [30]:
lm_n = smf.ols(formula="Administration~MarketingSpend+RyDSpend", data = startups_new).fit()
rsquared_n = lm_n.rsquared
VIF = 1/(1-rsquared_n)
VIF

1.1750910070550453

In [31]:
lm_n = smf.ols(formula="MarketingSpend~Administration+RyDSpend", data = startups_new).fit()
rsquared_n = lm_n.rsquared
VIF = 1/(1-rsquared_n)
VIF

2.3267732905308773

In [32]:
lm_n = smf.ols(formula="RyDSpend~MarketingSpend+Administration", data = startups_new).fit()
rsquared_n = lm_n.rsquared
VIF = 1/(1-rsquared_n)
VIF

2.4689030699947017

Decido no quitar ninguna porque los valores de VIF salen bien

In [35]:
feature_cols = ["Administration", "MarketingSpend","RyDSpend","State_Florida","State_New York"]
X = startups_new[feature_cols]
Y = startups_new["Profit"]
lm = LinearRegression()
lm.fit(X,Y)
print(lm.intercept_)
print(lm.coef_)

50125.34383165128
[-2.70043196e-02  2.69798610e-02  8.06023114e-01  1.98788793e+02
 -4.18870191e+01]


In [34]:
list(zip(feature_cols, lm.coef_))

[('Administration', -0.027004319575493617),
 ('MarketingSpend', 0.02697986103451222),
 ('RyDSpend', 0.8060231137179691),
 ('State_Florida', 198.78879286278894),
 ('State_New York', -41.88701913356584)]

In [11]:
lm.score(X,Y)

0.9489924406645264

In [36]:
startups_new["Prediccion"]= 50125.34383 - 0.0270043195 * startups_new["Administration"] + 0.02697986 * startups_new["MarketingSpend"] + 0.80602311 * startups_new["RyDSpend"] + 198.78879286 * startups_new["State_Florida"] - 41.887019133 * startups_new["State_New York"]
startups_new.head()

Unnamed: 0,RyDSpend,Administration,MarketingSpend,State,Profit,State_Florida,State_New York,Prediccion,Prediccion2
0,165349.2,136897.8,471784.1,New York,192261.83,0,1,192390.570269,163853.359207
1,162597.7,151377.59,443898.53,California,191792.06,0,0,189071.319051,164398.087295
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0,182276.185746,158271.000735
3,144372.41,118671.85,383199.62,New York,182901.99,0,1,173584.975264,151628.401685
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0,172277.132915,130253.27564


In [37]:
SSD = np.sum((startups_new["Prediccion"] - startups_new["Profit"])**2)
SSD

3920339644.401898

In [38]:
RSE = np.sqrt(SSD/(len(startups_new)-len(feature_cols)-1))
RSE

9439.206973144583

In [39]:
spend_mean=np.mean(startups_new["Profit"])
spend_mean

112012.63920000002

In [40]:
error = RSE/spend_mean
error*100

8.426912391815673

*****SciKit+Learn*****

In [41]:
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=4, step=1)
selector = selector.fit(X,Y)

In [42]:
selector.support_

In [43]:
selector.ranking_

In [45]:
X_pred = X[["Administration", "MarketingSpend","RyDSpend","State_New York"]]

In [47]:
lm = LinearRegression()
lm.fit(X_pred, Y)

LinearRegression()

In [48]:
lm.intercept_

50179.36599153635

In [49]:
lm.coef_

In [24]:
lm.score(X_pred, Y)

0.948956521872247

In [50]:
startups_new["Prediccion2"]= 50179.36599 - 0.02690181 * startups_new["Administration"] + 0.027149874 * startups_new["MarketingSpend"] + 0.805934190 * startups_new["RyDSpend"] - 136.5041814 * startups_new["State_New York"]
startups_new.head()

Unnamed: 0,RyDSpend,Administration,MarketingSpend,State,Profit,State_Florida,State_New York,Prediccion,Prediccion2
0,165349.2,136897.8,471784.1,New York,192261.83,0,1,192390.570269,192429.515643
1,162597.7,151377.59,443898.53,California,191792.06,0,0,189071.319051,189201.869629
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0,182276.185746,182197.498057
3,144372.41,118671.85,383199.62,New York,182901.99,0,1,173584.975264,173608.856959
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0,172277.132915,172191.35238


In [53]:
SSD2 = np.sum((startups_new["Prediccion2"] - startups_new["Profit"])**2)
SSD2

3920649482.965228

In [54]:
RSE2 = np.sqrt(SSD2/(len(startups_new)-len(feature_cols)-1))
RSE2

9439.579973038606

In [57]:
spend_mean=np.mean(startups_new["Profit"])
spend_mean

112012.63920000002

In [58]:
error2 = RSE2/spend_mean
error2*100

8.427245389856509