# Backward Elimination

1. IMPORTING LIBRARIES 
- Pandas 
- Numpy
- Matplotlib.pyplot

In [136]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plyplot

2. IMPORTING DATASET

In [137]:
startup_df = pd.read_csv("50_Startups.csv")
startup_df.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [138]:
#Predict Profit (y)
y = startup_df.iloc[:,-1].values
#By using features (X)
X = startup_df.iloc[:,:-1].values


3. TAKE CARE OF MISSING DATA

In [139]:
startup_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [140]:
startup_df.nunique()

R&D Spend          49
Administration     50
Marketing Spend    48
State               3
Profit             50
dtype: int64

4. ENCODING CATEGORICAL DATA

In [141]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
     transformers = [('encoder',OneHotEncoder(),[3])],
     remainder = 'passthrough'
)

X = np.array(ct.fit_transform(X))

In [142]:
# Avoiding the Dummy Variable Trap 
## ก่อนที่เราจะ spliting dataset เราจะต้อง eliminate 1 column ที่มาจาก multiple binary columns (ซึ่งเราแปลงมาจาก categorical column) 
## สาเหตุที่เราต้องลบ 1 columnทิ้ง เป็นเพราะว่า เราต้องการให้มันกลายเป็น default Stage ไป หรือก็คือ ค่าคงที่ b ในสมการ multilinear regression -> y = m0x0 + m1X1 + m2x2 + ... +b 
# เราจะต้องลบ 1 คอลัมน์ทิ้งไป ถ้ามี 5 feature columns ให้ ลดเหลือ 4 เสมอ 

X = X[:, 1:]
print(X)

[[0.0 1.0 165349.2 136897.8 471784.1]
 [0.0 0.0 162597.7 151377.59 443898.53]
 [1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 1.0 144372.41 118671.85 383199.62]
 [1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 1.0 131876.9 99814.71 362861.36]
 [0.0 0.0 134615.46 147198.87 127716.82]
 [1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 1.0 120542.52 148718.95 311613.29]
 [0.0 0.0 123334.88 108679.17 304981.62]
 [1.0 0.0 101913.08 110594.11 229160.95]
 [0.0 0.0 100671.96 91790.61 249744.55]
 [1.0 0.0 93863.75 127320.38 249839.44]
 [0.0 0.0 91992.39 135495.07 252664.93]
 [1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 1.0 114523.61 122616.84 261776.23]
 [0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 1.0 94657.16 145077.58 282574.31]
 [1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 1.0 86419.7 153514.11 0.0]
 [0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 1.0 78389.47 153773.43 299737.29]
 [1.0 0.0 73994.56 122782.75 303319.26]
 [1.0 0.0 67532.53 105751.03 304768.73]
 [0.0 1.0 77044.01 99281.34 140574.81]
 [0

5. Spliting the dataset into Training set and Test Set

In [143]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

6. Training the Multiple Linear Regression Model on the Training set

In [144]:
from sklearn.linear_model import LinearRegression
regression_ml = LinearRegression()
regression_ml.fit(X_train,y_train)

7. Predict y_predict from X_test using regression_ml we have built

In [145]:
y_pred = regression_ml.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[114664.42 105008.31]
 [ 90593.16  96479.51]
 [ 75692.84  78239.91]
 [ 70221.89  81229.06]
 [179790.26 191050.39]
 [171576.92 182901.99]
 [ 49753.59  35673.41]
 [102276.66 101004.64]
 [ 58649.38  49490.75]
 [ 98272.03  97483.56]]


8. Building the optimal model using Backward Elimination

เป็นวิธีการคัดเลือกตัวแปรอิสระ (Independent Variables or Predictors) ออกทีละตัว โดยตัวแปรอิสระที่มีความสัมพันธ์กับตัวแปรตามน้อยที่สุดจะถูกคัดออก ตัวแปรอิสระตัวแรกที่ถูกพิจารณาคัดออกจะมีความสำคัญต่อการทำนายค่าของ Y น้อยที่สุด

STEP 1 : Select a significance level to stay in the model (e.g. SL = 0.05)

STEP 2 : Fit the full model with all possible predictors

In [146]:
#Use for compute the P-Value and evaluate the statistical significance of our independent variables
import statsmodels.api as sm

# (2.1) Prepare Matrix X to work well with statsmodels.formula.api
    #เนื่องจากเราต้องการทำ Backward Elimination เราจึงเอา statmodels libraries มาหาค่า P-Value, SL 
    #ทำให้เราต้องแปลง y = b0 + b1x1 + b2x2 + bnxn เป็น y = b0x0 + b1x1 + b2x2 + bnxn (โดยที่ x0 มีค่า = 1) เพื่อที่จะให้สามารถใช้งานกับ Statsmodels ได้

    # เราเลยจะ Append Column ที่มีค่า 1 เข้าไปใน X ที่เป็น array 
    # ใช้ np.append(อาเรย์ที่ต้องการทำ, values ที่ต้องการ append เข้่าไป, แกนที่ต้องการ append เข้าไป)
    # np.ones(shape,dtype=None,order='C')
    # X = np.append(arr=X
    #              ,values= np.ones((50,1)).astype(int)
    #              ,axis=1
    #              )
    # However, ถ้าทำแบบด้านบน column ใหม่ที่ append มันจะไปอยู่ด้านหลังสุด ซึ่งเราต้องการให้มันอยู่ด้านหน้าสุด 
    # So do this instead,

X = np.append(arr= np.ones((50,1)).astype(int)
             ,values= X 
             ,axis=1
             )
#(2.2) before fitting in OLS method in Statsmodels เราต้องสร้าง Optimal Matrix X ขึ้นมา (X_opt)
# โดย (X_opt ในตอนแรกจะมี independent variables ครบทุกตัว เนื่องจาก STEP2 เราต้องการ Fit the full model with all possible predictors หรือ independent variables นั่นเอง) 
    ## X0 = 1
    ## X1 = California
    ## X2 = Florida
    ## X3 = R & D Spend
    ## x4 = Administration Cost
    ## x5 = Marketing Cost

    ## Note : I remove one selecte Dummy Variable Trap (New york -> Default State)  

X_opt = X[:,[0,1,2,3,4,5]] 
# (2.3) fit X_opt to OLS (Ordinary Least Square) to get coefficients of this multilinear regression model
X_opt = X_opt.astype(np.float64) 
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()


array([[1, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [1, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [1, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [1, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [1, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [1, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [1, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [1, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [1, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [1, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [1, 0.0, 1.0, 94657.16, 145077.58, 282574.31],
       [1, 1.0, 0.0, 9

##### STEP 3 : Consider the predictor with the highest P-Value. if P>SL, go to STEP 4, otherwise go to FIN
_______________________

Question
- As we know that we will remove predictors which have the P>SL  so I this that in the first place when we assume hypothesis 
    - H0 is X1 is not impact to Y
    - H1 is X1 is impact to Y

so when the result come in like x1 have P<SL we will reject H0 and Accept H1 instead 
so H0 is X1 is not impact to Y in the first place right ?

Answer 

- Yes, you're correct. In hypothesis testing within the context of regression analysis, we typically start with the assumption that the null hypothesis (\( H_0 \)) is true,   which often implies that the predictor variable (\( X_1 \)) has no impact on the outcome variable (\( Y \)). 

    So, for example:

    - \( H_0 \): \( X_1 \) has no effect on \( Y \) (i.e., \( \beta_1 = 0 \))
    - \( H_1 \): \( X_1 \) has an effect on \( Y \) (i.e., \( \beta_1 <> 0 \))

    Then, we conduct hypothesis tests, typically using the t-test or F-test, to determine whether there is sufficient evidence to reject the null hypothesis in favor of the alternative hypothesis.

    If the p-value associated with the test statistic is less than the significance level (\( P < SL \)), we reject the null hypothesis (\( H_0 \)) and conclude that there is evidence to suggest that the predictor variable (\( X_1 \)) has a statistically significant impact on the outcome variable (\( Y \)).

    Therefore, when \( P < SL \), we indeed reject the initial assumption that \( X_1 \) has no impact on \( Y \) (i.e., \( H_0 \)), and we accept the alternative hypothesis (\( H_1 \)) that \( X_1 \) does have an impact on \( Y \). This supports the inclusion of \( X_1 \) in the model.

- Conversely, if \( P >= SL \), we fail to reject the null hypothesis (\( H_0 \)), suggesting that there is insufficient evidence to conclude that \( X_1 \) has a significant impact on \( Y \). In this case, we may consider removing \( X_1 \) from the model to improve simplicity and interpretability.

________________________________
Hi Thanabodee,
I'd be glad to help you with the error you're encountering in the context of backward elimination for multiple linear regression using the statsmodels.formula.api module in the Udemy course "Machine Learning A-Z: AI, Python & R + ChatGPT Prize [2024]".

Understanding the Error:

The error message TypeError: Model.from_formula() missing 2 required positional arguments: 'formula' and 'data' indicates that the sm.ols function you're using is expecting a formula string and a data frame as input, but you're providing endog (dependent variable) and exog (independent variables) directly.

Correcting the Code:

Here's the corrected code snippet that addresses this issue:

Python

import statsmodels.formula.api as smf
 
- Assuming y is your dependent variable and X_opt is your DataFrame with independent variables
-    formula = "y ~ " + " + ".join(X_opt.columns)  # Create formula string with all columns from X_opt
-   regressor_OLS = smf.ols(formula, data=X_opt).fit()
-       regressor_OLS.summary()
Explanation:

Import statsmodels.formula.api: This line ensures you're using the correct submodule for formula-based model creation.

Create Formula String: The formula variable constructs a string that specifies the regression model with y as the dependent variable and all columns of X_opt as independent variables, separated by plus signs (+).

Fit the Model: The smf.ols function now takes the formula string (formula) and the data frame (X_opt) as arguments, correctly fitting the model using formula-based specification.

Summary: Finally, you can call regressor_OLS.summary() to display the model summary.

Additional Considerations:

Make sure X_opt is a pandas DataFrame containing your independent variables.

Double-check the column names in X_opt to ensure they match the variable names you intend to use in the formula.

By following these steps, you should be able to successfully use the statsmodels.formula.api module for backward elimination in your Python code.

If you have further questions or need more specific guidance related to the course material, feel free to ask!

Regards,

Irum
__________________________________________

เราได้ค่า P-value ดังนี้

    ## X0 = Constant = 1
    ## X1 = California                  - Px1 = 0.953
    ## X2 = Florida      (x)            - Px2 = 0.990
    ## X3 = R & D Spend                 - Px3 = 0.000
    ## x4 = Administration Cost         - Px4 = 0.608
    ## x5 = Marketing Cost              - Px5 = 0.123






##### STEP 4 : Remove the preditor

เนื่องจากเรา Assume ว่าเราอยู่ใน Universe H0 : independent varaibles ของเราจะไม่มี Impact ต่อ y_pred

โดยเราจะตัดตัวแปรที่ไม่มีนัยยะสำคัญทางสถิติออกก่อน โดยพิจารณาตัวที่ P-value ที่สูงที่สุดออกก่อน

In [147]:
#Remove X2 : เนื่องจากเรา Accept H0 (X ไม่สัมพันธ์กับ y)

    ## X0 = Constant = 1
    ## X1 = California
    ## X2 = Florida      (x)
    ## X3 = R & D Spend
    ## x4 = Administration Cost
    ## x5 = Marketing Cost

X_opt = X[:, [0, 1, 3, 4, 5]]
X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Fri, 12 Apr 2024",Prob (F-statistic):,8.49e-29
Time:,19:05:56,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [152]:
#Remove X1 : เนื่องจากเรา Accept H0 (X ไม่สัมพันธ์กับ y)

    ## X0 = Constant = 1
    ## X1 = California   (x)
    ## X2 = Florida      (x)
    ## X3 = R & D Spend
    ## x4 = Administration Cost
    ## x5 = Marketing Cost

X_opt = X[:, [0, 3, 4, 5]]

X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Fri, 12 Apr 2024",Prob (F-statistic):,4.53e-30
Time:,19:11:17,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [148]:
#Remove X4 : เนื่องจากเรา Accept H0 (X ไม่สัมพันธ์กับ y) 
## Even it call x2 but it accuary is x4 because it rearrange new order.
 
    ## X0 = 1
    ## X1 = California   (x)
    ## X2 = Florida      (x)
    ## X3 = R & D Spend
    ## x4 = Administration Cost (x)
    ## x5 = Marketing Cost

X_opt = X[:, [0, 3, 5]]

X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Fri, 12 Apr 2024",Prob (F-statistic):,2.1600000000000003e-31
Time:,19:05:56,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [149]:
#Remove X5 : เนื่องจากเรา Accept H0 (X ไม่สัมพันธ์กับ y)
## Even it call x2 but it accuary is x4 because it rearrange new order.

    ## X0 = 1
    ## X1 = California   (x)
    ## X2 = Florida      (x)
    ## X3 = R & D Spend
    ## x4 = Administration Cost (x)
    ## x5 = Marketing Cost (x)

X_opt = X[:, [0, 3]]


X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Fri, 12 Apr 2024",Prob (F-statistic):,3.5000000000000004e-32
Time:,19:05:56,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


__________________________________

So we get y = b0x0 + b3x3 + e