In [58]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [59]:
import os

# Work from the current user's Downloads folder so the relative file names used
# below resolve there. expanduser('~') is used instead of a literal home path
# so the cell is not tied to one machine's user name.
os.chdir(os.path.expanduser('~/Downloads'))

In [60]:
# Load NSSO68.csv from the working directory.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is the documented remedy for the mixed-type
# DtypeWarning. NOTE(review): the recorded output only shows the echoed source
# line of a warning; confirm it was a DtypeWarning.
data = pd.read_csv("NSSO68.csv", low_memory=False)

  data = pd.read_csv("NSSO68.csv")


In [61]:
# Distinct state codes present in the 'state_1' column, in order of first appearance.
data.loc[:, 'state_1'].unique()

array(['GUJ', 'ORI', 'CHTSD', 'MP', 'JRKD', 'WB', 'AP', 'MH', 'D&D',
       'D&NH', 'MIZ', 'TRPR', 'MANPR', 'ASSM', 'MEG', 'NAG', 'A&N',
       'PNDCRY', 'TN', 'GOA', 'KA', 'KE', 'LKSDP', 'SKM', 'Bhr', 'UP',
       'RJ', 'ARP', 'DL', 'HR', 'Pun', 'HP', 'UT', 'Chandr', 'J$K'],
      dtype=object)

In [62]:
#Subsetting the data
# Keep only the rows whose state code is 'ORI' and the columns used downstream.
# A single .loc selection followed by an explicit .copy() yields an independent
# frame, so later modifications (e.g. imputation) cannot be mistaken for writes
# to a slice of `data` and do not raise SettingWithCopyWarning.
subset_columns = [
    'foodtotal_q',
    'MPCE_MRP',
    'MPCE_URP',
    'Age',
    'Meals_At_Home',
    'Possess_ration_card',
    'Education',
    'No_of_Meals_per_day',
]
subset_data = data.loc[data['state_1'] == 'ORI', subset_columns].copy()
print(subset_data)

       foodtotal_q  MPCE_MRP  MPCE_URP  Age  Meals_At_Home  \
741      33.110413   3844.95   3455.50   31           60.0   
742      31.683645   2377.28   2572.67   42           60.0   
743      25.575244   2039.86   1792.75   53           60.0   
744      24.920166    970.04    880.00   60           60.0   
745      24.742780    935.56    854.50   35           90.0   
...            ...       ...       ...  ...            ...   
87695    27.500300    966.50    926.00   61           90.0   
87696    39.626475   5022.53   1859.83   72           60.0   
87697    20.333953   2050.26   2006.33   30           80.0   
87698    26.916975   1176.12   1422.17   48           90.0   
87699    26.933683    715.75    634.33   55           90.0   

       Possess_ration_card  Education  No_of_Meals_per_day  
741                    2.0       12.0                  2.0  
742                    1.0       12.0                  2.0  
743                    1.0       10.0                  2.0  
744        

In [63]:
#Checking for missing values
# Print the NaN count of each listed column, one number per line. Every column
# is read from subset_data (the frame that is imputed and modelled); the
# Education count previously came from the full `data` frame by mistake.
for column in ['MPCE_MRP', 'MPCE_URP', 'Age', 'Possess_ration_card', 'Education']:
    print(subset_data[column].isna().sum())

0
0
0
0
7


In [64]:
#Creating a function to impute the missing values with the mean of the variable
def impute_with_mean(data, columns):
    """Fill missing values in the given columns with each column's mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    columns : iterable of str
        Names of the columns whose NaN entries are replaced by that column's
        mean (computed by pandas, which skips NaN by default).

    Returns
    -------
    pandas.DataFrame
        The same `data` object, after imputation.
    """
    for column in columns:
        # Assign the filled column back instead of calling
        # data[column].fillna(..., inplace=True): the in-place call acts on a
        # temporary column selection, which pandas deprecates and which does
        # not update `data` at all when Copy-on-Write is enabled.
        data[column] = data[column].fillna(data[column].mean())
    return data

In [65]:
#Columns to impute
columns_to_impute = [
    'Education',
    'MPCE_MRP',
    'MPCE_URP',
    'Age',
    'Meals_At_Home',
    'Possess_ration_card',
    'foodtotal_q',
]

In [66]:
# Mean-impute the selected columns of the subset and keep the returned frame.
subset_data = impute_with_mean(data=subset_data, columns=columns_to_impute)

In [67]:
# Per-column count of values that are still missing after imputation.
print(subset_data.isna().sum(axis=0))

foodtotal_q            0
MPCE_MRP               0
MPCE_URP               0
Age                    0
Meals_At_Home          0
Possess_ration_card    0
Education              0
No_of_Meals_per_day    2
dtype: int64


In [68]:
#Preparing the inputs of the regression model
predictors = [
    'MPCE_MRP',
    'MPCE_URP',
    'Age',
    'Meals_At_Home',
    'Possess_ration_card',
    'Education',
]
# Design matrix: the predictor columns plus a constant (intercept) term.
X = sm.add_constant(subset_data[predictors])
# Response variable.
y = subset_data['foodtotal_q']


In [69]:
# Ordinary least squares fit of y on the design matrix X.
model = sm.OLS(endog=y, exog=X).fit()

In [70]:
#Printing the regression results
print(model.summary().as_text())

                            OLS Regression Results                            
Dep. Variable:            foodtotal_q   R-squared:                       0.233
Model:                            OLS   Adj. R-squared:                  0.232
Method:                 Least Squares   F-statistic:                     203.9
Date:                Sun, 23 Jun 2024   Prob (F-statistic):          1.21e-227
Time:                        22:33:43   Log-Likelihood:                -13277.
No. Observations:                4026   AIC:                         2.657e+04
Df Residuals:                    4019   BIC:                         2.661e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   7.9462    

In [71]:
#Checking for multicollinearity using the Variance Inflation Factor (VIF)
design_values = X.values
vif_data = pd.DataFrame({
    'feature': X.columns,
    # One VIF per column of the design matrix, in column order.
    'VIF': [
        variance_inflation_factor(design_values, column_position)
        for column_position in range(X.shape[1])
    ],
})


In [72]:
# Display the feature/VIF table built in the previous cell.
print(vif_data)

               feature        VIF
0                const  60.722903
1             MPCE_MRP   3.070729
2             MPCE_URP   2.991975
3                  Age   1.073295
4        Meals_At_Home   1.122845
5  Possess_ration_card   1.157489
6            Education   1.338419


In [73]:
#Extracting the coefficients from the model
# `params` holds the fitted coefficient estimates of the OLS results object.
# NOTE(review): presumably ordered like the columns of X (constant first) — confirm.
coefficients = model.params

In [74]:
#Constructing the equation
# Walk the coefficient values by position rather than indexing the object with
# integer keys: `coefficients` is presumably a label-indexed pandas Series, and
# integer lookups via [] on such a Series are deprecated. The first value is
# treated as the intercept; the rest are labelled x1, x2, ... in order.
coefficient_values = list(coefficients)
equation = f"y = {coefficient_values[0]:.2f}"
for i, slope in enumerate(coefficient_values[1:], start=1):
    equation += f" + {slope:.6f}*x{i}"
print(equation)


y = 7.95 + 0.002054*x1 + 0.000921*x2 + 0.107187*x3 + 0.090582*x4 + -0.852766*x5 + 0.131429*x6
