In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [4]:
import os
# Folder that contains NSSO68.csv. Set the NSSO_DATA_DIR environment variable to run
# the notebook on another machine; the fallback is the folder originally hard-coded here.
DATA_DIR = os.environ.get("NSSO_DATA_DIR", "C:\\Users\\nithe\\OneDrive\\Desktop")
os.chdir(DATA_DIR)

In [5]:
# Load the survey file from the current working directory.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids mixed-type columns and the DtypeWarning that goes with them.
# NOTE(review): the captured output of this cell looks like such a warning - confirm.
data = pd.read_csv("NSSO68.csv", low_memory=False)

  data = pd.read_csv("NSSO68.csv")


In [6]:
# Distinct state codes present in the survey (used to pick the state to analyse)
state_column = data['state_1']
state_column.unique()

array(['GUJ', 'ORI', 'CHTSD', 'MP', 'JRKD', 'WB', 'AP', 'MH', 'D&D',
       'D&NH', 'MIZ', 'TRPR', 'MANPR', 'ASSM', 'MEG', 'NAG', 'A&N',
       'PNDCRY', 'TN', 'GOA', 'KA', 'KE', 'LKSDP', 'SKM', 'Bhr', 'UP',
       'RJ', 'ARP', 'DL', 'HR', 'Pun', 'HP', 'UT', 'Chandr', 'J$K'],
      dtype=object)

In [7]:
#Subsetting the data: rows with state code 'WB', restricted to the analysis columns
subset_columns = [
    'foodtotal_q',
    'MPCE_MRP',
    'MPCE_URP',
    'Age',
    'Meals_At_Home',
    'Possess_ration_card',
    'Education',
    'No_of_Meals_per_day',
]
# A single .loc call selects rows and columns together, and .copy() makes the result an
# independent frame, so the in-place imputation done later neither touches `data` nor
# triggers pandas' chained-assignment warnings.
subset_data = data.loc[data['state_1'] == 'WB', subset_columns].copy()
print(subset_data)

       foodtotal_q  MPCE_MRP  MPCE_URP  Age  Meals_At_Home  \
6218     31.463011   3088.75   3079.00   45           90.0   
6219     20.178901   1677.95   1481.00   50           60.0   
6220     14.793985    700.26    619.75   36           60.0   
6221     16.100309    700.26    657.67   58           60.0   
6222     24.567193   1070.02   1027.67   35           90.0   
...            ...       ...       ...  ...            ...   
57794    22.500550    981.47    982.33   30           90.0   
57795    23.120310    921.82   1039.60   47           60.0   
57796    23.275344    748.20    914.00   37           60.0   
57797    40.375775   1552.38   1833.00   25           90.0   
57798    17.585879    771.86    919.29   56           60.0   

       Possess_ration_card  Education  No_of_Meals_per_day  
6218                   1.0       10.0                  3.0  
6219                   1.0       10.0                  2.0  
6220                   1.0       10.0                  2.0  
6221       

In [8]:
#Checking for missing values
# Every count is taken from subset_data (the frame that is modelled). The original
# last line inspected the full `data` frame for 'Education', which reports the
# survey-wide count rather than the subset's.
for column in ['MPCE_MRP', 'MPCE_URP', 'Age', 'Possess_ration_card', 'Education']:
    print(subset_data[column].isna().sum())

0
0
0
0
7


In [9]:
#Creating a function to impute the missing values with the mean of the variable
def impute_with_mean(data, columns):
    """Fill missing values in the given columns with each column's own mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.
    columns : iterable of str
        Labels of the columns whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned so the call can be reassigned.
    """
    for column in columns:
        # Assign the filled column back to the frame. The previous form,
        # data[column].fillna(..., inplace=True), acts on an intermediate Series:
        # it warns on recent pandas and leaves `data` unchanged under Copy-on-Write.
        data[column] = data[column].fillna(data[column].mean())
    return data

In [10]:
#Columns whose missing values will be replaced by the column mean
columns_to_impute = [
    'Education',
    'MPCE_MRP',
    'MPCE_URP',
    'Age',
    'Meals_At_Home',
    'Possess_ration_card',
    'foodtotal_q',
]

In [11]:
# Mean-impute the selected columns of the subset
subset_data = impute_with_mean(data=subset_data, columns=columns_to_impute)

In [12]:
# Missing-value count per column of the subset after imputation
remaining_missing = subset_data.isna().sum()
print(remaining_missing)

foodtotal_q            0
MPCE_MRP               0
MPCE_URP               0
Age                    0
Meals_At_Home          0
Possess_ration_card    0
Education              0
No_of_Meals_per_day    5
dtype: int64


In [13]:
#Preparing the inputs of the regression model
# Response: foodtotal_q. Predictors: the six columns below plus an intercept column.
predictors = [
    'MPCE_MRP',
    'MPCE_URP',
    'Age',
    'Meals_At_Home',
    'Possess_ration_card',
    'Education',
]
y = subset_data['foodtotal_q']
X = sm.add_constant(subset_data[predictors])  # adds the 'const' (intercept) column


In [14]:
# Ordinary least squares of y on X; the fitted results object is kept in `model`
ols_spec = sm.OLS(endog=y, exog=X)
model = ols_spec.fit()

In [15]:
#Printing the regression results
regression_summary = model.summary()
print(regression_summary)

                            OLS Regression Results                            
Dep. Variable:            foodtotal_q   R-squared:                       0.159
Model:                            OLS   Adj. R-squared:                  0.158
Method:                 Least Squares   F-statistic:                     198.5
Date:                Sun, 23 Jun 2024   Prob (F-statistic):          1.70e-232
Time:                        22:13:58   Log-Likelihood:                -21836.
No. Observations:                6315   AIC:                         4.369e+04
Df Residuals:                    6308   BIC:                         4.373e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  13.2794    

In [16]:
#Checking for multicollinearity using the Variance Inflation Factor (VIF)
# One VIF is computed for every column of X (the added constant included).
vif_values = [
    variance_inflation_factor(X.values, column_position)
    for column_position in range(X.shape[1])
]
vif_data = pd.DataFrame({'feature': X.columns, 'VIF': vif_values})


In [17]:
# Show the VIF table: one row per column of X, including the added constant
print(vif_data)

               feature        VIF
0                const  60.407254
1             MPCE_MRP   3.369163
2             MPCE_URP   3.352720
3                  Age   1.037776
4        Meals_At_Home   1.059513
5  Possess_ration_card   1.035070
6            Education   1.265411


In [18]:
#Extracting the coefficients from the model
# model.params holds the fitted parameter estimates of the OLS model above;
# presumably ordered like the columns of X (const first) - verify against X.columns.
coefficients = model.params

In [19]:
#Constructing the equation
# `coefficients` is a label-indexed Series, so positions must be read with .iloc.
# Plain integer keys (coefficients[0]) rely on a positional fallback that pandas
# has deprecated and that raises KeyError once it is removed.
intercept = coefficients.iloc[0]
equation = f"y = {intercept:.2f}"
# x1, x2, ... follow the order of the remaining entries of `coefficients`
for i, slope in enumerate(coefficients.iloc[1:], start=1):
    equation += f" + {slope:.6f}*x{i}"
print(equation)


y = 13.28 + 0.000515*x1 + 0.000747*x2 + 0.076463*x3 + 0.092386*x4 + -2.277590*x5 + 0.192134*x6
