In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
data = pd.read_csv("/content/NSSO68.csv")

In [5]:
#subset daa for GUJ
subset_data = data[data['state_1'] == 'GUJ'][['foodtotal_q', 'MPCE_MRP', 'MPCE_URP', 'Age', 'Meals_At_Home', 'Possess_ration_card', 'Education', 'No_of_Meals_per_day']]
print(subset_data.head())

   foodtotal_q  MPCE_MRP  MPCE_URP  Age  Meals_At_Home  Possess_ration_card  \
0    30.942394   3662.65   3304.80   50           59.0                    1   
1    29.286153   5624.51   7613.00   40           56.0                    1   
2    31.527046   3657.18   3461.40   45           60.0                    1   
3    27.834607   3260.37   3339.00   75           60.0                    1   
4    27.600713   2627.54   2604.25   30           59.0                    1   

   Education  No_of_Meals_per_day  
0          8                    2  
1         12                    2  
2          7                    2  
3          6                    2  
4          7                    2  


In [6]:
# Check for missing values
print(subset_data.isnull().sum())

foodtotal_q            0
MPCE_MRP               0
MPCE_URP               0
Age                    0
Meals_At_Home          0
Possess_ration_card    0
Education              0
No_of_Meals_per_day    0
dtype: int64


In [7]:
# Drop rows with any missing values
subset_data.dropna(inplace=True)

In [9]:
# Define the independent variables (X) and dependent variable (y)
X = subset_data[['MPCE_URP', 'Age', 'Meals_At_Home', 'Possess_ration_card', 'Education', 'foodtotal_q']]
y = subset_data['MPCE_MRP']
# Add constant to the features
X = sm.add_constant(X)

# Split data into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Fit the OLS model
model = sm.OLS(y_train, X_train).fit()

# Print model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               MPCE_MRP   R-squared:                       0.509
Model:                            OLS   Adj. R-squared:                  0.504
Method:                 Least Squares   F-statistic:                     100.9
Date:                Sun, 23 Jun 2024   Prob (F-statistic):           5.97e-87
Time:                        17:45:07   Log-Likelihood:                -5115.4
No. Observations:                 592   AIC:                         1.024e+04
Df Residuals:                     585   BIC:                         1.028e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                 640.1751    

In [13]:
# Check for multicollinearity using VIF
def calculate_vif(X):
    vif = pd.DataFrame()
    vif["Variable"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif

In [14]:
# ipython-input-13-7d005dd0a72d
!pip install statsmodels
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Check for multicollinearity using VIF
def calculate_vif(X):
    vif = pd.DataFrame()
    vif["Variable"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif



In [15]:
# Calculate VIF for independent variables
vif_data = X_train.drop(columns=['const'])  # Exclude the constant column
vif_scores = calculate_vif(vif_data)
print(vif_scores)


              Variable        VIF
0             MPCE_URP   3.198491
1                  Age  10.318819
2        Meals_At_Home  17.285133
3  Possess_ration_card   7.822056
4            Education   5.909523
5          foodtotal_q  12.272383


In [16]:
# Extract coefficients from the model
coefficients = model.params

# Construct the equation
equation = "y = {:.2f}".format(coefficients['const'])
for i in range(1, len(coefficients)):
    equation += " + {:.6f}*{}".format(coefficients[i], X_train.columns[i])

# Print the equation
print(equation)

y = 640.18 + 0.382287*MPCE_URP + -1.910274*Age + -28.281442*Meals_At_Home + 244.922842*Possess_ration_card + 118.555642*Education + 63.978323*foodtotal_q
