In [1]:
import pandas as pd
import statsmodels.api as sm

# Load the Boston Housing dataset.
# na_values='?' tells pandas to parse any '?' placeholder as NaN *while reading*,
# so the affected columns are still loaded with a numeric dtype. Replacing '?'
# after loading would leave those columns as object/str columns, which cannot be
# used as numeric regressors.
DATA_PATH = '/content/HousingData.csv'
boston_raw = pd.read_csv(DATA_PATH, na_values='?')

# Display the first few rows of the raw (uncleaned) frame to understand its structure
print(boston_raw.head())

# Drop every row that has a missing value in any column. A new frame is created
# instead of mutating in place, so re-running this cell always starts from
# boston_raw and yields the same boston_data.
# NOTE(review): this also discards rows whose only gap is in a column the model
# does not use (e.g. LSTAT). If keeping more observations matters, consider
# boston_raw.dropna(subset=FEATURES + [TARGET]) -- that changes the fitted numbers.
boston_data = boston_raw.dropna()

# Independent variables:
#   'RM'   - average number of rooms
#   'CRIM' - per capita crime rate by town
#   'AGE'  - proportion of owner-occupied units built prior to 1940
FEATURES = ['RM', 'CRIM', 'AGE']

# Dependent variable: MEDV is the median value of owner-occupied homes in $1000s
TARGET = 'MEDV'

X = boston_data[FEATURES]
Y = boston_data[TARGET]

# sm.OLS does not add an intercept on its own; add the constant column explicitly
X = sm.add_constant(X)

# Fit the multiple linear regression model
model = sm.OLS(Y, X).fit()

# Print the summary of the regression model
print(model.summary())


      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622    3  222     18.7   

        B  LSTAT  MEDV  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90    NaN  36.2  
                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.610
Model:                            OLS   Adj. R-squared:                  0.607
Method:                 Least Squares   F-statistic:                     203.6
Date:                Tue, 03 Sep 2024   Prob (F-stati