# Load and Explore Data Set

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston

# Load the dataset
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably needs an older scikit-learn -- confirm the
# pinned version before relying on it.
boston = load_boston()
data = pd.DataFrame(boston.data, columns=boston.feature_names)
# Append the dataset's target values as the MEDV column.
data['MEDV'] = boston.target

# Explore the dataset
print(data.head())
print(data.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only add a stray "None" line.
data.info()


In [None]:
# Boxplot of the MEDV column on a fresh 10x6 figure.
plt.figure(figsize=(10, 6))
# seaborn returns the Axes it drew on; title and label are set on it directly.
medv_ax = sns.boxplot(x=data['MEDV'])
medv_ax.set_title('Boxplot of Median Value of Owner-Occupied Homes (MEDV)')
medv_ax.set_xlabel('MEDV')
plt.show()


In [None]:
# Bar plot of how many rows fall into each CHAS value.
plt.figure(figsize=(10, 6))
# value_counts() orders its result by frequency, while the tick labels below
# are pinned to bar positions 0 and 1. Sorting by the CHAS value itself keeps
# the first bar for CHAS == 0 ("No") and the second for CHAS == 1 ("Yes")
# regardless of which value is more common.
chas_counts = data['CHAS'].value_counts().sort_index()
chas_counts.plot(kind='bar', color=['blue', 'green'])
plt.title('Bar Plot of Charles River Dummy Variable (CHAS)')
plt.xlabel('CHAS')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['No', 'Yes'], rotation=0)
plt.show()


In [None]:
# Discretize AGE into three groups
# pd.cut closes bins on the right by default, giving (0, 35], (35, 70] and
# (70, inf). A value of exactly 70 therefore lands in '35 to 70 years'.
bins = [0, 35, 70, np.inf]
labels = ['35 years and younger', '35 to 70 years', '70 years and older']
# include_lowest=True also closes the first bin on the left, so an AGE of
# exactly 0 is assigned to the first group instead of becoming NaN.
data['AGE_GROUP'] = pd.cut(
    data['AGE'], bins=bins, labels=labels, include_lowest=True
)

# Compare the MEDV distribution across the three AGE groups.
plt.figure(figsize=(12, 6))
sns.boxplot(x=data['AGE_GROUP'], y=data['MEDV'])
plt.title('Boxplot of MEDV vs. AGE Groups')
plt.xlabel('AGE Group')
plt.ylabel('MEDV')
plt.show()


In [None]:
# Scatter plot with INDUS on the x-axis and NOX on the y-axis.
plt.figure(figsize=(10, 6))
indus_values = data['INDUS']
nox_values = data['NOX']
# seaborn returns the Axes it drew on; title and labels are set on it directly.
scatter_ax = sns.scatterplot(x=indus_values, y=nox_values)
scatter_ax.set_title('Scatter Plot of Nitric Oxide Concentrations (NOX) vs. Proportion of Non-Retail Business Acres (INDUS)')
scatter_ax.set_xlabel('INDUS')
scatter_ax.set_ylabel('NOX')
plt.show()


In [None]:
# Histogram of PTRATIO in 20 bins with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
ptratio_values = data['PTRATIO']
# seaborn returns the Axes it drew on; title and labels are set on it directly.
hist_ax = sns.histplot(ptratio_values, bins=20, kde=True)
hist_ax.set_title('Histogram of Pupil-Teacher Ratio (PTRATIO)')
hist_ax.set_xlabel('PTRATIO')
hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
from scipy.stats import ttest_ind

# Split MEDV into the rows where CHAS is 1 and the rows where CHAS is 0.
bounded_river = data.loc[data['CHAS'] == 1, 'MEDV']
not_bounded_river = data.loc[data['CHAS'] == 0, 'MEDV']

# Independent two-sample t-test on the two MEDV groups (scipy defaults).
t_stat, p_value = ttest_ind(bounded_river, not_bounded_river)

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# Decide at the 5% significance level; `alpha` stays at module level because
# the later cells compare their own p-values against it.
alpha = 0.05
chas_conclusion = (
    "Reject the null hypothesis: There is a significant difference in MEDV between houses bounded and not bounded by the Charles River."
    if p_value <= alpha
    else "Fail to reject the null hypothesis: There is no significant difference in MEDV between houses bounded and not bounded by the Charles River."
)
print(chas_conclusion)


In [None]:
from scipy.stats import f_oneway

# One-way ANOVA of MEDV across the AGE groups: one MEDV sample per entry in
# `labels`, in the same order.
groups = [data.loc[data['AGE_GROUP'] == group_name, 'MEDV'] for group_name in labels]
f_stat, p_value = f_oneway(*groups)

print(f"F-statistic: {f_stat}")
print(f"P-value: {p_value}")

# Compare against the `alpha` threshold set in an earlier cell.
anova_conclusion = (
    "Reject the null hypothesis: There is a significant difference in MEDV across different AGE groups."
    if p_value <= alpha
    else "Fail to reject the null hypothesis: There is no significant difference in MEDV across different AGE groups."
)
print(anova_conclusion)


In [None]:
import statsmodels.api as sm

# Regress MEDV (dependent) on DIS (independent); add_constant supplies the
# intercept column.
y = data['MEDV']
X = sm.add_constant(data[['DIS']])

# Ordinary least squares fit and its full summary table.
model = sm.OLS(y, X).fit()
print(model.summary())

# Compare the DIS coefficient's p-value against the `alpha` threshold set in
# an earlier cell.
dis_p_value = model.pvalues['DIS']
dis_conclusion = (
    "Reject the null hypothesis: The distance has a significant impact on MEDV."
    if dis_p_value <= alpha
    else "Fail to reject the null hypothesis: The distance does not have a significant impact on MEDV."
)
print(dis_conclusion)
