Statistics for Data Science with Python 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [None]:

# Load the dataset
#
# `sklearn.datasets.load_boston` was removed in scikit-learn 1.2; importing it
# there raises ImportError. Use the legacy loader when it is still available,
# otherwise rebuild the same table from the original StatLib file (the
# replacement recipe given in scikit-learn's removal notice).
try:
    from sklearn.datasets import load_boston
except ImportError:
    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    feature_names = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ]
    # The raw file has a 22-line header, and every record is wrapped over two
    # physical lines: 11 values on the first, then 2 more features plus the
    # target (MEDV) on the second.
    raw = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
    data = np.hstack([raw.values[::2, :], raw.values[1::2, :2]])
    target = raw.values[1::2, 2]
else:
    boston = load_boston()
    data = boston.data
    feature_names = boston.feature_names
    target = boston.target

# Create DataFrame
df = pd.DataFrame(data, columns=feature_names)
df['MEDV'] = target

# Task 1: Dataset Familiarity
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
df.info()
print(df.describe())



In [None]:
# Task 2: Descriptive Statistics and Visualizations

# Boxplot for MEDV
# Single vertical box showing the spread of the MEDV column, drawn on an
# explicitly created axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(y=df['MEDV'], ax=ax)
ax.set_title('Boxplot of Median Value of Homes')
ax.set_ylabel('MEDV ($1000s)')
plt.show()

# Bar plot for CHAS
# Count how many rows fall under each CHAS value, then draw one bar per value.
chas_counts = df['CHAS'].value_counts()
fig, ax = plt.subplots(figsize=(6, 4))
chas_counts.plot(kind='bar', ax=ax)
ax.set_title('Bar Plot of Charles River Variable')
ax.set_xlabel('Bounds Charles River (1 = Yes, 0 = No)')
ax.set_ylabel('Number of Tracts')
plt.show()

# Boxplot of MEDV vs AGE groups
# Bucket AGE into three labelled bands. pd.cut uses right-closed intervals by
# default, so the bands are (0, 35], (35, 70] and (70, 100].
age_edges = [0, 35, 70, 100]
age_labels = ['<=35', '36-70', '>70']
age_bins = pd.cut(df['AGE'], bins=age_edges, labels=age_labels)
# Stored on df because the ANOVA further down reads the AGE_GROUP column.
df['AGE_GROUP'] = age_bins

fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='AGE_GROUP', y='MEDV', ax=ax)
ax.set_title('MEDV by Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('MEDV ($1000s)')
plt.show()

# Scatter plot: NOX vs INDUS
# INDUS on the horizontal axis, NOX on the vertical axis, one point per row.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='INDUS', y='NOX', ax=ax)
ax.set_title('NOX vs INDUS')
ax.set_xlabel('Non-retail business acres per town (INDUS)')
ax.set_ylabel('Nitric Oxides Concentration (NOX)')
plt.show()

# Histogram for PTRATIO
# 15 equal-width bins over the PTRATIO column.
fig, ax = plt.subplots(figsize=(6, 4))
df['PTRATIO'].hist(bins=15, ax=ax)
ax.set_title('Histogram of Pupil-Teacher Ratio')
ax.set_xlabel('Pupil-Teacher Ratio')
ax.set_ylabel('Frequency')
plt.show()


# Task 3: Statistical Tests

# T-test: MEDV for CHAS
# Split MEDV by the CHAS indicator and compare the two groups with an
# independent two-sample t-test (scipy's default settings).
chas_1 = df.loc[df['CHAS'] == 1, 'MEDV']
chas_0 = df.loc[df['CHAS'] == 0, 'MEDV']
t_stat, p_val = stats.ttest_ind(chas_1, chas_0)

print("T-test: MEDV ~ CHAS")
print(f"t-statistic = {t_stat:.4f}, p-value = {p_val:.4f}")
# Report the outcome at the 5% significance level.
print(
    "Conclusion: Significant difference in MEDV for CHAS groups"
    if p_val < 0.05
    else "Conclusion: No significant difference in MEDV for CHAS groups"
)

# ANOVA: MEDV ~ AGE_GROUP
# One-way ANOVA of MEDV across the AGE_GROUP bands, fitted through the formula
# API and summarised as a type-II ANOVA table.
anova_model = ols('MEDV ~ AGE_GROUP', data=df).fit()
anova_table = sm.stats.anova_lm(anova_model, typ=2)
print("\nANOVA: MEDV ~ AGE_GROUP")
print(anova_table)
# The table is indexed by term name, so the first row must be selected
# positionally with .iloc. A plain `[0]` on a label-indexed Series relies on a
# positional fallback that pandas deprecated in 2.1 and removed in 3.0.
if anova_table['PR(>F)'].iloc[0] < 0.05:
    print("Conclusion: Significant difference in MEDV across AGE groups")
else:
    print("Conclusion: No significant difference in MEDV across AGE groups")

# Pearson Correlation: NOX vs INDUS
# Linear correlation between the NOX and INDUS columns, with its two-sided
# p-value.
corr, p_value = stats.pearsonr(df['NOX'], df['INDUS'])
print("\nPearson Correlation: NOX vs INDUS")
# The p-value uses general format (.4g) rather than fixed-point: a very small
# but non-zero p-value would otherwise be printed as a misleading "0.0000".
print(f"Correlation = {corr:.4f}, p-value = {p_value:.4g}")
if p_value < 0.05:
    print("Conclusion: Significant relationship between NOX and INDUS")
else:
    print("Conclusion: No significant relationship between NOX and INDUS")

# Regression: DIS on MEDV
# Ordinary least squares with MEDV as the response and DIS (plus an intercept
# column added by add_constant) as the only predictor.
X = sm.add_constant(df['DIS'])
model = sm.OLS(df['MEDV'], X).fit()

print("\nRegression Analysis: MEDV ~ DIS")
print(model.summary())

# Pick the conclusion from the DIS coefficient's p-value at the 5% level.
conclusions = {
    True: "Conclusion: DIS has a significant impact on MEDV",
    False: "Conclusion: DIS has no significant impact on MEDV",
}
print(conclusions[bool(model.pvalues['DIS'] < 0.05)])
