In [2]:
import pandas as pd
import scipy.stats
import statsmodels.api as sm
import statsmodels.formula.api as ols
import seaborn as sns
import matplotlib.pyplot as plt

boston_df=pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ST0151EN-SkillsNetwork/labs/boston_housing.csv')
if 'Unnamed: 0' in boston_df.columns:
    boston_df.drop(labels= 'Unnamed: 0',axis= 'columns', inplace= True)
else:
    pass
boston_df.head(5)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,5.33,36.2


In [17]:
boston_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  LSTAT    506 non-null    float64
 12  MEDV     506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


# Hypothesis Testing

We will set the Alpha to 0.05

### Is there a significant difference in the median value of houses bounded by the Charles river or not?

### T Test

In [25]:
#Check if the variance is equal
levene_result = scipy.stats.levene(
    boston_df[boston_df['CHAS'] == 0]['MEDV']
    ,boston_df[boston_df['CHAS'] == 1]['MEDV']
    ,center= 'median'
)
print(f'P-value is {levene_result.pvalue:0.4f} which is less than alpha(0.05) therefore we will reject the null hypothesis and set the equal_var to False')


P-value is 0.0326 which is less than alpha(0.05) therefore we will reject the null hypothesis and set the equal_var to False


In [26]:
Ttest_result = scipy.stats.ttest_ind(
    boston_df[boston_df['CHAS'] == 0]['MEDV']
    ,boston_df[boston_df['CHAS'] == 1]['MEDV']
    ,equal_var= False
)
print(Ttest_result)
print(f'Since the P-value is less than alpha we will reject the null hypothesis, There\'s significant difference in median home prices between homes that are near the Charles River and those that are not.')

TtestResult(statistic=-3.113291312794837, pvalue=0.0035671700981375174, df=36.876408797611994)
Since the P-value is less than alpha we will reject the null hypothesis, There's significant difference in median home prices between homes that are near the Charles River and those that are not.


### Is there a difference in median values of houses of each proportion of owner-occupied units built before 1940?

- **Null Hypothesis:** There's no difference between datas.
- **Alternative Hypothesis:** There's difference between the datas.

In [35]:
df1 = boston_df.copy()
df1['AGE_Group'] = df1['AGE'].apply(lambda x:'Younger Age' if x <= 35 else 'Middle Age' if x <= 70 else 'Older Age')

scipy.stats.f_oneway(
    df1[df1['AGE_Group'] == 'Younger Age']['MEDV']
    ,df1[df1['AGE_Group'] == 'Middle Age']['MEDV']
    ,df1[df1['AGE_Group'] == 'Older Age']['MEDV']
)

F_onewayResult(statistic=36.40764999196599, pvalue=1.7105011022702984e-15)

### Regression Analysis

In [37]:
X = pd.get_dummies(df1['AGE_Group'], dtype= float)
y = df1['MEDV']
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
predictions = model.predict(X)

model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.125
Model:,OLS,Adj. R-squared:,0.12
Method:,Least Squares,F-statistic:,23.94
Date:,"Fri, 09 May 2025",Prob (F-statistic):,1.69e-14
Time:,06:30:21,Log-Likelihood:,-1806.4
No. Observations:,506,AIC:,3621.0
Df Residuals:,502,BIC:,3638.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.124e+13,1.47e+14,0.621,0.535,-1.98e+14,3.8e+14
Middle Age,-9.124e+13,1.47e+14,-0.621,0.535,-3.8e+14,1.98e+14
Older Age,-9.124e+13,1.47e+14,-0.621,0.535,-3.8e+14,1.98e+14
Younger Age,-9.124e+13,1.47e+14,-0.621,0.535,-3.8e+14,1.98e+14

0,1,2,3
Omnibus:,161.218,Durbin-Watson:,0.639
Prob(Omnibus):,0.0,Jarque-Bera (JB):,406.705
Skew:,1.613,Prob(JB):,4.84e-89
Kurtosis:,5.981,Cond. No.,921000000000000.0


### Can we conclude that there is no relationship between Nitric oxide concentrations and the proportion of non-retail business acres per town?

- **Null Hypothesis:** There's no correlation between two data.
- **Alternative Hypothesis:** There's correlation between the two data.

### Pearson Correlation Test

In [31]:
Pearson_result = scipy.stats.pearsonr(
    boston_df['NOX']
    ,boston_df['INDUS']
    )
print(Pearson_result)
print(f'Since pvalue {Pearson_result.pvalue} is less than alpha 0.05 we will reject the null hypothesis and conclude that there\'s positive correlation between two data based on the pearson coefficient {Pearson_result.correlation}.')

PearsonRResult(statistic=0.763651446920915, pvalue=7.913361061241532e-98)
Since pvalue 7.913361061241532e-98 is less than alpha 0.05 we will reject the null hypothesis and conclude that there's positive correlation between two data based on the pearson coefficient 0.763651446920915.


### Regression Analysis

In [32]:
X = boston_df['NOX']
y = boston_df['INDUS']
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
predictions = model.predict(X)

model.summary()

0,1,2,3
Dep. Variable:,INDUS,R-squared:,0.583
Model:,OLS,Adj. R-squared:,0.582
Method:,Least Squares,F-statistic:,705.1
Date:,"Fri, 09 May 2025",Prob (F-statistic):,7.91e-98
Time:,05:55:22,Log-Likelihood:,-1470.5
No. Observations:,506,AIC:,2945.0
Df Residuals:,504,BIC:,2954.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-13.9414,0.965,-14.450,0.000,-15.837,-12.046
NOX,45.2108,1.703,26.554,0.000,41.866,48.556

0,1,2,3
Omnibus:,46.05,Durbin-Watson:,0.272
Prob(Omnibus):,0.0,Jarque-Bera (JB):,67.622
Skew:,0.649,Prob(JB):,2.07e-15
Kurtosis:,4.233,Cond. No.,11.3


What is the impact of an additional weighted distance to the five Boston employment centres on the median value of owner-occupied homes?

- **Null Hypothesis:** DIS have no impact to MEDV.
- **Alternative Hypothesis:** DIS have impact to MEDV.

In [40]:
X = boston_df['DIS']
y = boston_df['MEDV']
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
predictions = model.predict(X)

model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.062
Model:,OLS,Adj. R-squared:,0.061
Method:,Least Squares,F-statistic:,33.58
Date:,"Fri, 09 May 2025",Prob (F-statistic):,1.21e-08
Time:,06:47:57,Log-Likelihood:,-1823.9
No. Observations:,506,AIC:,3652.0
Df Residuals:,504,BIC:,3660.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,18.3901,0.817,22.499,0.000,16.784,19.996
DIS,1.0916,0.188,5.795,0.000,0.722,1.462

0,1,2,3
Omnibus:,139.779,Durbin-Watson:,0.57
Prob(Omnibus):,0.0,Jarque-Bera (JB):,305.104
Skew:,1.466,Prob(JB):,5.59e-67
Kurtosis:,5.424,Cond. No.,9.32


In [42]:
import math
print(f'correlation coef: {math.sqrt(0.062)}')

correlation coef: 0.24899799195977465
