### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import datetime
import scipy.stats
import warnings
# Silence every library warning so cell output stays readable.
warnings.filterwarnings(action='ignore')

# pandas display: never truncate the column list, allow wide console output.
pd.options.display.max_columns = None
pd.options.display.width = 1000

# NumPy: print floats in fixed-point notation and seed the legacy global RNG.
np.set_printoptions(suppress=True)
np.random.seed(0)

In [None]:
# Download the Boston housing CSV from the course's object storage and load it
# into the DataFrame used by every cell below (requires network access).
boston_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ST0151EN-SkillsNetwork/labs/boston_housing.csv'
boston_df = pd.read_csv(boston_url)

In [None]:
# Preview the first five rows to confirm the CSV parsed as expected.
boston_df.head()

### Exploratory Data Analysis

In [None]:
# Column dtypes, non-null counts and memory usage of the loaded frame.
boston_df.info()

### Exploratory Data Analysis

In [None]:
# Same dtype / non-null summary as the preceding cell (repeated check).
boston_df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
boston_df.describe()

In [None]:
# List the column labels available for the analysis.
boston_df.columns

### Data Visualization

In [None]:
# One 50-bin histogram per numeric column, laid out on a single 20x10 figure.
boston_df.hist(figsize=(20, 10), bins=50)
# Title sits slightly above the axes grid (y > 1) so it does not overlap them.
plt.suptitle('Feature Distribution', fontsize='large', ha='center', x=0.5, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots on a random subset of 250 rows to keep rendering fast.
# random_state pins the subset so re-running this cell redraws the same plot.
sns.pairplot(boston_df.sample(250, random_state=0))
# sns.pairplot is figure-level: it creates its own figure, which is now the
# current pyplot figure. The title is therefore set after the call; the
# plt.figure() that used to precede it only produced an empty, titled canvas.
plt.suptitle('Pairplots of features', x=0.5, y=1.02, ha='center', fontsize='large')
plt.show()

### Task 4: Generate Descriptive Statistics and Visualizations

In [None]:
# Horizontal box plot of MEDV, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x="MEDV", data=boston_df, ax=ax)
ax.set_title("Boxplot for MEDV")
plt.show()

In [None]:
# Count histogram of the CHAS indicator column.
plt.figure(figsize=(10,5))
# sns.histplot replaces the deprecated sns.distplot(..., kde=False); it plots
# plain counts with no density curve, matching the previous output.
sns.histplot(x=boston_df.CHAS, bins=10)
plt.title("Histogram for Charles river")
plt.show()

In [None]:
# Bucket AGE into three labelled groups stored in the age_group column.
# Rows whose AGE satisfies none of the conditions are left untouched.
age = boston_df["AGE"]
age_bins = [
    (age <= 35, '35 years and younger'),
    ((age > 35) & (age < 70), 'between 35 and 70 years'),
    (age >= 70, '70 years and older'),
]
for mask, label in age_bins:
    boston_df.loc[mask, 'age_group'] = label
boston_df.head()

In [None]:
# MEDV distribution per age_group; columns are referenced by name through
# `data` instead of passing the Series objects as well.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x="MEDV", y="age_group", data=boston_df, ax=ax)
ax.set_title("Boxplot for the MEDV variable vs the AGE variable")
plt.show()

In [None]:
# Scatter of NOX (x) against INDUS (y) to eyeball their association before
# the Pearson test further down.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(x="NOX", y="INDUS", data=boston_df, ax=ax)
ax.set_title("Relationship between NOX and INDUS")
plt.show()

In [None]:
# Count histogram of the PTRATIO column in 10 bins.
plt.figure(figsize=(10,5))
# sns.histplot replaces the deprecated sns.distplot(..., kde=False); it plots
# plain counts with no density curve, matching the previous output.
sns.histplot(x=boston_df.PTRATIO, bins=10)
plt.title("Histogram for the pupil to teacher ratio variable")
plt.show()

### Task 5: Use the appropriate tests to answer the questions provided

#### Is there a significant difference in the median value of houses between those bounded by the Charles river and those that are not? (T-test for independent samples)

In [None]:
# Number of rows per CHAS value, i.e. the sizes of the two groups compared below.
boston_df["CHAS"].value_counts()

In [None]:
# MEDV values of the rows with CHAS == 0. Despite the name, H0 is a data
# sample (one of the two t-test groups), not a hypothesis.
H0 = boston_df.loc[boston_df["CHAS"] == 0, "MEDV"]
H0

In [None]:
# MEDV values of the rows with CHAS == 1. Despite the name, H1 is a data
# sample (the other t-test group), not a hypothesis.
H1 = boston_df.loc[boston_df["CHAS"] == 1, "MEDV"]
H1

In [None]:
# Independent-samples t-test on MEDV between the CHAS == 0 sample (H0) and the
# CHAS == 1 sample (H1).
#   Null hypothesis:        both groups have the same mean MEDV.
#   Alternative hypothesis: the group means differ.
ALPHA = 0.05  # significance level used for both tests in this cell

# Levene's test checks the equal-variance assumption. If it is rejected, use
# Welch's t-test (equal_var=False) instead of the pooled-variance version.
levene_stat, levene_p = scipy.stats.levene(H0, H1, center='mean')
ttest_result = scipy.stats.ttest_ind(H0, H1, axis=0, equal_var=bool(levene_p >= ALPHA))

# Derive the conclusion from the computed p-value instead of hard-coding it in
# a comment, so the text can never disagree with the numbers.
print("Levene statistic={0}, P-value={1}".format(levene_stat, levene_p))
if ttest_result.pvalue < ALPHA:
    print("Reject the null hypothesis: mean MEDV differs significantly between the CHAS groups.")
else:
    print("Fail to reject the null hypothesis: no significant difference in mean MEDV between the CHAS groups.")
ttest_result

#### Is there a difference in Median values of houses (MEDV) for each proportion of owner occupied units built prior to 1940 (AGE)? (ANOVA)

In [None]:
# Frequency of each distinct AGE value. The ANOVA below works on the
# age_group buckets rather than on raw AGE.
boston_df["AGE"].value_counts()

In [None]:
# Re-apply the three AGE buckets to the age_group column (same thresholds and
# labels as the earlier visualization cell), so the ANOVA section can be run
# on its own.
age_years = boston_df["AGE"]
young_mask = age_years <= 35
middle_mask = (age_years > 35) & (age_years < 70)
old_mask = age_years >= 70
boston_df.loc[young_mask, 'age_group'] = '35 years and younger'
boston_df.loc[middle_mask, 'age_group'] = 'between 35 and 70 years'
boston_df.loc[old_mask, 'age_group'] = '70 years and older'

In [None]:
# Confirm the age_group column is present alongside the original columns.
boston_df.head()

In [None]:
# MEDV samples for the three age_group buckets, one Series per ANOVA group.
group_labels = boston_df["age_group"]
M0 = boston_df.loc[group_labels == '35 years and younger', "MEDV"]
M1 = boston_df.loc[group_labels == 'between 35 and 70 years', "MEDV"]
M2 = boston_df.loc[group_labels == '70 years and older', "MEDV"]

In [None]:
# One-way ANOVA across the three age-group MEDV samples; unpack the F statistic
# and its p-value for reporting.
f_stats, p_value = scipy.stats.f_oneway(M0,M1,M2,axis=0)

In [None]:
# Report the ANOVA result; a p-value below the chosen significance level means
# at least one age group's mean MEDV differs from the others.
print("F-Statistic={0}, P-value={1}".format(f_stats,p_value))

#### Can we conclude that there is no relationship between Nitric oxide concentrations and proportion of non-retail business acres per town? (Pearson Correlation)

In [None]:
# Pearson correlation between NOX and INDUS.
# NOTE: p_value is reassigned here and no longer holds the ANOVA p-value.
pearson,p_value = scipy.stats.pearsonr(boston_df["NOX"], boston_df["INDUS"])

In [None]:
# Report the correlation coefficient and the p-value of the test that the true
# correlation is zero.
print("Pearson Coefficient value={0}, P-value={1}".format(pearson,p_value))

In [None]:
# Interpretation of the printed coefficient and p-value: there exists a
# relationship between nitric oxide concentration (NOX) and the proportion of
# non-retail business acres per town (INDUS).

#### What is the impact of an additional weighted distance to the five Boston employment centres on the median value of owner occupied homes? (Regression analysis)

In [None]:
# Re-list the columns to pick the regression variables (DIS and MEDV below).
boston_df.columns

In [None]:
# Simple regression setup: DIS is the single predictor, MEDV the response.
x, y = boston_df['DIS'], boston_df['MEDV']

In [None]:
# Prepend a constant column so the OLS fit estimates an intercept as well as
# the DIS slope (statsmodels does not add one automatically).
x = sm.add_constant(x)

In [None]:
# Ordinary least squares fit of MEDV on [const, DIS]; the summary table shows
# the coefficients, their p-values and R-squared.
results = sm.OLS(endog=y, exog=x).fit()
results.summary()

In [None]:
# Correlation magnitude implied by the simple regression: the square root of
# the fitted model's R-squared. It is read from `results` instead of being
# typed in from the printed summary, so it stays correct (and unrounded) if
# the data or the model changes.
np.sqrt(results.rsquared)

### Correlation

In [None]:
# Pairwise Pearson correlations between the numeric columns only. The
# string-valued age_group column added earlier cannot be correlated and makes
# a bare DataFrame.corr() raise on pandas >= 2.0.
boston_df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix. Only numeric columns are used,
# because the string-valued age_group column makes a bare DataFrame.corr()
# raise on pandas >= 2.0.
plt.figure(figsize=(16,9))
numeric_corr = boston_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr,cmap="coolwarm",annot=True,fmt='.2f',linewidths=2, cbar=False)
plt.show()