# 1. Explore Data

## Import required libraries and schema

In [0]:
import numpy as np #for scientific computation
import pandas as pd #for data analysis and data manipulation
import matplotlib.pyplot as plt #for Data visualization (2D, 3D)
import seaborn as sns #for statistical data visualization and it is build on top of matplotlib

## Load the Data into a dataframe

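`pyspark.sql.types.StructField(name, dataType, nullable=True)` describes a single column: its name, its data type, and whether it may contain nulls.

A `StructType` is the schema as a whole: an ordered list of `StructField` objects, one per column.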

In [0]:
# Schema for the housing dataset: nine nullable double columns followed by
# one nullable string column.

from pyspark.sql.types import DoubleType, StringType, StructType, StructField

numeric_columns = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
]

schema = StructType(
    [StructField(column, DoubleType(), True) for column in numeric_columns]
    + [StructField("ocean_proximity", StringType(), True)]
)

# Read the CSV with the explicit schema instead of inferring column types.
# NOTE(review): no header option is passed; if housing.csv begins with a
# column-name line, that line is read as a data row - confirm against the file.
housing_df = spark.read.csv("/FileStore/tables/housing.csv", schema=schema)

## Display the loaded data frame

In [0]:
# Render the loaded DataFrame as a table.
# NOTE(review): display() is not imported in this notebook; presumably the
# notebook environment provides it - confirm.
display(housing_df)

## Check the dataframe type

In [0]:
type(housing_df)
# Expected output: a Spark DataFrame, because housing_df was created by spark.read.csv

## Convert the pyspark dataframe to pandas dataframe

In [0]:
# Convert the Spark DataFrame to a pandas DataFrame, reusing the same name.
# The isinstance guard makes the cell safe to re-run: once housing_df is
# already a pandas DataFrame it has no toPandas() method, so an unguarded
# second run would raise AttributeError.
if not isinstance(housing_df, pd.DataFrame):
    housing_df = housing_df.toPandas()

In [0]:
type(housing_df)

# After toPandas() the type is now a pandas DataFrame

## Display few lines of dataframe

In [0]:
housing_df.head() # first rows of the frame; head() defaults to 5 rows

In [0]:
# Pass an explicit row count to see more than the default
housing_df.head(10)

## Determine the dimensions

In [0]:
housing_df.shape

# Returns a tuple: (number of rows, number of columns)
# shape is a property, not a method, so it is accessed without parentheses

## Summary of dataframe

The info() method prints information about the DataFrame.

The information contains the number of columns, column labels, column data types, memory usage, range index, and the number of non-null values in each column.

The info() method actually prints the info. You do not use the print() method to print the info.

The info() method does not return any value, it prints the information.

In [0]:
# info() prints its summary itself, so the cell needs no print() or display()
housing_df.info()

## Statistics summary of numerical column

Get summary statistics of the Pandas dataframe by using the describe() method. 

In [0]:
# describe() returns the summary table, which the notebook shows as the cell output
housing_df.describe()

## Number of non-null values in each column

In [0]:
housing_df.count()

# The axis parameter is optional; the default axis=0 counts non-null values per column

## Retrieve column names

In [0]:
# Column labels of the DataFrame
housing_df.columns

## Examine data type of each column

In [0]:
# Data type of each column
housing_df.dtypes

## Find the correlation (relationship) between each column in the DataFrame

In [0]:
# Correlate the numeric columns only. ocean_proximity is a string column, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older versions
# silently skipped such columns, so the result is unchanged there).
housing_df.select_dtypes(include="number").corr()

# Range is -1 to 1. Positive correlation means that as one variable increases, the other variable tends to increase. Negative correlation means that as one variable increases, the other variable tends to decrease.

## Remove duplicate values

In [0]:
# Store original shape of dataframe
original_shape = housing_df.shape

# Drop exact duplicate rows (the first occurrence of each row is kept)
housing_df = housing_df.drop_duplicates()

# Rows removed by drop_duplicates() are the duplicated rows
duplicated_rows = original_shape[0] - housing_df.shape[0]

# Print number of duplicated rows
print("Number of duplicated rows: {}".format(duplicated_rows))

# Rows with missing values are dropped as well, so the cells below work on
# complete rows. They are counted separately so they are not reported as
# duplicates.
rows_before_dropna = housing_df.shape[0]
housing_df = housing_df.dropna()
rows_with_missing_values = rows_before_dropna - housing_df.shape[0]

print("Number of rows with missing values: {}".format(rows_with_missing_values))

## Check for missing values

In [0]:
# Is there at least one missing value anywhere in the frame?
# True means yes, the data contains missing values
housing_df.isna().to_numpy().any()

In [0]:
# Total number of missing values across the whole frame
housing_df.isna().to_numpy().sum()

In [0]:
# Missing-value count for each column
housing_df.isna().sum()

# 2. Visualize the data

## Correlation matrix and heatmap

### Create a new dataframe without dependent column
Here the dependent variable is median_house_value; this is the column we are going to predict. All other columns are independent (feature) columns.

In [0]:
# Preview the frame before the target column is removed
housing_df.head()

In [0]:
# Feature-only copy: everything except the target column
housing_df_2 = housing_df.drop(columns='median_house_value')  # columns= drops by column label

In [0]:
# Preview the feature-only frame (median_house_value removed)
housing_df_2.head()

### Visualize correlation (relationship) between target variable(median_house_value) and other independent columns 

In [0]:
# Correlate each numeric feature with the target. ocean_proximity is a string
# column, so it is excluded first: corrwith() raises on it in pandas >= 2.0
# (older versions skipped it, so the result is unchanged there).
numeric_features = housing_df_2.select_dtypes(include="number")
correlation_value = numeric_features.corrwith(housing_df['median_house_value'])
sorted_correlation_value = correlation_value.sort_values(ascending=False)

# One bar per feature, ordered from most positive to most negative correlation
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(sorted_correlation_value.index, sorted_correlation_value.values)

ax.set_xlabel('Features')
ax.set_ylabel('Correlation')
ax.set_title('Correlation between features and target variable')

# Tilt the feature names so they do not overlap
plt.xticks(rotation=45)
ax.grid(True)

plt.show()

Color each bar by the sign of its correlation: blue for positive values, red otherwise.

In [0]:
# Correlate each numeric feature with the target. ocean_proximity is a string
# column, so it is excluded first: corrwith() raises on it in pandas >= 2.0
# (older versions skipped it, so the result is unchanged there).
numeric_features = housing_df_2.select_dtypes(include="number")
correlation_value = numeric_features.corrwith(housing_df['median_house_value'])
sorted_correlation_value = correlation_value.sort_values(ascending=False)

# Blue for positive correlations, red for everything else
bar_colors = ['blue' if value > 0 else 'red' for value in sorted_correlation_value.values]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(sorted_correlation_value.index, sorted_correlation_value.values,
       color=bar_colors)

ax.set_xlabel('Features')
ax.set_ylabel('Correlation')
ax.set_title('Correlation between features and target variable')

# Tilt the feature names so they do not overlap
plt.xticks(rotation=45)
ax.grid(True)

plt.show()

Conclusion: the feature most strongly correlated with median_house_value is median_income.

### Display Correlation Heatmap

In [0]:
# Correlation matrix over the numeric columns only. ocean_proximity is a string
# column, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0
# (older versions skipped it, so the matrix is unchanged there).
corr = housing_df.select_dtypes(include="number").corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidth=0.5) # fmt='.2f' prints each value with 2 decimal places

plt.title('Correlation Heatmap')
plt.xticks(rotation=45) # tilt the x-axis labels by 45 degrees
plt.show()

## Histogram of numerical values

In [0]:
# Histogram of the target column, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(housing_df['median_house_value'], bins=50, color='skyblue', edgecolor='black')

# Labels, title and grid
ax.set_xlabel('Median House Value')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of Median House Value')
ax.grid(True)

plt.show()


## Scatter Plot of two numerical variables

In [0]:
# Scatter plot of median income against median house value, on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(housing_df['median_income'], housing_df['median_house_value'], color='skyblue', edgecolor='black')

# Labels, title and grid
ax.set_xlabel('Median Income')
ax.set_ylabel('Median House Value')
ax.set_title('Scatter Plot of Median Income vs Median House Value')
ax.grid(True)

plt.show()

# 3. Pandas Profiling

Exploring a dataset one column at a time becomes tedious when it has numerous features. Pandas profiling addresses this by generating a comprehensive report for the whole dataset in a single call.

In [0]:
from pandas_profiling import ProfileReport

In [0]:
# Enable each of the listed correlation measures in the report.
correlation_methods = ("auto", "pearson", "spearman", "kendall", "phi_k", "cramers")
correlation_settings = {method: {"calculate": True} for method in correlation_methods}

housing_df_profile = ProfileReport(
    housing_df,
    correlations=correlation_settings,
    title="Housing Dataset Profiling Report",
    progress_bar=False,
)

# Render the report to an HTML string, then hand it to displayHTML
housing_profile_html = housing_df_profile.to_html()
displayHTML(housing_profile_html)