In [0]:
# Location of the raw housing CSV that the load cell below reads.
input_path = "/Volumes/ml/default/raw/housing.csv"

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Explicit schema handed to the CSV reader: nine nullable double columns,
# in the order listed, followed by one nullable string column.
users_schema = StructType(
    [
        StructField(column_name, DoubleType(), True)
        for column_name in (
            "longitude",
            "latitude",
            "housing_median_age",
            "total_rooms",
            "total_bedrooms",
            "population",
            "households",
            "median_income",
            "median_house_value",
        )
    ]
    + [StructField("ocean_proximity", StringType(), True)]
)

In [0]:
# Load the CSV as a Spark DataFrame, treating the first line as a header and
# applying the explicit schema instead of inferring column types.
df = spark.read.csv(
    path=input_path,
    schema=users_schema,
    header=True,
)

In [0]:
# Convert the PySpark DataFrame to a pandas DataFrame for the pandas /
# matplotlib / seaborn cells that follow.
# The name `df` is rebound from Spark to pandas here, so the conversion is
# guarded: on a re-run `df` is already pandas and has no toPandas() method.
if not isinstance(df, pd.DataFrame):
    df = df.toPandas()

In [0]:
# Preview the first five rows of the pandas DataFrame.
df.head(n=5)

In [0]:
# (number of rows, number of columns) of df.
df.shape

In [0]:
# Print a concise summary of df: column names, non-null counts and dtypes.
df.info()

In [0]:
# Column labels of df.
df.columns

In [0]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for df.
df.describe()

In [0]:
# Total number of missing cells across the whole DataFrame.
df.isna().values.sum()

In [0]:
# Number of missing values in each column.
df.isna().sum()

In [0]:
# Feature frame: every column of df except the target, median_house_value.
df_2 = df.drop(columns=["median_house_value"])

In [0]:
# Correlation of each numeric feature with the target column.
# df_2 still carries the string column ocean_proximity; pandas >= 2.0 no longer
# drops non-numeric columns silently and corrwith() raises on them, so the
# frame is restricted to numeric dtypes first.
correlation_values = df_2.select_dtypes(include="number").corrwith(df['median_house_value'])

# Strongest positive correlation first, strongest negative last.
sorted_correlation_values = correlation_values.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(
    sorted_correlation_values.index,
    sorted_correlation_values.values,
    # Blue bars for positive correlations, orange for zero/negative ones.
    color=['#1f77b4' if c > 0 else '#ff7f0e' for c in sorted_correlation_values.values]
)

ax.set_xlabel('Features')
ax.set_ylabel('Correlation')
ax.set_title('Correlation with Median House Value')

# Tilt the feature names so the long labels do not overlap.
plt.xticks(rotation=45)
ax.grid(True)

plt.show()

In [0]:
# Pairwise correlation matrix of the numeric columns.
# df also holds the string column ocean_proximity; pandas >= 2.0 raises on it
# in DataFrame.corr() instead of dropping it, so only numeric dtypes are kept.
corr = df.select_dtypes(include="number").corr()

plt.figure(figsize=(12, 8))
# annot/fmt print each coefficient in its cell, rounded to two decimals.
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

plt.title('Correlation Heatmap')
plt.xticks(rotation=45)
plt.yticks(rotation=0)

plt.show()

In [0]:
# Distribution of the target column as a 20-bin histogram.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))

hist_ax.hist(
    df['median_house_value'],
    bins=20,
    color='skyblue',
    edgecolor='black',
)

# Axis labels and title.
hist_ax.set_xlabel('Median House Value')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Histogram of Median House Value')

# Light dashed gridlines behind the bars' scale.
hist_ax.grid(True, linestyle='--', alpha=0.5)

plt.show()

In [0]:
# Render df with the notebook's display() helper.
# NOTE(review): display is not defined in this file — presumably the
# Databricks built-in, which also provides the data-profile view; confirm.
display(df)

Databricks data profile. Run in Databricks to view.

In [0]:
# NOTE(review): pandas_profiling appears to have been renamed to
# ydata-profiling upstream — confirm which package the cluster has installed.
from pandas_profiling import ProfileReport

# Build the profiling report with every listed correlation measure switched
# on, no progress bar, and dtype inference disabled.
df_profile = ProfileReport(
    df,
    correlations={
        measure: {"calculate": True}
        for measure in ("auto", "pearson", "spearman", "kendall", "phi_k", "cramers")
    },
    title="Profiling Report",
    progress_bar=False,
    infer_dtypes=False,
)

# Render the report to an HTML string and show it inline in the notebook.
profile_html = df_profile.to_html()
displayHTML(profile_html)