# Imports and loading data

In [20]:
import pandas as pd
import os

# Working directory of the notebook (the analyses folder)
current_dir = os.getcwd()

# The data folder lives one level up, next to the analyses folder
parent_dir = os.path.dirname(current_dir)

# Both cleaned CSV files sit in <parent>/data/cleaned
cleaned_dir = os.path.join(parent_dir, "data", "cleaned")
clean_huis_te_koop_path = os.path.join(cleaned_dir, "clean_house.csv")
clean_apartement_te_koop_path = os.path.join(cleaned_dir, "clean_app.csv")

# Load the two comma-separated datasets (comma is read_csv's default separator)
house = pd.read_csv(clean_huis_te_koop_path)
app = pd.read_csv(clean_apartement_te_koop_path)

# Gathering info

In [None]:
house.info()

In [None]:
app.info()

In [None]:
from pandas.plotting import scatter_matrix, andrews_curves, parallel_coordinates, radviz
scatter_matrix(house, alpha=0.2, figsize=(12,12));

In [29]:
house.property_subtype.unique()


array(['HOUSE', 'MIXED_USE_BUILDING', 'APARTMENT_BLOCK', 'BUNGALOW',
       'VILLA', 'EXCEPTIONAL_PROPERTY', 'COUNTRY_COTTAGE', 'MANSION',
       'FARMHOUSE', 'TOWN_HOUSE', 'MANOR_HOUSE', 'CHALET',
       'OTHER_PROPERTY', 'CASTLE'], dtype=object)

In [30]:
app.property_subtype.unique()


array(['FLAT_STUDIO', 'APARTMENT', 'DUPLEX', 'GROUND_FLOOR', 'PENTHOUSE',
       'SERVICE_FLAT', 'LOFT', 'TRIPLEX', 'KOT'], dtype=object)

In [22]:
house.type_of_sale.unique()

array(['BUY_REGULAR', 'PUBLIC_SALE', 'LIFE_ANNUITY'], dtype=object)

In [23]:
house.number_of_rooms.unique()

array([nan,  8.,  1., 13.,  5.,  7., 15., 17., 14., 23.,  9.,  2., 19.,
       12.,  4.,  3., 10., 11., 18., 16.,  6., 22., 20., 28., 24., 27.,
       21., 32., 26., 25.])

In [32]:
app.number_of_rooms.unique()

array([nan,  1.,  5.,  9., 12., 10., 16.,  8., 11., 15., 18.,  4.,  3.,
        2., 13.,  6.,  7., 14., 20., 21., 19., 17., 22.])

In [24]:
house.kitchen_type.unique()

array(['INSTALLED', 'USA_HYPER_EQUIPPED', 'NOT_INSTALLED',
       'HYPER_EQUIPPED', 'SEMI_EQUIPPED', 'USA_SEMI_EQUIPPED', nan,
       'USA_INSTALLED', 'USA_UNINSTALLED'], dtype=object)

In [33]:
app.kitchen_type.unique()

array(['NOT_INSTALLED', 'USA_SEMI_EQUIPPED', 'INSTALLED', nan,
       'SEMI_EQUIPPED', 'HYPER_EQUIPPED', 'USA_INSTALLED',
       'USA_HYPER_EQUIPPED', 'USA_UNINSTALLED'], dtype=object)

In [25]:
house.number_of_facades.unique()

array([ 2.,  3.,  4., nan,  6.,  1.,  5.,  8.])

In [34]:
app.number_of_facades.unique()

array([nan,  2.,  3.,  4.,  1.,  5.])

# Matplotlib

In [None]:
import matplotlib.pyplot as plt

# Helper: per-column tally of the entries that are present (not null)
def count_non_null_values(df):
    """Return the number of non-null entries in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is that column's non-null count.
    """
    present = df.notna()
    return present.sum()

# Apply the ggplot style up front so it is in effect when the figure is created
plt.style.use('ggplot')

# 2x2 grid: row counts on top, per-feature non-null counts underneath
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Non-null tallies per feature for each dataset
count_house = count_non_null_values(house)
count_apartment = count_non_null_values(app)

# One grid column per dataset: (label, number of rows, non-null tallies, bar colour)
datasets = [
    ('House', len(house), count_house, 'forestgreen'),
    ('Apartment', len(app), count_apartment, 'orange'),
]

for grid_col, (label, n_rows, non_null, colour) in enumerate(datasets):
    # Top row: a single bar with the number of observations
    totals_ax = axes[0, grid_col]
    totals_ax.bar([label], [n_rows], alpha=0.5, color=colour)
    totals_ax.set_title(f'Number of Observations - {label}')

    # Bottom row: one bar per feature with its non-null count
    features_ax = axes[1, grid_col]
    features_ax.bar(non_null.index, non_null.values, color=colour)
    features_ax.set_title(f'Non-Null Values - {label}')
    # Feature names are long, so print them vertically
    features_ax.tick_params(axis='x', rotation=90)

# Keep titles and rotated labels from overlapping, then render
plt.tight_layout()
plt.show()


# Matplotlib & Seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Column-wise count of non-null entries (same helper as in the Matplotlib section)
def count_non_null_values(df):
    """Count, for every column of ``df``, the entries that are not null.

    Returns a Series indexed by column name.
    """
    return df.notnull().sum(axis=0)

sns.set_theme(style="darkgrid")

# 2x2 grid: observation counts on the top row, non-null counts on the bottom row
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Per-feature non-null tallies for both datasets
count_house = count_non_null_values(house)
count_apartment = count_non_null_values(app)

# Top row: one single-bar panel per dataset showing its number of rows
observation_panels = (
    (axes[0, 0], 'House', len(house), 'blue'),
    (axes[0, 1], 'Apartment', len(app), 'green'),
)
for panel, label, n_rows, colour in observation_panels:
    sns.barplot(x=[label], y=[n_rows], ax=panel, color=colour)
    panel.set_title('Number of Observations - ' + label)

# Bottom row: one bar per feature showing how many values are present
non_null_panels = (
    (axes[1, 0], 'House', count_house, 'blue'),
    (axes[1, 1], 'Apartment', count_apartment, 'green'),
)
for panel, label, tallies, colour in non_null_panels:
    sns.barplot(x=tallies.index, y=tallies.values, ax=panel, color=colour)
    panel.set_title('Non-Null Values - ' + label)
    # Feature names are long, so print them vertically
    panel.tick_params(axis='x', rotation=90)

# Tidy the spacing and render the figure
plt.tight_layout()
plt.show()


# What variables are most subject to outliers?

### Approach
- create a subplot for each of the features to clearly show outliers

In [None]:
# Print the frequency table of every column in the house dataset
for column_name in house.columns:
    header = f"---VALUES FOR {column_name}:"
    print(header)
    frequencies = house[column_name].value_counts()
    print(frequencies)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# `house` is the DataFrame loaded from clean_house.csv in the first cell

# Create a 4x4 grid of subplots.
# NOTE(review): nothing is drawn on these axes in this cell, so it only renders
# an empty grid; the later per-feature cell builds its own figure. This looks
# like a leftover and can probably be removed -- confirm.
fig, axes = plt.subplots(4, 4, figsize=(20, 20))

In [None]:
house.columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Assuming house is your DataFrame

# Columns to visualise, one subplot each, in this order
selected_columns = ['property_subtype', 'price', 'type_of_sale',
                    'number_of_rooms', 'living_area', 'kitchen_type',
                    'furnished', 'open_fire', 'terrace',
                    'terrace_area', 'garden', 'garden_area', 'surface_of_good',
                    'number_of_facades', 'swimming_pool', 'state_of_building']

# Columns drawn as a histogram instead of one bar per distinct value.
# NOTE(review): other columns (e.g. the *_area ones) may be continuous as well;
# add them here if their count plots turn out unreadable -- confirm with the data.
continuous_columns = {'price'}


def format_price_tick(x, pos):
    """Format a tick value with thousands separators and no decimals.

    ``pos`` is the tick position that matplotlib's FuncFormatter passes in;
    it is not used.
    """
    return f'{x:,.0f}'


formatter = FuncFormatter(format_price_tick)

# Size the grid from the number of columns so every column gets its own axes
n_grid_cols = 4
n_grid_rows = -(-len(selected_columns) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols,
                         figsize=(20, 5 * n_grid_rows), squeeze=False)

# Flatten the axes array to pair each subplot with one column
axes = axes.flatten()

for ax, column in zip(axes, selected_columns):
    if column in continuous_columns:
        # Continuous variable: histogram with a density estimate, drawn in the
        # column's own slot rather than on top of another subplot
        sns.histplot(data=house, x=column, ax=ax, kde=True, bins=20, color='skyblue')
        ax.xaxis.set_major_formatter(formatter)
    else:
        # catplot is figure-level and does not draw on a given `ax`; countplot
        # is the axes-level equivalent of catplot(kind='count')
        sns.countplot(data=house, x=column, ax=ax)
        # Right-align the rotated category labels under their bars
        plt.setp(ax.get_xticklabels(), ha='right')

    ax.tick_params(axis='x', rotation=45)
    ax.set_title(column)

# Hide grid cells left over when the column count does not fill the last row
for unused_ax in axes[len(selected_columns):]:
    unused_ax.set_visible(False)

# Lay out this specific figure (not whichever figure happens to be current)
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's darkgrid theme for the plots below
sns.set_theme(style="darkgrid") 


# NOTE(review): the five lines below are bare tuple expressions of column names;
# they are evaluated and discarded, so they have no effect. They look like a
# pasted reference list -- turn them into a comment or a variable.
'property_subtype', 'price', 'type_of_sale',
'number_of_rooms', 'living_area', 'kitchen_type',
'furnished', 'open_fire', 'terrace',
'terrace_area', 'garden', 'garden_area', 'surface_of_good',
'number_of_facades', 'swimming_pool', 'state_of_building'

# Create figure and subplots
# NOTE(review): a 4x4 grid is created, but only axes[0, 0], [0, 1], [1, 0] and
# [1, 1] are drawn on below; the other twelve panels stay empty.
fig, axes = plt.subplots(4, 4, figsize=(12, 10))

# NOTE(review): seaborn documents catplot as a figure-level function that does
# not draw onto a target `ax`, so this presumably ends up in a separate figure
# rather than in axes[0, 0] -- confirm, and use an axes-level function if the
# plot is meant to live in the grid.
sns.catplot(data=house, x='property_subtype', ax=axes[0, 0], aspect=2.5)
plt.xticks(rotation=45)  # Rotates the tick labels of the current axes
# NOTE(review): this title is replaced further down, where axes[0, 0] is given
# the title 'Number of Observations - House'.
axes[0, 0].set_title('Property_subtype')





# Plot 1: Number of observations for house dataset (also targets axes[0, 0])
sns.barplot(x=['House'], y=[len(house)], ax=axes[0, 0], color='blue')
axes[0, 0].set_title('Number of Observations - House')

# Plot 2: Number of observations for apartment dataset
sns.barplot(x=['Apartment'], y=[len(app)], ax=axes[0, 1], color='green')
axes[0, 1].set_title('Number of Observations - Apartment')

# Plot 3: Number of non-null values for each feature in house dataset
# count_non_null_values is not defined in this cell; it must already exist in
# the kernel from an earlier cell.
count_house = count_non_null_values(house)
sns.barplot(x=count_house.index, y=count_house.values, ax=axes[1, 0], color='blue')
axes[1, 0].set_title('Non-Null Values - House')
axes[1, 0].tick_params(axis='x', rotation=90)

# Plot 4: Number of non-null values for each feature in apartment dataset
count_apartment = count_non_null_values(app)
sns.barplot(x=count_apartment.index, y=count_apartment.values, ax=axes[1, 1], color='green')
axes[1, 1].set_title('Non-Null Values - Apartment')
axes[1, 1].tick_params(axis='x', rotation=90)

# Adjust layout
plt.tight_layout()

# Show plot
plt.show()


In [None]:
sns.catplot(data=house, x='property_subtype', aspect=2.5)
plt.xticks(rotation=45)  # Adjust the rotation here
plt.show()

In [None]:
from matplotlib.ticker import FuncFormatter
sns.set_theme(style="darkgrid") 
sns.catplot(data=house, x='price', aspect=2.5, hue='price')
plt.xticks(rotation=45)  # Adjust the rotation here

# Tick-label formatter for price axes
def format_price_tick(x, pos):
    """Return ``x`` as text with thousands separators and no decimal places.

    ``pos`` is the tick position supplied by matplotlib's FuncFormatter; it is
    accepted to match that callback signature and is not used.
    """
    return format(x, ',.0f')

# Apply the custom tick formatter
formatter = FuncFormatter(format_price_tick)
plt.gca().xaxis.set_major_formatter(formatter)

plt.show()

In [None]:
sns.catplot(data=house, x='type_of_sale', aspect=2.5)
plt.xticks(rotation=45)  # Adjust the rotation here
plt.show()

In [None]:
sns.catplot(data=house, x='number_of_rooms', aspect=2.5, hue='number_of_rooms')
plt.xticks(rotation=45)  # Adjust the rotation here
plt.show()

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(16,6))
plt.title("Immo Houses")
sns.lineplot(data=house)
# sns.catplot(data=house)



In [None]:
sns.pairplot(house)

# Plotly

https://plotly.com/python

### Components:
- `layout`: a dictionary controlling the style of the figure (one layout per figure)

- `data`: a list of dictionaries, each setting a graph type and the data itself
  (data + type = a `trace`)

In [None]:
import plotly.express as px

fig = px.bar(house, x="property_subtype", y="price", color="property_subtype")
fig.show()

In [None]:
fig = px.histogram(house, x="price", nbins=5, marginal="box", hover_data=house.columns, color="property_subtype")
fig.show()

In [None]:
# Box plot showing outliers
# - hover_data to show more information on hover over
# - points='all' to show all points
fig = px.box(house, y="price", hover_data=house.columns, points="all", color="property_subtype")
fig.show()

In [None]:
# Pearson correlation matrix of the numeric and boolean columns.
# `house` also holds text columns (e.g. property_subtype); since pandas 2.0,
# DataFrame.corr no longer drops those silently and raises on them, so the
# frame is restricted to numeric/boolean columns first. Selecting the columns
# explicitly (instead of passing numeric_only=True) also works on older pandas.
cr = house.select_dtypes(include=["number", "bool"]).corr(method="pearson")
cr

In [None]:
fig = px.scatter(house, x="price", y="number_of_rooms", color="province", symbol="property_subtype")
fig.show()

In [None]:
house.columns