In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Load every raw CSV export found in the data folder, plus the cleaned
# feature table that the cells below analyse.
data_source_path = "data"

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the concatenated frame reproducible between runs and machines.
directory = sorted(
    name for name in os.listdir(data_source_path) if name.endswith(".csv")
)
datasets = [
    pd.read_csv(os.path.join(data_source_path, file_name))
    for file_name in directory
]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when the folder contains no CSV files.
# NOTE(review): this frame used to be assigned to `df` and was overwritten on
# the very next line, so it was never usable. It now lives under its own name;
# drop the raw load entirely if nothing needs it.
raw_df = pd.concat(datasets, ignore_index=True) if datasets else pd.DataFrame()

# `df` is the frame the following cells read: the cleaned feature table.
df = pd.read_csv("cleaned_leases_features.csv")

In [None]:
# Describe the data
# Only the last expression of a cell is echoed, so a bare df.head() in the
# middle of the cell shows nothing; print it explicitly.
print(df.head())
print('Data Size: ' + str(df.shape))
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())

In [None]:
# Column groups used by the plotting cells.

# Time columns.
TIME_COLS = ["year", "quarter"]

# Columns kept for classification purposes (grouped as having many unique values).
MANY_UNIQUE_VALS = ["zip", "transaction_type"]

# Columns with only a few unique values, so they can be converted to
# numerical values.
FEW_UNIQUE_VALS = [
    "industry_cluster",
    "space_type",
    "internal_class",
    "CBD_suburban",
]

# Columns treated as numeric.
NUMERIC_COLS = [
    "unemployment_rate",
    "num_leases",
    "availability_proportion",
    "occupancy_proportion",
]

In [None]:
# Draw the distribution of each numeric column and save one PNG per column.

# savefig does not create missing directories, so make sure the output
# folder exists before the first figure is written.
os.makedirs("plots", exist_ok=True)

for col in NUMERIC_COLS:
    # Explicit figure/axes instead of pyplot's implicit "current figure" state.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df[col], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    fig.savefig(f'plots/distribution_{col}.png')
    # Render the figure now; with the notebook's inline backend (default
    # settings) show() also closes it, so figures do not pile up in memory
    # while the loop runs.
    plt.show()

In [None]:
# Draw one pie chart per column in FEW_UNIQUE_VALS and save the grid as a PNG.
top_n = 5  # categories shown individually; the remainder is merged into 'Others'

# Derive the grid from the number of columns so adding a column cannot run
# past the available axes (4 columns -> the original 2 x 2 grid).
ncols = 2
nrows = max(1, -(-len(FEW_UNIQUE_VALS) // ncols))  # ceiling division

# squeeze=False always yields a 2-D array of axes, so flatten() is safe even
# when the grid collapses to a single cell.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16), squeeze=False)
axes = axes.flatten()

# Remove grid cells that have no column to display.
for unused_ax in axes[len(FEW_UNIQUE_VALS):]:
    fig.delaxes(unused_ax)

for ax, col in zip(axes, FEW_UNIQUE_VALS):
    # value_counts() sorts by frequency in descending order, so the first
    # top_n rows are the most common categories.
    val_counts = df[col].value_counts()

    if len(val_counts) > top_n:
        # .iloc slices strictly by position, independent of the index dtype.
        top_counts = val_counts.iloc[:top_n]
        others = pd.Series({'Others': val_counts.iloc[top_n:].sum()})
        data = pd.concat([top_counts, others])
    else:
        data = val_counts

    wedges, texts, autotexts = ax.pie(data, autopct='%1.1f%%')
    ax.legend(wedges, data.keys(), loc="center left", bbox_to_anchor=(0.5, -0.1))
    ax.set_title('Distribution of ' + col)
    ax.set_xlabel(col)

fig.tight_layout()
# savefig does not create missing directories.
os.makedirs('plots', exist_ok=True)
fig.savefig('plots/few_unique_vals.png', dpi=300, bbox_inches='tight')

In [None]:
# Draw one pie chart per column in MANY_UNIQUE_VALS and save the grid as a PNG.
top_n = 5  # categories shown individually; the remainder is merged into 'Others'

# Derive the grid from the number of columns so adding a column cannot run
# past the available axes (2 columns -> the original 1 x 2 grid).
ncols = 2
nrows = max(1, -(-len(MANY_UNIQUE_VALS) // ncols))  # ceiling division

# squeeze=False always yields a 2-D array of axes, so flatten() is safe even
# when the grid collapses to a single cell.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16), squeeze=False)
axes = axes.flatten()

# Remove grid cells that have no column to display.
for unused_ax in axes[len(MANY_UNIQUE_VALS):]:
    fig.delaxes(unused_ax)

for ax, col in zip(axes, MANY_UNIQUE_VALS):
    # value_counts() sorts by frequency in descending order, so the first
    # top_n rows are the most common categories.
    val_counts = df[col].value_counts()

    if len(val_counts) > top_n:
        # .iloc slices strictly by position. A plain val_counts[5:] depends on
        # how pandas interprets integer slices for the index dtype, which is
        # fragile if a column such as zip is parsed as a number
        # (NOTE(review): confirm the dtype of zip).
        top_counts = val_counts.iloc[:top_n]
        others = pd.Series({'Others': val_counts.iloc[top_n:].sum()})
        data = pd.concat([top_counts, others])
    else:
        data = val_counts

    wedges, texts, autotexts = ax.pie(data, autopct='%1.1f%%')
    ax.legend(wedges, data.keys(), loc="center left", bbox_to_anchor=(0.5, -0.1))
    ax.set_title('Distribution of ' + col)
    ax.set_xlabel(col)

fig.tight_layout()
# savefig does not create missing directories.
os.makedirs('plots', exist_ok=True)
fig.savefig('plots/many_unique_distribution.png', dpi=300, bbox_inches='tight')