## Import Competition Data

In [1]:
import numpy as np
import pandas as pd

property_df  = pd.read_csv("data/properties_2017.csv",low_memory = False)
train_df = pd.read_csv("data/train_2017.csv", parse_dates=["transactiondate"])

In [2]:
property_df.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2016.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,5.0,,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,6.0,,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,


In [3]:
train_df.head()

Unnamed: 0,parcelid,logerror,transactiondate
0,14297519,0.025595,2017-01-01
1,17052889,0.055619,2017-01-01
2,14186244,0.005383,2017-01-01
3,12177905,-0.10341,2017-01-01
4,10887214,0.00694,2017-01-01


## Data Cleaning

In [4]:
# find columns with missing values
missing_percent = property_df.isnull().sum() * 100 / len(property_df)
missing_values_df = pd.DataFrame({'column_name': property_df.columns, 'percent_missing': missing_percent})
print(missing_values_df[missing_values_df.percent_missing > 0].sort_values('percent_missing', ascending=False))

                                               column_name  percent_missing
storytypeid                                    storytypeid        99.945632
basementsqft                                  basementsqft        99.945498
yardbuildingsqft26                      yardbuildingsqft26        99.911363
fireplaceflag                                fireplaceflag        99.827048
architecturalstyletypeid          architecturalstyletypeid        99.796966
typeconstructiontypeid              typeconstructiontypeid        99.774020
finishedsquarefeet13                  finishedsquarefeet13        99.743034
buildingclasstypeid                    buildingclasstypeid        99.573532
pooltypeid10                                  pooltypeid10        99.430326
decktypeid                                      decktypeid        99.417831
finishedsquarefeet6                    finishedsquarefeet6        99.280387
poolsizesum                                    poolsizesum        99.063452
pooltypeid2 

In [5]:
# remove features with more than 70% null values
threshold = 80
null_cols = missing_values_df[missing_values_df.percent_missing > threshold].column_name.tolist()

In [6]:
property_df = property_df.drop(null_cols, axis=1)

In [7]:
property_df.shape

(2985217, 34)

In [8]:
# impute values for geographic categorical features - use mode of broader geographic feature by grouping
# using mode across the feature itself would provide geographical values that don't make sense 
# eg: find most common zip code in the county to impute missing zip

def impute_geographical_feature(df, target_feature, group_feature):
    # mode of target feature grouped by the group geographic feature
    mode_per_group = df.groupby(group_feature)[target_feature].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
    
    # impute missing values 
    missing_mask = df[target_feature].isnull() & df[group_feature].notnull()
    df.loc[missing_mask, target_feature] = df.loc[missing_mask, group_feature].map(mode_per_group)
    
    # drop rows where group feature is missing or if target feature still missing after imputation
    df.dropna(subset=[target_feature, group_feature], inplace=True)

geographical_features = ['fips', 'regionidneighborhood', 'censustractandblock', 'rawcensustractandblock',\
                         'regionidzip', 'regionidcity', 'regionidcounty']
for feature in geographical_features:
    group_feature = 'regionidcounty' if feature != 'regionidcounty' or feature != 'fips' else 'regionidcity'
    impute_geographical_feature(property_df, feature, group_feature)

In [9]:
property_df.shape

(2982285, 34)

In [10]:
# impute categorical feature values
from sklearn.impute import SimpleImputer

cat_imputer = SimpleImputer(strategy='most_frequent') # use mode here as well

categorical_features = ['airconditioningtypeid', 'buildingqualitytypeid', 'heatingorsystemtypeid',
                        'propertycountylandusecode', 'propertylandusetypeid', 'propertyzoningdesc',
                        'regionidcity', 'regionidcounty', 'regionidzip', 'unitcnt',
                        'censustractandblock', 'rawcensustractandblock']

# impute missing values for each feature
for col in categorical_features:
    if col in property_df.columns:
        property_df[[col]] = cat_imputer.fit_transform(property_df[[col]])

In [11]:
# now do the same for the remaining numerical features

num_imputer = SimpleImputer(strategy='median') 

numerical_features = ['basementsqft', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 
                      'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 
                      'finishedsquarefeet12', 'finishedsquarefeet50', 'garagecarcnt', 'garagetotalsqft', 
                      'latitude', 'longitude', 'lotsizesquarefeet', 'poolcnt', 'poolsizesum', 'roomcnt', 
                      'threequarterbathnbr', 'unitcnt', 'yearbuilt', 'numberofstories',
                      'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 
                      'landtaxvaluedollarcnt', 'taxamount', 'fullbathcnt']

for col in numerical_features:
    if col in property_df.columns:
        property_df[[col]] = num_imputer.fit_transform(property_df[[col]])

In [12]:
# confirm all missing values filled
print(property_df.isnull().sum())

parcelid                        0
airconditioningtypeid           0
bathroomcnt                     0
bedroomcnt                      0
buildingqualitytypeid           0
calculatedbathnbr               0
calculatedfinishedsquarefeet    0
finishedsquarefeet12            0
fips                            0
fullbathcnt                     0
garagecarcnt                    0
garagetotalsqft                 0
heatingorsystemtypeid           0
latitude                        0
longitude                       0
lotsizesquarefeet               0
propertycountylandusecode       0
propertylandusetypeid           0
propertyzoningdesc              0
rawcensustractandblock          0
regionidcity                    0
regionidcounty                  0
regionidneighborhood            0
regionidzip                     0
roomcnt                         0
unitcnt                         0
yearbuilt                       0
numberofstories                 0
structuretaxvaluedollarcnt      0
taxvaluedollar

### Combining Training Table

In [13]:
# pulling property data into training data
combined_df = train_df.merge(property_df, how='left', on='parcelid')
combined_df.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,calculatedfinishedsquarefeet,finishedsquarefeet12,...,roomcnt,unitcnt,yearbuilt,numberofstories,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,censustractandblock
0,14297519,0.025595,2017-01-01,1.0,3.5,4.0,6.0,3.5,3100.0,3100.0,...,0.0,1.0,1998.0,1.0,485713.0,1023282.0,2016.0,537569.0,11013.72,60590630000000.0
1,17052889,0.055619,2017-01-01,1.0,1.0,2.0,6.0,1.0,1465.0,1465.0,...,5.0,1.0,1967.0,1.0,88000.0,464000.0,2016.0,376000.0,5672.48,61110010000000.0
2,14186244,0.005383,2017-01-01,1.0,2.0,3.0,6.0,2.0,1243.0,1243.0,...,6.0,1.0,1962.0,1.0,85289.0,564778.0,2016.0,479489.0,6488.3,60590220000000.0
3,12177905,-0.10341,2017-01-01,1.0,3.0,4.0,8.0,3.0,2376.0,2376.0,...,0.0,1.0,1970.0,1.0,108918.0,145143.0,2016.0,36225.0,1777.51,60373000000000.0
4,10887214,0.00694,2017-01-01,1.0,3.0,3.0,8.0,3.0,1312.0,1312.0,...,0.0,1.0,1964.0,1.0,73681.0,119407.0,2016.0,45726.0,1533.89,60371240000000.0


In [14]:
# remove properties that were removed during data cleaning in the new combined DataFrame
final_property_features = [col for col in property_df.columns if col != 'parcelid']
# remove rows where all property features missing:
missing_property_data = combined_df[final_property_features].isnull().all(axis=1) 
combined_df = combined_df[~missing_property_data]

## Dimensionality Reduction

In [15]:
# convert datetime columns to numerical format (to enable scaling)
for col in combined_df.select_dtypes(include=['datetime']):
    combined_df[f'{col}_year'] = combined_df[col].dt.year
    combined_df[f'{col}_month'] = combined_df[col].dt.month
    combined_df[f'{col}_day'] = combined_df[col].dt.day
    combined_df[f'{col}_weekday'] = combined_df[col].dt.weekday
    combined_df.drop(columns=[col], inplace=True)

In [16]:
# convert categorical to numerical for scaling as well
# using label encoding since one-hot encoding creates 2k+ features, which makes dim. red. infeasible 
categorical_cols = combined_df.select_dtypes(include=['object']).columns

from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col])
    label_encoders[col] = le  # store to invert encoding later if needed
    
# NOTE: this can create an implied ordinal ordering - can change this if affecting linear model performance

In [17]:
# define levels: increasing number of components
# Baseline: 100% (no dimensionality reduction)
# Level 1: 10% 
# Level 2: 50%
# Level 3: 75%

features = combined_df.drop(columns=['logerror'])
target = combined_df['logerror']

components_10 = int(features.shape[1] * 0.1)
components_50 = int(features.shape[1] * 0.5)
components_75 = int(features.shape[1] * 0.75)

In [18]:
combined_df.columns

Index(['parcelid', 'logerror', 'airconditioningtypeid', 'bathroomcnt',
       'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr',
       'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'fips',
       'fullbathcnt', 'garagecarcnt', 'garagetotalsqft',
       'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet',
       'propertycountylandusecode', 'propertylandusetypeid',
       'propertyzoningdesc', 'rawcensustractandblock', 'regionidcity',
       'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt',
       'unitcnt', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt',
       'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt',
       'taxamount', 'censustractandblock', 'transactiondate_year',
       'transactiondate_month', 'transactiondate_day',
       'transactiondate_weekday'],
      dtype='object')

In [19]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [20]:
# key metrics: run time, and data size (memory of dataframe)
# functions to measure these
import time
def measure_runtime(func):
    start_time = time.time()
    result = func()
    end_time = time.time()
    runtime = end_time - start_time
    return result, runtime

def dataframe_memory(df):
    return df.memory_usage(deep=True).sum()

In [21]:
pca_results = {} # store new reduced datasets and metrics

In [22]:
def apply_pca(n_components):
    pca = PCA(n_components=n_components)
    pca_result, runtime = measure_runtime(lambda: pca.fit_transform(features_scaled))
    reduced_df = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(n_components)])
    reduced_df['logerror'] = target
    mem_usage = dataframe_memory(reduced_df)
    return reduced_df, runtime, mem_usage

In [23]:
# Baseline - no reduction
baseline_runtime_start = time.time()
baseline_mem_usage = dataframe_memory(combined_df)
baseline_runtime = time.time() - baseline_runtime_start
pca_results['Baseline'] = (combined_df, baseline_runtime, baseline_mem_usage)

In [24]:
# PCA for 10%, 50%, 75%
from sklearn.decomposition import PCA
for components, label in zip([components_10, components_50, components_75], ['10%', '50%', '75%']):
    reduced_df, runtime, mem_usage = apply_pca(components)
    pca_results[label] = (reduced_df, runtime, mem_usage)

In [25]:
# Output Results
for label, (df, runtime, mem_usage) in pca_results.items():
    print(f"Results for {label}:")
    print(f"Runtime: {runtime:.4f} seconds")
    print(f"Data Size: {mem_usage} bytes")
    print(f"DataFrame head:\n{df.head()}\n")

Results for Baseline:
Runtime: 0.0017 seconds
Data Size: 23584016 bytes
DataFrame head:
   parcelid  logerror  airconditioningtypeid  bathroomcnt  bedroomcnt  \
0  14297519  0.025595                    1.0          3.5         4.0   
1  17052889  0.055619                    1.0          1.0         2.0   
2  14186244  0.005383                    1.0          2.0         3.0   
3  12177905 -0.103410                    1.0          3.0         4.0   
4  10887214  0.006940                    1.0          3.0         3.0   

   buildingqualitytypeid  calculatedbathnbr  calculatedfinishedsquarefeet  \
0                    6.0                3.5                        3100.0   
1                    6.0                1.0                        1465.0   
2                    6.0                2.0                        1243.0   
3                    8.0                3.0                        2376.0   
4                    8.0                3.0                        1312.0   

   finishe

### t-distributed Stochastic Neighbor Embedding
#### sklearn.manifold.TSNE: ValueError: 'n_components' should be inferior to 4 for the barnes_hut algorithm as it relies on quad-tree or oct-tree.
##### Using the 'exact' method (O(n^2) instead of O(nlogn) performance but can use n_components larger than 4), crashes notebook
#### openTSNE: t-SNE for >2 dimensions is currently unsupported (and generally a bad idea)

In [26]:
# from sklearn.manifold import TSNE

# # tsne_results = {}

# def apply_tsne(n_components):
#     tsne = TSNE(n_components=n_components, random_state=42, method='exact')
#     tsne_result, runtime = measure_runtime(lambda: tsne.fit_transform(features_scaled))
#     reduced_df = pd.DataFrame(tsne_result, columns=[f'mbedding{i+1}' for i in range(n_components)])
#     reduced_df['logerror'] = target
#     mem_usage = dataframe_memory(reduced_df)
#     return reduced_df, runtime, mem_usage

# # T-SNE for 10%, 50%, 75%
# for components, label in zip([components_10, components_50, components_75], ['10%', '50%', '75%']):
#     reduced_df, runtime, mem_usage = apply_tsne(components)
#     tsne_results[label] = (reduced_df, runtime, mem_usage)
    
# # Output Results
# for label, (df, runtime, mem_usage) in tsne_results.items():
#     print(f"Results for {label}:")
#     print(f"Runtime: {runtime:.4f} seconds")
#     print(f"Data Size: {mem_usage} bytes")
#     print(f"DataFrame head:\n{df.head()}\n")

### Isomap (takes a long time to perform on the large dataset) (even for ideal_n_neighbors = 14)

In [27]:
# from sklearn.manifold import Isomap
# from scipy.sparse import csr_matrix, lil_matrix

# isomap_results = {}

# # Convert features_scaled to lil_matrix
# features_scaled_lil = lil_matrix(features_scaled)

# def apply_isomap(ideal_n_neighbors, n_components):
#     isomapping = Isomap(n_neighbors=ideal_n_neighbors, n_components=n_components)
#     isomap_result, runtime = measure_runtime(lambda: isomapping.fit_transform(features_scaled_lil))
#     reduced_df = pd.DataFrame(isomap_result, columns=[f'IMap{i+1}' for i in range(n_components)])
#     reduced_df['logerror'] = target
#     mem_usage = dataframe_memory(reduced_df)
#     return reduced_df, runtime, mem_usage

# # Cross validation (takes a long time)
# import random
# from sklearn.model_selection import cross_val_score
# from sklearn.neighbors import KNeighborsRegressor


# def isomap_crossvalidation():
#     # Get random subset of features for hyperparameter tuning of isomapping
#     subset_size = 10000

#     # Randomly select indices for the subset
#     subset_indices = random.sample(range(features_scaled.shape[0]), subset_size)
#     subset_features = features_scaled[subset_indices]
#     subset_target = np.array(target)[subset_indices]

#     # Cross validation to find ideal n_neighbors
#     # Using K-NN model for comparing performance
#     # 13 is the minimum so that the "The number of connected components of the neighbors graph is 2 > 1", 
#     # anything lower would mean that "Completing the graph to fit Isomap might be slow"
#     n_neighbors_range = [14, 20, 30, 50, 100]

#     cv_scores = {}

#     # Perform cross-validation for each n_neighbors value
#     for n_neighbors in n_neighbors_range:
#         avg_cv_scores = []
#         for components in [components_10, components_50, components_75]:
#             isomapping = Isomap(n_neighbors=n_neighbors, n_components=components)
#             transformed_features = isomapping.fit_transform(subset_features)
#             scores = -cross_val_score(KNeighborsRegressor(), transformed_features, subset_target, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
#             avg_cv_scores.append(np.mean(scores))

#         # Store the average score across all component percentages
#         cv_scores[n_neighbors] = np.mean(avg_cv_scores)

#     ideal_n_neighbors = min(cv_scores, key=cv_scores.get)
#     print("Ideal n_neighbors:", ideal_n_neighbors)
#     return ideal_n_neighbors

# # ideal_n_neighbors = isomap_crossvalidation()

# ideal_n_neighbors = 14

# # Isomapping for 10%, 50%, 75%
# for components, label in zip([components_10, components_50, components_75], ['10%', '50%', '75%']):
#     reduced_df, runtime, mem_usage = apply_isomap(ideal_n_neighbors, components)
#     isomap_results[label] = (reduced_df, runtime, mem_usage)

# # Output Results
# for label, (df, runtime, mem_usage) in isomap_results.items():
#     print(f"Results for {label}:")
#     print(f"Runtime: {runtime:.4f} seconds")
#     print(f"Data Size: {mem_usage} bytes")
#     print(f"DataFrame head:\n{df.head()}\n")

### Non-Metric Multidimensional Scaling
#### Calculating distances is too large for jupyter notebook to handle, and kernel crashes

In [28]:
# from sklearn.manifold import MDS
# from sklearn.metrics import pairwise_distances

# nmds_results = {}

# def apply_nmds(n_components):
#     nmds_model = MDS(n_components=n_components, dissimilarity='euclidean', random_state=42, metric=False)
#     nmds_result, runtime = measure_runtime(lambda: nmds_model.fit_transform(features_scaled))
#     reduced_df = pd.DataFrame(umap_result, columns=[f'NMDS_{i+1}' for i in range(n_components)])
#     reduced_df['logerror'] = target
#     mem_usage = reduced_df.memory_usage(deep=True).sum()
#     return reduced_df, runtime, mem_usage

# # NMDS for 10%, 50%, 75%
# for components, label in zip([components_10, components_50, components_75], ['10%', '50%', '75%']):
#     reduced_df, runtime, mem_usage = apply_nmds(components)
#     nmds_results[label] = (reduced_df, runtime, mem_usage)

# # Output Results
# for label, (df, runtime, mem_usage) in nmds_results.items():
#     print(f"Results for {label}:")
#     print(f"Runtime: {runtime:.4f} seconds")
#     print(f"Data Size: {mem_usage} bytes")
#     print(f"DataFrame head:\n{df.head()}\n")

### Semidefinite Embedding (takes a very long time and crashes notebook)

In [29]:
# from sklearn.manifold import SpectralEmbedding

# sde_results = {}
# def apply_sde(n_components):
#     sde_model = SpectralEmbedding(n_components=n_components)
#     sde_result, runtime = measure_runtime(lambda: sde_model.fit_transform(features_scaled))
#     reduced_df = pd.DataFrame(sde_result, columns=[f'SDE_{i+1}' for i in range(n_components)])
#     reduced_df['logerror'] = target
#     mem_usage = reduced_df.memory_usage(deep=True).sum()
#     return reduced_df, runtime, mem_usage

# # SDE for 10%, 50%, 75%
# for components, label in zip([components_10, components_50, components_75], ['10%', '50%', '75%']):
#     reduced_df, runtime, mem_usage = apply_sde(components)
#     sde_results[label] = (reduced_df, runtime, mem_usage)

# # Output Results
# for label, (df, runtime, mem_usage) in sde_results.items():
#     print(f"Results for {label}:")
#     print(f"Runtime: {runtime:.4f} seconds")
#     print(f"Data Size: {mem_usage} bytes")
#     print(f"DataFrame head:\n{df.head()}\n")

### UMAP

In [30]:
umap_results = {}

In [31]:
import umap

def find_ideal_n_neighbors(components_list, features_scaled):
    avg_silhouette_scores = {}
    for n_neighbors in range(5, 21, 5):
        silhouette_scores = []
        for components in components_list:
            umap_model = umap.UMAP(n_neighbors=n_neighbors, n_components=components)
            umap_result = umap_model.fit_transform(features_scaled)
            silhouette_score = silhouette_score_umap(umap_result)
            silhouette_scores.append(silhouette_score)
        avg_silhouette_scores[n_neighbors] = np.mean(silhouette_scores)
    ideal_n_neighbors = max(avg_silhouette_scores, key=avg_silhouette_scores.get)
    return ideal_n_neighbors

In [32]:
from sklearn.metrics import silhouette_samples
def silhouette_score_umap(embedding):
    silhouette_vals = silhouette_samples(embedding, target)
    return np.mean(silhouette_vals)

In [33]:
def apply_umap(n_components, n_neighbors):
    umap_model = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors)
    umap_result, runtime = measure_runtime(lambda: umap_model.fit_transform(features_scaled))
    reduced_df = pd.DataFrame(umap_result, columns=[f'UMAP_{i+1}' for i in range(n_components)])
    reduced_df['logerror'] = target
    mem_usage = reduced_df.memory_usage(deep=True).sum()
    return reduced_df, runtime, mem_usage

In [34]:
# ideal_n_neighbors = find_ideal_n_neighbors([components_10, components_50, components_75], features_scaled)
# print(ideal_n_neighbors)
# Found that ideal_n_neighbors = 10
ideal_n_neighbors = 10

10

In [35]:
# UMAP for 10%, 50%, 75%
for components, label in zip([components_10, components_50, components_75], ['10%', '50%', '75%']):
    reduced_df, runtime, mem_usage = apply_umap(components, ideal_n_neighbors)
    umap_results[label] = (reduced_df, runtime, mem_usage)



In [36]:
# Output Results
for label, (df, runtime, mem_usage) in umap_results.items():
    print(f"Results for {label}:")
    print(f"Runtime: {runtime:.4f} seconds")
    print(f"Data Size: {mem_usage} bytes")
    print(f"DataFrame head:\n{df.head()}\n")

Results for 10%:
Runtime: 25.8138 seconds
Data Size: 1551712 bytes
DataFrame head:
      UMAP_1    UMAP_2    UMAP_3  logerror
0   2.539106 -5.432155  6.863309  0.025595
1  -1.226970  1.546156  5.454515  0.055619
2  11.052291 -5.383427 -0.619560  0.005383
3   9.952944  1.524144  5.986571 -0.103410
4  10.785599  2.787758 -5.642270  0.006940

Results for 50%:
Runtime: 16.9893 seconds
Data Size: 6516768 bytes
DataFrame head:
      UMAP_1    UMAP_2     UMAP_3    UMAP_4    UMAP_5     UMAP_6    UMAP_7  \
0   7.098338  3.300787   5.515061  8.105917  2.971231   5.380564  7.139856   
1   7.717453  5.618803   5.581394  3.464144  4.320915  10.056647  4.543162   
2   7.325140  3.563897   5.599204  6.533143  3.809103   5.043808  6.820460   
3   9.905190  4.401132  13.955169  5.246931  5.010491   5.052403  4.954078   
4  11.681363  5.256864   2.792217  4.082621  4.497179   5.276894  6.024670   

     UMAP_8    UMAP_9   UMAP_10   UMAP_11   UMAP_12   UMAP_13   UMAP_14  \
0  4.632673  0.976877  2.615457

### Autoencoder

In [37]:
autoencoder_results = {}

In [38]:
import tensorflow as tf
def autoencoder_dimension_reduction(dataset, n_dimensions):
    input_dim = dataset.shape[1]
    encoding_dim = n_dimensions

    input_data = tf.keras.layers.Input(shape=(input_dim,))
    encoded = tf.keras.layers.Dense(encoding_dim, activation='relu')(input_data)
    decoded = tf.keras.layers.Dense(input_dim, activation='sigmoid')(encoded)

    autoencoder = tf.keras.models.Model(input_data, decoded)
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

    autoencoder.fit(dataset, dataset, epochs=50, batch_size=256, shuffle=True, validation_split=0.2)

    encoder = tf.keras.models.Model(input_data, encoded)
    reduced_dataset = encoder.predict(dataset)
    return reduced_dataset


In [39]:
def apply_autoencoder(n_components):
    autoencoder_result, runtime = measure_runtime(lambda: autoencoder_dimension_reduction(features_scaled, n_components))
    reduced_df = pd.DataFrame(autoencoder_result, columns=[f'AE_{i+1}' for i in range(n_components)])
    reduced_df['logerror'] = target
    mem_usage = reduced_df.memory_usage(deep=True).sum()
    return reduced_df, runtime, mem_usage

In [40]:
# Autoencoder for 10%, 50%, 75%
for components, label in zip([components_10, components_50, components_75], ['10%', '50%', '75%']):
    reduced_df, runtime, mem_usage = apply_autoencoder(components)
    autoencoder_results[label] = (reduced_df, runtime, mem_usage)

Epoch 1/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 698us/step - loss: 0.5918 - val_loss: 0.2605
Epoch 2/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 483us/step - loss: -0.1890 - val_loss: -0.5531
Epoch 3/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 493us/step - loss: -1.3772 - val_loss: -1.6432
Epoch 4/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 568us/step - loss: -3.1234 - val_loss: -3.1742
Epoch 5/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 525us/step - loss: -5.6695 - val_loss: -5.2034
Epoch 6/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 513us/step - loss: -8.9821 - val_loss: -7.6962
Epoch 7/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498us/step - loss: -12.9024 - val_loss: -10.5858
Epoch 8/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 461us/step - loss: -17.5956 - val_loss: -13.8094
Epoch 

[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 530us/step - loss: -2190.0859 - val_loss: -1605.2415
Epoch 27/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 503us/step - loss: -2339.4341 - val_loss: -1714.5277
Epoch 28/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 474us/step - loss: -2489.4089 - val_loss: -1826.6865
Epoch 29/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 493us/step - loss: -2662.3596 - val_loss: -1941.6075
Epoch 30/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 466us/step - loss: -2854.1565 - val_loss: -2059.6565
Epoch 31/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 484us/step - loss: -3014.9873 - val_loss: -2180.9509
Epoch 32/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 485us/step - loss: -3146.9507 - val_loss: -2305.3723
Epoch 33/50
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 526us/step - loss

In [41]:
# Output Results
for label, (df, runtime, mem_usage) in autoencoder_results.items():
    print(f"Results for {label}:")
    print(f"Runtime: {runtime:.4f} seconds")
    print(f"Data Size: {mem_usage} bytes")
    print(f"DataFrame head:\n{df.head()}\n")

Results for 10%:
Runtime: 6.8612 seconds
Data Size: 1551712 bytes
DataFrame head:
         AE_1        AE_2        AE_3  logerror
0    0.000000  104.892067   89.970428  0.025595
1    0.000000  261.691864    0.000000  0.055619
2    0.000000  102.864738    0.000000  0.005383
3  105.319923    0.000000   60.057014 -0.103410
4  124.698112    0.000000  137.115997  0.006940

Results for 50%:
Runtime: 7.4885 seconds
Data Size: 6516768 bytes
DataFrame head:
         AE_1        AE_2       AE_3        AE_4        AE_5        AE_6  \
0    0.000000   83.328300   0.000000  253.764587    0.000000    0.000000   
1    0.000000    0.000000   0.000000   18.190613    0.000000  194.821106   
2    0.000000    0.000000  29.828644  102.480431    0.000000  123.786903   
3  151.293716   69.704300  52.298603    0.000000  168.155075    0.000000   
4   94.642990  147.046478  20.881954    0.000000  158.469345    0.000000   

         AE_7        AE_8        AE_9      AE_10       AE_11       AE_12  \
0  119.085052 