First, install dependencies and download the data

In [104]:
#%pip install -r requirements.txt
#!curl -l -o data.csv "https://phl.carto.com/api/v2/sql?q=SELECT+*,+ST_Y(the_geom)+AS+lat,+ST_X(the_geom)+AS+lng+FROM+opa_properties_public&filename=opa_properties_public&format=csv&skipfields=cartodb_id"

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
df = pd.read_csv('data.csv')

  df = pd.read_csv('data.csv')


In [4]:
df.shape

(582937, 82)

In [4]:
df.head()

Unnamed: 0,the_geom,the_geom_webmercator,assessment_date,basements,beginning_point,book_and_page,building_code,building_code_description,category_code,category_code_description,...,year_built,year_built_estimate,zip_code,zoning,pin,building_code_new,building_code_description_new,objectid,lat,lng
0,0101000020E6100000288FB7AD4BC452C0C6E4F1E75A01...,0101000020110F00002693FA5D94E05FC1EE7D5D356691...,2022-05-24,,120' NW EADOM ST,987458.0,RB,NON PD PKG LOT COMMERCIAL,6,VACANT LAND,...,,,19137.0,IRMX,1001189957,,,433143301,40.010587,-75.067119
1,0101000020E610000015F6916732CB52C0D4B37C0242FE...,0101000020110F0000F30438544DEC5FC1AAE24928F78D...,2022-05-24,,273' N OF NORRIS ST,,SR,VACANT LAND RESIDE < ACRE,6,VACANT LAND,...,,,19121.0,RSA5,1001505175,,,433143302,39.986389,-75.174951
2,0101000020E610000066AC299B48CA52C053CA1DF46CFD...,0101000020110F00004ACDE132C0EA5FC17F6F06010B8D...,2022-05-24,,"263' 8"" N COLUMBIA AVE",,U50,ROW CONV/APT 3 STY MASON,2,MULTI FAMILY,...,1890.0,,19121.0,RM1,1001622123,22.0,ROW TYPICAL,433143303,39.979887,-75.160682
3,0101000020E6100000000000000000F87F000000000000...,,2022-05-24,,"1,115.962' S PATTISON AVE",,SS,VACANT LAND RESIDE ACRE+,6,VACANT LAND,...,,,,,1001319180,,,433143304,,
4,0101000020E61000000C5CFE385BC452C0E38B05E75601...,0101000020110F0000F7332EC5AEE05FC1240EE0C46191...,2022-05-24,,NEC SCATTERGOOD TO,,RB,NON PD PKG LOT COMMERCIAL,6,VACANT LAND,...,,,19137.0,IRMX,1001189938,,,433143305,40.010465,-75.068068


In [5]:
df.columns

Index(['the_geom', 'the_geom_webmercator', 'assessment_date', 'basements',
       'beginning_point', 'book_and_page', 'building_code',
       'building_code_description', 'category_code',
       'category_code_description', 'census_tract', 'central_air',
       'cross_reference', 'date_exterior_condition', 'depth',
       'exempt_building', 'exempt_land', 'exterior_condition', 'fireplaces',
       'frontage', 'fuel', 'garage_spaces', 'garage_type',
       'general_construction', 'geographic_ward', 'homestead_exemption',
       'house_extension', 'house_number', 'interior_condition', 'location',
       'mailing_address_1', 'mailing_address_2', 'mailing_care_of',
       'mailing_city_state', 'mailing_street', 'mailing_zip', 'market_value',
       'market_value_date', 'number_of_bathrooms', 'number_of_bedrooms',
       'number_of_rooms', 'number_stories', 'off_street_open',
       'other_building', 'owner_1', 'owner_2', 'parcel_number', 'parcel_shape',
       'quality_grade', 'recording_d

Here we can see all of the columns available in the dataset. Many of these columns are not really necesary. For example, geographical data such as street addresses are not very useful because they don't contain any categorical or numerical data that can be easily consumed by a linear regression mode.

In [6]:
COLUMNS_TO_REMOVE = """
the_geom
the_geom_webmercator
beginning_point
book_and_page
building_code_description
category_code_description
cross_reference
geographic_ward
house_number
location
mailing_address_1
mailing_address_2
mailing_care_of
mailing_city_state
mailing_street
mailing_zip
other_building
owner_1
owner_2
parcel_number
registry_number
state_code
street_code
street_designation
street_direction
street_name
suffix
assessment_date
recording_date
sale_date
pin
zip_code
parcel_shape
quality_grade
building_code
building_code_new
building_code_description_new
objectid
unit
"""
df_first_column_drop = df.drop(columns=COLUMNS_TO_REMOVE.split())

In [7]:
df_first_column_drop.to_csv('data_first_clean.csv', index=False)

One thing to note about this dataset is that this data is for all types of properties in Philadelphia. Since we are only interested in housing data, we will filter this data to only be for housing type properties. This is accomplished by filtering for properties with a category code of 1 (single family), 2 (multi family), or 3 (mixed).

In [8]:
# Get all unique values of (category_code, category_code_description)
df[['category_code', 'category_code_description']].drop_duplicates().sort_values(by='category_code')

Unnamed: 0,category_code,category_code_description
11,1,SINGLE FAMILY
72657,1,Single Family
2,2,MULTI FAMILY
103077,2,Multi Family
6,3,MIXED USE
103887,3,Mixed Use
5,4,COMMERCIAL
103634,4,Commercial
105777,5,Industrial
248,5,INDUSTRIAL


In [9]:
# Print number of data points
print(df_first_column_drop.shape)
# Filter category-type to 1, or 2
df_first_column_drop = df_first_column_drop[df_first_column_drop['category_code'] <= 2]
# Print number of data points
print(df_first_column_drop.shape)

(582937, 43)
(503907, 43)


Now that we have removed some of the columns, lets see what we are left with.

In [10]:
df_first_column_drop.dtypes

basements                   object
category_code                int64
census_tract               float64
central_air                 object
date_exterior_condition     object
depth                      float64
exempt_building            float64
exempt_land                float64
exterior_condition         float64
fireplaces                 float64
frontage                   float64
fuel                        object
garage_spaces              float64
garage_type                 object
general_construction        object
homestead_exemption          int64
house_extension             object
interior_condition         float64
market_value               float64
market_value_date          float64
number_of_bathrooms        float64
number_of_bedrooms         float64
number_of_rooms            float64
number_stories             float64
off_street_open            float64
sale_price                 float64
separate_utilities          object
sewer                       object
site_type           

We still have a lot of columns. It is very likely that many more of these will get dropped for the following reasons:
1. Too many missing values
2. Too many unique values
3. Not enough correlation with the target variable

Let's start by dealing with the first case: too many missing values. We will check this by counting the number of missing values in each column and sorting by this number.

There are also a few columns that seem to have a default value put in them instead of being left blank. Lets also change these default values to na, so they can be counted correctly.

In [11]:
df_first_column_drop['depth'] = df_first_column_drop['depth'].replace(0, np.nan)
df_first_column_drop['total_area'] = df_first_column_drop['total_area'].replace(0, np.nan)
df_first_column_drop['total_livable_area'] = df_first_column_drop['total_livable_area'].replace(0, np.nan)
df_first_column_drop['year_built'] = df_first_column_drop['year_built'].replace(0, np.nan)
df_first_column_drop['sale_price'] = df_first_column_drop['sale_price'].replace(1, np.nan)

In [12]:
"""
We still have a lot of columns. It is very likely that many more of these will get dropped for the following reasons:
1. Too many missing values
2. Too many unique values
3. Not enough correlation with the target variable

Let's start by dealing withe the first case: too many missing values. We will check this by counting the number of missing values in each column and sorting by this number.
"""
#print(df_first_column_drop.isna().sum().sort_values(ascending=False))
# Print the number of missing values and the percentage of missing values
missing_values = df_first_column_drop.isna().sum().sort_values(ascending=False)
missing_values = missing_values[missing_values > 0]
missing_values = pd.DataFrame(missing_values, columns=['missing_values'])
missing_values['percentage_missing'] = missing_values['missing_values'] / len(df_first_column_drop)
print(missing_values)

                         missing_values  percentage_missing
market_value_date                503907            1.000000
unfinished                       503844            0.999875
utility                          503772            0.999732
site_type                        501863            0.995944
date_exterior_condition          501462            0.995148
sewer                            490034            0.972469
number_of_rooms                  489868            0.972140
house_extension                  489308            0.971028
separate_utilities               488822            0.970064
garage_type                      481716            0.955962
fuel                             456444            0.905810
central_air                      211179            0.419083
type_heater                      203555            0.403954
basements                        174364            0.346024
sale_price                       131445            0.260852
year_built_estimate               94314 

Now some of these pieces of missing data won't be that big of a deal because we can either fill or impute the data. We can also drop rows with missing data, but we should do this sparingly. For some categorical columns, you could set a default value, but I will not be doing this much. You cannot assume exactly what the person who entered the data intended by leaving it blank, and filling it could cause innacuracies, especially with columns with a lot of missing data. But for some columns, there is simply too much missing data. For this reason, I will be dropping all columns with more than 25% missing data.

In [13]:
COLUMNS_TO_REMOVE_MISSING_VALUES = missing_values[missing_values['percentage_missing'] > 0.25].index
df_second_column_drop = df_first_column_drop.drop(columns=COLUMNS_TO_REMOVE_MISSING_VALUES)

In [14]:
df_second_column_drop.to_csv('data_second_clean.csv', index=False)

Now let's recheck the list of columns with missing data, and address each one indiviually.

In [15]:
missing_values_2 = df_second_column_drop.isna().sum().sort_values(ascending=False)
missing_values_2 = missing_values_2[missing_values_2 > 0]
missing_values_2 = pd.DataFrame(missing_values_2, columns=['missing_values'])
missing_values_2['percentage_missing'] = missing_values_2['missing_values'] / len(df_second_column_drop)
print(missing_values_2)

                      missing_values  percentage_missing
year_built_estimate            94314            0.187165
depth                          34709            0.068880
total_area                     32821            0.065133
topography                     32339            0.064177
general_construction           15954            0.031661
garage_spaces                   8248            0.016368
fireplaces                      6963            0.013818
number_of_bathrooms             6665            0.013227
off_street_open                 4751            0.009428
number_of_bedrooms              3458            0.006862
interior_condition              3339            0.006626
number_stories                  3323            0.006594
exterior_condition              3238            0.006426
frontage                        3149            0.006249
view_type                       2857            0.005670
zoning                          1177            0.002336
total_livable_area             

Now that we are left with columns with most of their data filled in, it is much safter to start filling in with default or imputed data.

In [16]:
from sklearn.impute import SimpleImputer
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')
df_fill = df_second_column_drop
# year_built_estimate can be filled with N's
df_fill['year_built_estimate'] = df_fill['year_built_estimate'].fillna('N')
# garage_spaces can just be filled with 0. It is a fair assumption that if the value is missing, there is no garage
df_fill['garage_spaces'] = df_fill['garage_spaces'].fillna(0)
# fireplaces can just be filled with 0. It is a fair assumption that if the value is missing, there are no fireplaces
df_fill['fireplaces'] = df_fill['fireplaces'].fillna(0)
# number of bathrooms can be filled with the median
df_fill['number_of_bathrooms'] = median_imputer.fit_transform(df_fill[['number_of_bathrooms']])
# interior condition can be filled with the median
df_fill['interior_condition'] = median_imputer.fit_transform(df_fill[['interior_condition']])
# exterior condition can be filled with the median
df_fill['exterior_condition'] = median_imputer.fit_transform(df_fill[['exterior_condition']])
# number of bedrooms can be filled with the median
df_fill['number_of_bedrooms'] = median_imputer.fit_transform(df_fill[['number_of_bedrooms']])
# number of stories can be filled with the median
df_fill['number_stories'] = median_imputer.fit_transform(df_fill[['number_stories']])
# NOTE: Maybe change these to mode
# general construction can be filled with a new category called 'unknown'
df_fill['general_construction'] = df_fill['general_construction'].fillna('unknown')
# quality grade will be skipped for now because it needs to be transformed into a numeric column
# year built can be filled with the median
df_fill['year_built'] = median_imputer.fit_transform(df_fill[['year_built']])
# total livable area can be filled with the median
df_fill['total_livable_area'] = median_imputer.fit_transform(df_fill[['total_livable_area']])
# topography can be filled with a new category called 'unknown'
df_fill['topography'] = df_fill['topography'].fillna('unknown')
# depth can be filled with the median
df_fill['depth'] = median_imputer.fit_transform(df_fill[['depth']])
# total area can be filled with the median
df_fill['total_area'] = median_imputer.fit_transform(df_fill[['total_area']])
# view type can be filled with a new category called 'unknown'
df_fill['view_type'] = df_fill['view_type'].fillna('unknown')
# off street open can be filled with the median
df_fill['off_street_open'] = median_imputer.fit_transform(df_fill[['off_street_open']])
# frontage can be filled with the median
df_fill['frontage'] = median_imputer.fit_transform(df_fill[['frontage']])
# zoning can be filled with a new category called 'unknown'
df_fill['zoning'] = df_fill['zoning'].fillna('unknown')
# census tract can be filled with the median
df_fill['census_tract'] = median_imputer.fit_transform(df_fill[['census_tract']])
# lat and lng can be filled with the median
df_fill['lat'] = median_imputer.fit_transform(df_fill[['lat']])
df_fill['lng'] = median_imputer.fit_transform(df_fill[['lng']])
# taxable building can be filled with the median
df_fill['taxable_building'] = median_imputer.fit_transform(df_fill[['taxable_building']])
# exempt land can be filled with the median
df_fill['exempt_land'] = median_imputer.fit_transform(df_fill[['exempt_land']])
# exempt building can be filled with the median
df_fill['exempt_building'] = median_imputer.fit_transform(df_fill[['exempt_building']])
# taxable land can be filled with the median
df_fill['taxable_land'] = median_imputer.fit_transform(df_fill[['taxable_land']])
# NOTE: Maybe change this to drop
# market value can be filled with the median
df_fill['market_value'] = median_imputer.fit_transform(df_fill[['market_value']])

Lets check again our missing data

In [17]:
missing_values_3 = df_fill.isna().sum().sort_values(ascending=False)
missing_values_3 = missing_values_3[missing_values_3 > 0]
missing_values_3 = pd.DataFrame(missing_values_3, columns=['missing_values'])
missing_values_3['percentage_missing'] = missing_values_3['missing_values'] / len(df_second_column_drop)
print(missing_values_3)

Empty DataFrame
Columns: [missing_values, percentage_missing]
Index: []


In [18]:
# Save to csv
df_fill.to_csv('data_filled.csv', index=False)

Now all

In [19]:
# One hot columns
ONE_HOT_COLUMNS = [
    'category_code',
    'general_construction',
    'topography',
    'view_type',
    'zoning',
]

BINARY_COLUMNS = [
    'year_built_estimate',
    'homestead_exemption',
    'exempt_building',
    'exempt_land'
]

In [20]:
# One hot encode the columns
df_one_hot = pd.get_dummies(df_fill, columns=ONE_HOT_COLUMNS)
# Binary encode the columns
df_one_hot['year_built_estimate'] = df_one_hot['year_built_estimate'].map({'Y': True, 'N': False})
# Fillna with False
df_one_hot['year_built_estimate'] = df_one_hot['year_built_estimate'].fillna(False)
df_one_hot['homestead_exemption'] = df_one_hot['homestead_exemption'].map({80000: True, 0: False})
# Fillna with False
df_one_hot['homestead_exemption'] = df_one_hot['homestead_exemption'].fillna(False)
# Exempt building should be false if 0, else true
df_one_hot['exempt_building'] = df_one_hot['exempt_building'].map({0: False})
# Fillna with True
df_one_hot['exempt_building'] = df_one_hot['exempt_building'].fillna(True)
# Exempt land should be false if 0, else true
df_one_hot['exempt_land'] = df_one_hot['exempt_land'].map({0.0: False})
# Fillna with True
df_one_hot['exempt_land'] = df_one_hot['exempt_land'].fillna(True)

  df_one_hot['year_built_estimate'] = df_one_hot['year_built_estimate'].fillna(False)
  df_one_hot['homestead_exemption'] = df_one_hot['homestead_exemption'].fillna(False)
  df_one_hot['exempt_building'] = df_one_hot['exempt_building'].fillna(True)
  df_one_hot['exempt_land'] = df_one_hot['exempt_land'].fillna(True)


In [21]:
# Check that all columns are numeric
df_one_hot.dtypes.value_counts()

bool       78
float64    19
Name: count, dtype: int64

That is a lot of boolean columns. We can probably drop some of these columns, but we will do that later. For now, we will just convert these columns to 0 and 1.

In [22]:
# Get columns with type bool
bool_columns = df_one_hot.select_dtypes(include=bool).columns
# Change type of columns to int
df_one_hot[bool_columns] = df_one_hot[bool_columns].astype(int)

In [23]:
# Delete outliers
df_outliers = df_one_hot
# Delete census_tract outliers
#df_outliers = df_outliers[df_outliers['census_tract'] < 500]
# Delete depth outliers < 200 and > 32
df_outliers = df_outliers[df_outliers['depth'] < 144]
df_outliers = df_outliers[df_outliers['depth'] > 32]
# Fireplace < 6
df_outliers = df_outliers[df_outliers['fireplaces'] < 6]
# Frontage < 140
df_outliers = df_outliers[df_outliers['frontage'] < 50]
# Garage spaces < 5
df_outliers = df_outliers[df_outliers['garage_spaces'] < 5]
# Market value < 10_000_000
df_outliers = df_outliers[df_outliers['market_value'] < 2_000_000]
# Number of bathrooms < 6
df_outliers = df_outliers[df_outliers['number_of_bathrooms'] < 6]
# Number of bedrooms < 15
df_outliers = df_outliers[df_outliers['number_of_bedrooms'] < 6]
# Number of stories < 6
df_outliers = df_outliers[df_outliers['number_stories'] < 6]
# Taxable building < 4_000_000
df_outliers = df_outliers[df_outliers['taxable_building'] < 1_000_000]
# Taxable land < 1_000_000
df_outliers = df_outliers[df_outliers['taxable_land'] < 200_000]
# Total area < 250_000
df_outliers = df_outliers[df_outliers['total_area'] < 16_000]
# Total livable area < 250_000
df_outliers = df_outliers[df_outliers['total_livable_area'] < 8_000]
# Year built > 1840
df_outliers = df_outliers[df_outliers['year_built'] > 1890]

In [24]:
# Export to CSV excluding columns starting with ONE_HOT_COLUMNS
df_out = df_outliers
for column in ONE_HOT_COLUMNS:
    df_out = df_out.loc[:, ~df_out.columns.str.startswith(column)]
df_out.to_csv('data_outliers.csv', index=False)

#df_outliers.to_csv('data_outliers.csv', index=False)
df_out.shape

(445963, 23)

In [25]:
# Create a correlation graphic agains the market value
correlation = df_out.corr()
# Print the correlation with the target variable
correlation['market_value'].sort_values(ascending=False)

market_value           1.000000
taxable_land           0.937928
taxable_building       0.765416
total_livable_area     0.517348
year_built             0.340224
total_area             0.180573
number_stories         0.180235
fireplaces             0.148861
number_of_bathrooms    0.148615
exempt_building        0.144842
homestead_exemption    0.113693
garage_spaces          0.107790
depth                  0.088641
frontage               0.087172
lng                    0.038749
off_street_open       -0.049713
number_of_bedrooms    -0.119201
lat                   -0.128467
census_tract          -0.138953
year_built_estimate   -0.174337
exempt_land           -0.252257
exterior_condition    -0.423758
interior_condition    -0.456576
Name: market_value, dtype: float64

Now that preprocessing is complete, we can move on to feature engineering. For this step we will be applying PCA to the data. With PCA, we will be having it reduce the dimension so that we retain 95% of the variance.

In [26]:
from sklearn.preprocessing import StandardScaler
# Standardize the Data
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df_outliers), columns=df_outliers.columns)

In [27]:
X = df_scaled.drop(columns='market_value')
y = df_scaled['market_value']

In [84]:
from sklearn.decomposition import PCA
PCA_VARIANCE = 0.8
# Create a PCA instance
pca = PCA(PCA_VARIANCE)
pca.fit(X)
# Transform the data
X_pca = pca.transform(X)
print(X.shape)
X_pca.shape

(445963, 96)


(445963, 56)

In [282]:
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [85]:
from sklearn.model_selection import train_test_split
X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

In [283]:
# Create a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [284]:
# Score the model
model.score(X_test, y_test)

0.9512293454070778

In [87]:
pca_model = LinearRegression()
pca_model.fit(X_pca_train, y_pca_train)
pca_model.score(X_pca_test, y_pca_test)

0.8361307053062926

In [286]:
# Create a decision tree model
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)
tree_model.score(X_test, y_test)

0.9818018612946061

In [289]:
# Create a random forest model
forest_model = RandomForestRegressor()
forest_model.fit(X_train, y_train)
forest_model.score(X_test, y_test)

0.9902814734126171

In [88]:
# Create a decision tree model with PCA
pca_tree_model = DecisionTreeRegressor()
pca_tree_model.fit(X_pca_train, y_pca_train)
pca_tree_model.score(X_pca_test, y_pca_test)

0.9002268095787773

In [29]:
# Create a random forest model with PCA
# pca_forest_model = RandomForestRegressor()
# pca_forest_model.fit(X_pca_train, y_pca_train)
# pca_forest_model.score(X_pca_test, y_pca_test)

NameError: name 'X_pca_train' is not defined

In [292]:
# Create a gradient boosting model
gradient_model = GradientBoostingRegressor()
gradient_model.fit(X_train, y_train)
gradient_model.score(X_test, y_test)

0.978827266418655

In [None]:
# Create a gradient boosting model with PCA
pca_gradient_model = GradientBoostingRegressor()
pca_gradient_model.fit(X_pca_train, y_pca_train)
pca_gradient_model.score(X_pca_test, y_pca_test)

0.8904228428584422

In [294]:
# Create a model with xgboost
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
xgb_model.score(X_test, y_test)

0.9927289109991099

In [None]:
# Create a model with xgboost with PCA
pca_xgb_model = xgb.XGBRegressor()
pca_xgb_model.fit(X_pca_train, y_pca_train)
pca_xgb_model.score(X_pca_test, y_pca_test)

0.9518887683463301

In [296]:
# Save all of the models
import pickle
pickle.dump(model, open('linear_model.pkl', 'wb'))
pickle.dump(tree_model, open('tree_model.pkl', 'wb'))
pickle.dump(forest_model, open('forest_model.pkl', 'wb'))
pickle.dump(gradient_model, open('gradient_model.pkl', 'wb'))
pickle.dump(xgb_model, open('xgb_model.pkl', 'wb'))

In [None]:
import pickle
pickle.dump(pca_model, open(f'pca_{PCA_VARIANCE}_linear_model.pkl', 'wb'))
pickle.dump(pca_tree_model, open(f'pca_{PCA_VARIANCE}_tree_model.pkl', 'wb'))
pickle.dump(pca_forest_model, open(f'pca_{PCA_VARIANCE}_forest_model.pkl', 'wb'))
pickle.dump(pca_gradient_model, open(f'pca_{PCA_VARIANCE}_gradient_model.pkl', 'wb'))
pickle.dump(pca_xgb_model, open(f'pca_{PCA_VARIANCE}_xgb_model.pkl', 'wb'))

In [30]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import logging
logging.basicConfig(filename='pca.log', level=logging.INFO, format="%(asctime)s:%(levelname)s:%(message)s")
scores = pd.DataFrame(columns=["variance", "linear", "tree", "forest", "gradient", "xgb"])
# scores: variance, linear, tree, forest, gradient, xgb
scores.set_index("variance", inplace=True)
for n in range(10, 11):
    variance = n / 10
    logging.info(f"{variance=}")
    scores.loc[variance, "variance"] = variance
    pca = PCA(variance)
    pca.fit(X)
    X_pca = pca.transform(X)
    X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)
    pca_model = LinearRegression()
    pca_model.fit(X_pca_train, y_pca_train)
    logging.info(f"{pca_model.score(X_pca_test, y_pca_test)=}")
    scores.loc[variance, "linear"] = pca_model.score(X_pca_test, y_pca_test)
    pca_tree_model = DecisionTreeRegressor()
    pca_tree_model.fit(X_pca_train, y_pca_train)
    logging.info(f"{pca_tree_model.score(X_pca_test, y_pca_test)=}")
    scores.loc[variance, "tree"] = pca_tree_model.score(X_pca_test, y_pca_test)
    pca_forest_model = RandomForestRegressor()
    pca_forest_model.fit(X_pca_train, y_pca_train)
    logging.info(f"{pca_forest_model.score(X_pca_test, y_pca_test)=}")
    scores.loc[variance, "forest"] = pca_forest_model.score(X_pca_test, y_pca_test)
    pca_gradient_model = GradientBoostingRegressor()
    pca_gradient_model.fit(X_pca_train, y_pca_train)
    logging.info(f"{pca_gradient_model.score(X_pca_test, y_pca_test)=}")
    scores.loc[variance, "gradient"] = pca_gradient_model.score(X_pca_test, y_pca_test)
    pca_xgb_model = xgb.XGBRegressor()
    pca_xgb_model.fit(X_pca_train, y_pca_train)
    logging.info(f"{pca_xgb_model.score(X_pca_test, y_pca_test)=}")
    scores.loc[variance, "xgb"] = pca_xgb_model.score(X_pca_test, y_pca_test)
    scores.to_csv("scores.csv", index="variance")

InvalidParameterError: The 'n_components' parameter of PCA must be an int in the range [0, inf), a float in the range (0.0, 1.0), a str among {'mle'} or None. Got 1.0 instead.