# Loading data

In [None]:
# Change the working directory to the parent folder; the relative data path
# used when loading the CSV is resolved from there.
%cd ..

c:\Users\nick\OneDrive\Desktop\Prospect 33\Mini_DIVA


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [None]:
# Location of the input CSV: the new data is kept in the new_datasets folder
data_dir = "../Mini_DIVA/new_datasets/"
file_dir = f"{data_dir}King_county.csv"

# Load the CSV into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer=file_dir)
df.head()

In [None]:
# Report the frame's (rows, columns) tuple
rows_and_cols = df.shape
print(rows_and_cols)

# Show the number of missing values in each column
missing_per_column = df.isna().sum()
display(missing_per_column)

(21613, 21)


id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [None]:
# Drop every row that contains at least one missing value.
# Re-binding (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna()

# Confirm no missing value remains anywhere in the frame.
# `.isna().any().any()` is True if at least one cell is still NaN, so the
# assertion fails as soon as a single missing value survives.
assert not df.isna().any().any(), "missing values remain after dropna()"

# Subsampling

The original datasets are too large to use in DIVA as they are. Therefore, I need to resample the datasets to a sample size that is more favourable for the resources at hand.

Originally, I planned to use a sample size of 10000; however, a sample of that size took too long to impute. I subsampled to 5000, then to 1000, and finally settled on 1500.

## Random sampling

This method was used on data with continuous target variables, as random sampling is unlikely to distort the target distribution much.


In [None]:
# Draw a reproducible random subsample of 1500 rows without replacement,
# keeping the original row labels (ignore_index=False)
samp1_df = df.sample(
    n=1500,
    replace=False,
    random_state=42,
    ignore_index=False,
)

# Write the subsample next to the source file as "<name>_resampled.csv"
resampled_path = file_dir.removesuffix(".csv") + "_resampled.csv"
samp1_df.to_csv(path_or_buf=resampled_path)

## Stratified sampling

This method is better than random sampling for data with categorical target variables, as it preserves the original class proportions of the target in the sample.

In [None]:
# resample the data by stratified sampling (FOR CATEGORICAL TARGET VARIABLES)
# An integer train_size is an absolute number of rows. Passing the fraction
# 1500 / n_rows instead lets floating-point rounding floor the sample to 1499.
sampler = StratifiedShuffleSplit(n_splits=1, random_state=42, train_size=1500)

# specify target variable
# NOTE(review): "repay_fail" is not among the King County columns listed in the
# missing-value summary above; this cell looks intended for a different
# dataset -- confirm before running.
target = "repay_fail"

# Encode the yes/no target as 1/0. The result is assigned back to the column:
# calling replace(..., inplace=True) on `df[target]` mutates a temporary object
# and leaves `df` unchanged when pandas Copy-on-Write is active.
df[target] = df[target].replace({"yes": 1, "no": 0})

X = df.drop(columns=target)
y = df[target]

# share of positive (== 1) rows in the full data
original_ratio = df[target].sum() / df.shape[0]

# n_splits=1, so the generator yields exactly one (train, test) pair
train, test = next(sampler.split(X=X, y=y))

# split() returns positional row numbers, not index labels, so select with
# iloc; loc would pick wrong rows (or raise) when the index is not 0..n-1,
# e.g. after rows have been dropped.
samp2_df = df.iloc[train]
test_data = df.iloc[test]

# check if original distribution is maintained
print("Original ratio:", original_ratio)
print(samp2_df[target].sum() / samp2_df.shape[0])
print(test_data[target].sum() / test_data.shape[0])

# export as a csv file
# NOTE(review): this is the same output path the random-sampling cell writes
# to, so whichever cell runs last overwrites the other's file.
samp2_df.to_csv(path_or_buf=file_dir.removesuffix(".csv") + "_resampled.csv")

Original ratio: 0.1478893974678123
0.148
0.14788477090827964


For small datasets that do not require subsampling, the cleaned data replaces the original file in place.

In [None]:
# # Export cleaned data (Replacement)
# df.to_csv(file_dir)