In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer


In [2]:
# Read the pre-split vehicle data; the first CSV column becomes the row index.
df_train = pd.read_csv("data/vehicle_dataset_train.csv", index_col=0)
df_test = pd.read_csv("data/vehicle_dataset_test.csv", index_col=0)

# Separate the target (selling_price) from the predictors for each split.
y_train = df_train['selling_price']
X_train = df_train.drop(columns=['selling_price'])
y_test = df_test['selling_price']
X_test = df_test.drop(columns=['selling_price'])

# Number of missing values in each training predictor column.
X_train.isna().sum()

year          0
mileage       0
max_power    44
dtype: int64

Use a simple (mean) imputer to fill in the missing `max_power` values.

In [3]:
# Learn the per-column means from the training predictors only, then use those
# same means to fill the gaps in both the training and the test split.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_1 = imputer.transform(X_train)
X_test_1 = imputer.transform(X_test)

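Use kNN regression (`KNNImputer`) to impute the missing data. The features need to be scaled before imputation, because kNN is distance-based and features with large ranges would otherwise dominate the neighbour search.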

In [4]:
# Fit the scaler on the training predictors ONLY. Fitting on train + test
# would leak test-set means/variances into preprocessing and bias any later
# evaluation on the test split. StandardScaler disregards NaNs when fitting
# and keeps them in place when transforming, so the missing values survive
# this step and are still there for the imputer below.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fill each missing value from its 2 nearest neighbours. The neighbours are
# always drawn from the (scaled) training rows, including when imputing the
# test split. Note that the imputed outputs stay in standardized units.
knn_imputer = KNNImputer(n_neighbors=2)
X_train_2 = knn_imputer.fit_transform(X_train_scaled)
X_test_2 = knn_imputer.transform(X_test_scaled)

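Check the correlation between the predictors. Use `np.where` to get the row and column indices of the correlation-matrix entries we want (those with a very high absolute correlation).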

In [5]:
# Load the communities-and-crime data; the first CSV column is the row index.
df = pd.read_csv("data/communities_and_crime.csv", index_col=0)

# Target is ViolentCrimesPerPop; every other column is a predictor.
y = df['ViolentCrimesPerPop']
X = df.drop(columns=['ViolentCrimesPerPop'])

# 80/20 split with a fixed seed for reproducibility.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which previously
# held the vehicle data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Column labels and the pairwise correlation matrix of the training predictors.
col_names = X_train.columns
df_corr = X_train.corr()

# For a 2-D boolean mask, np.where returns (row_positions, col_positions) of
# the True entries; here, every entry whose correlation magnitude exceeds 0.99.
itemindex = np.where(df_corr.abs() > 0.99)
print("The following pairs of predictor variables have correlation greater than 0.99 or less than -0.99:")

# Keep only entries strictly below the diagonal (row > col): this drops the
# trivial self-correlations and reports each symmetric pair exactly once.
high_corr_pairs = {}
for row, col in zip(*itemindex):
    if row <= col:
        continue
    high_corr_pairs[(col_names[row], col_names[col])] = round(df_corr.iloc[row, col], 4)
high_corr_pairs

The following pairs of predictor variables have correlation greater than 0.99 or less than -0.99:


{('numbUrban', 'population'): 0.9998,
 ('PctRecImmig5', 'PctRecentImmig'): 0.992,
 ('PctRecImmig8', 'PctRecImmig5'): 0.9963,
 ('PctRecImmig10', 'PctRecImmig5'): 0.9901,
 ('PctRecImmig10', 'PctRecImmig8'): 0.9966,
 ('OwnOccMedVal', 'OwnOccLowQuart'): 0.9917,
 ('MedRent', 'RentMedian'): 0.993,
 ('LemasSwFTFieldOps', 'LemasSwornFT'): -0.9903,
 ('PolicPerPop', 'LemasSwFTPerPop'): 1.0}