# Model assumptions

In [1]:
import pandas as pd
from auxiliar_func import *
from plot_func import *
import os
from sklearn.covariance import EmpiricalCovariance

In this notebook we check whether the assumptions made by each model are satisfied by our dataset.

In [2]:
# --- Notebook-wide settings -------------------------------------------------
SEED = 42                    # fixed seed so that results can be reproduced
TARGET_METRIC = 'f1_macro'   # name of the metric used in the grid search

target = 'income_50k'        # column that holds the class label

# Training set, read relative to the notebook's working directory.
df_tr = pd.read_csv('../train.csv')

### Discriminant Analysis

In [3]:
# Use the preprocessing parameters returned by get_best_params when the LDA
# results file exists; otherwise preprocess with the defaults.
if not os.path.exists('./results/results_lda.csv'):
    preprocessing_params = {}
else:
    preprocessing_params, _ = get_best_params('results/results_lda.csv')

preproc_lda = preprocessing(df_tr, **preprocessing_params)

# Empirical covariance of the features within each class (target == 0 and
# target == 1), so that the two matrices can be compared entry by entry.
features_class0 = preproc_lda.loc[preproc_lda[target] == 0].drop(columns=target)
features_class1 = preproc_lda.loc[preproc_lda[target] == 1].drop(columns=target)
cov1 = EmpiricalCovariance().fit(features_class0).covariance_
cov2 = EmpiricalCovariance().fit(features_class1).covariance_

largest_entry = max(np.abs(cov1).max(), np.abs(cov2).max())
abs_diff = np.abs(cov1 - cov2)

print("The maximum value of both covariance matrices is:", largest_entry)
print("The maximum difference between the covariance matrices is:", abs_diff.max())
print("The minimum difference between the covariance matrices is:", abs_diff.min())

# Note: np.allclose applies its default relative tolerance in addition to atol.
print('Covariance matrices are equal with a delta error of 0.1:', np.allclose(cov1, cov2, atol=0.1))

The maximum value of both covariance matrices is: 9.796603255272977
The maximum difference between the covariance matrices is: 9.605063550677642
The minimum difference between the covariance matrices is: 0.0
Covariance matrices are equal with a delta error of 0.1: False


### Logistic regression

In [32]:
# Preprocess the training set for logistic regression. The parameters come
# from get_best_params when the results file exists; otherwise the defaults
# of preprocessing() are used.
logreg_results_found = os.path.exists('./results/results_logreg.csv')
if logreg_results_found:
    preprocessing_params, _ = get_best_params('results/results_logreg.csv')
else:
    preprocessing_params = {}

preproc_logreg = preprocessing(df_tr, **preprocessing_params)

It assumes that there is little or no multicollinearity among the independent variables.

In [30]:
# Restrict the check to the float64/int64 columns of the preprocessed data.
preproc_logreg_numerical = preproc_logreg.select_dtypes(include=['float64', 'int64'])

# Variance inflation factor of every selected column, computed against the
# remaining selected columns and reported next to the column name.
numerical_matrix = preproc_logreg_numerical.values
n_numerical = preproc_logreg_numerical.shape[1]
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(numerical_matrix, col_idx)
        for col_idx in range(n_numerical)
    ],
    "features": preproc_logreg_numerical.columns,
})

print(vif)

   VIF Factor         features
0    1.028530    wage_per_hour
1    1.024095    capital_gains
2    1.011231   capital_losses
3    1.017368  stock_dividends
4    1.857902          num_emp
5    1.872220     weeks_worked


It usually requires a large sample size to predict properly.

In [15]:
# (rows, columns) of the preprocessed logistic-regression data, shown as the cell output
preproc_logreg.shape

(58633, 483)

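It assumes independent observations. Exact duplicate rows are only a rough proxy for dependence between observations, but they are easy to check: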

In [33]:
# Flag whether at least one row is an exact copy of an earlier row.
has_duplicated_rows = preproc_logreg.duplicated().any()
print('There are duplicated rows:', has_duplicated_rows)

There are duplicated rows: True


### K-Nearest Neighbors

In [3]:
# Preprocess the training set for K-Nearest Neighbors: default parameters
# unless the KNN results file exists, in which case the parameters returned by
# get_best_params are used.
if not os.path.exists('./results/results_knn.csv'):
    preprocessing_params = {}
else:
    preprocessing_params, _ = get_best_params('results/results_knn.csv')

preproc_knn = preprocessing(df_tr, **preprocessing_params)

It assumes that the data lie in a feature space in which the distance between observations can be measured with a metric such as Manhattan or Euclidean.

In [5]:
# Pairwise distances between observations (Hamming, because there are
# categorical features).
#
# pdist returns a condensed vector with n * (n - 1) / 2 entries. On the full
# training set (tens of thousands of rows) that is over a billion float64
# values, which does not fit in memory. The check only needs to show that
# distances can be computed, so it runs on a seeded random subsample.
from scipy.spatial.distance import pdist

PDIST_SAMPLE_SIZE = 2_000  # upper bound on the number of rows passed to pdist

features_knn = preproc_knn.drop(columns=target)
features_knn_sample = features_knn.sample(
    n=min(PDIST_SAMPLE_SIZE, len(features_knn)),  # never ask for more rows than exist
    random_state=SEED,                            # reproducible subsample
)

distances = pdist(features_knn_sample, metric='hamming')
print(distances[:10])

Each training data point consists of a feature vector and a class label associated with that vector.

In [None]:
# Show one observation (the first row) as an example of a feature vector
# together with its class label.
example_observation = preproc_knn.iloc[0]
print(example_observation)