# Model assumptions

In [1]:
import pandas as pd
from auxiliar_func import *
from plot_func import *
import os

In this notebook we are going to check if the assumptions made by the models are satisfied by our dataset

In [2]:
target = 'income_50k'               # target variable
df_tr = pd.read_csv('../train.csv') # training set

### Discriminant Analysis

In [15]:
preprocessing_params = {}
if os.path.exists('./results/results_lda.csv'):
    preprocessing_params, _ = get_best_params('results/results_lda.csv')

preproc_lda = preprocessing(df_tr, **preprocessing_params)

With all variables: 

In [16]:
from sklearn.covariance import EmpiricalCovariance

cov1 = EmpiricalCovariance().fit(preproc_lda[preproc_lda[target] == 0].drop(columns=target)).covariance_
cov2 = EmpiricalCovariance().fit(preproc_lda[preproc_lda[target] == 1].drop(columns=target)).covariance_

print("The maximum value of both covariance matrices is:", max(np.max(np.abs(cov1)), np.max(np.abs(cov2))))
print("The maximum difference between the covariance matrices is:", np.max(np.abs(cov1 - cov2)))
print("The minimum difference between the covariance matrices is:", np.min(np.abs(cov1 - cov2)))

print('Covariance matrices are equal with a delta error of 0.1:', np.allclose(cov1, cov2, atol=0.1))

The maximum value of both covariance matrices is: 12.594888924778843
The maximum difference between the covariance matrices is: 12.376619667775989
The minimum difference between the covariance matrices is: 0.0
Covariance matrices are equal with a delta error of 0.1: False


Only numerical variables: 

In [39]:
from scipy.stats import bartlett

preproc_lda_numerical = preproc_lda.select_dtypes(include=['int64', 'float64'])
preproc_lda_numerical[target] = preproc_lda[target]

class1 = preproc_lda_numerical[preproc_lda_numerical[target] == 0].drop(columns=target)
class2 = preproc_lda_numerical[preproc_lda_numerical[target] == 1].drop(columns=target)

bartlett_test = bartlett(*[class1[col] for col in class1.columns], *[class2[col] for col in class2.columns])
print("Bartlett test p-value:", bartlett_test.pvalue)

# The null hypothesis of the Bartlett test is that all input samples are from populations with equal variances.
# The p-value is very small, so we can reject the null hypothesis and conclude that the input samples are not from populations with equal variances.

Bartlett test p-value: 0.0


### Logistic regression

In [3]:
preprocessing_params = {}
if os.path.exists('./results/results_logreg.csv'):
    preprocessing_params, _ = get_best_params('results/results_logreg.csv')

preproc_logreg = preprocessing(df_tr, **preprocessing_params)

It assumes that there is minimal or no multicolinearity among the independent variables

In [10]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# get only numerical columns
preproc_logreg_numerical = preproc_logreg.select_dtypes(include=['float64', 'int64'])

# calculate VIF for each feature
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(preproc_logreg_numerical.values, i) for i in range(preproc_logreg_numerical.shape[1])]
vif["features"] = preproc_logreg_numerical.columns

print(vif)


   VIF Factor         features
0    1.030398    wage_per_hour
1    1.026089    capital_gains
2    1.011783   capital_losses
3    1.017805  stock_dividends
4    1.872908          num_emp
5    1.891849     weeks_worked


It usually requires a larga sample size to predict properly

In [5]:
preproc_logreg.shape

(58339, 483)

It assumes independent observations

In [6]:
# check if there are duplicated rows
print('There are duplicated rows:', preproc_logreg.duplicated().any())

There are duplicated rows: False
