In [3]:
# EDA
import pandas as pd
import pingouin as pg
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [4]:

dfAluno = pd.read_csv('datasets/MBA.csv')

In [5]:
dfAluno.head(10)


Unnamed: 0,application_id,gender,international,gpa,major,race,gmat,work_exp,work_industry,admission
0,1,Female,False,3.3,Business,Asian,620.0,3.0,Financial Services,Admit
1,2,Male,False,3.28,Humanities,Black,680.0,5.0,Investment Management,
2,3,Female,True,3.3,Business,,710.0,5.0,Technology,Admit
3,4,Male,False,3.47,STEM,Black,690.0,6.0,Technology,
4,5,Male,False,3.35,STEM,Hispanic,590.0,5.0,Consulting,
5,6,Male,False,3.18,Business,White,610.0,6.0,Consulting,
6,7,Female,False,2.93,STEM,Other,590.0,3.0,Technology,Admit
7,8,Male,True,3.02,Business,,630.0,6.0,Financial Services,
8,9,Male,False,3.24,Business,White,590.0,2.0,Nonprofit/Gov,
9,10,Male,False,3.27,Humanities,Asian,690.0,3.0,Consulting,


In [6]:
dfAluno.columns = ['id', 'sexo', 'international', 'mediaNotas', 'graduacao', 'raca', 'gmat', 'anosExperiencia', 'setorExperiencia', 'admissao']

In [7]:
dfAluno.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6194 entries, 0 to 6193
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                6194 non-null   int64  
 1   sexo              6194 non-null   object 
 2   international     6194 non-null   bool   
 3   mediaNotas        6194 non-null   float64
 4   graduacao         6194 non-null   object 
 5   raca              4352 non-null   object 
 6   gmat              6194 non-null   float64
 7   anosExperiencia   6194 non-null   float64
 8   setorExperiencia  6194 non-null   object 
 9   admissao          1000 non-null   object 
dtypes: bool(1), float64(3), int64(1), object(5)
memory usage: 441.7+ KB


In [8]:
# Exclusão de variavel irrelevate

dfAluno.drop('id', axis=1, inplace=True)

In [9]:
# Valores posssiveis

dfAluno['graduacao'].unique()

array(['Business', 'Humanities', 'STEM'], dtype=object)

In [10]:
# Valores posssiveis

dfAluno['raca'].unique()

array(['Asian', 'Black', nan, 'Hispanic', 'White', 'Other'], dtype=object)

In [11]:
# Valores posssiveis

dfAluno['setorExperiencia'].unique()

array(['Financial Services', 'Investment Management', 'Technology',
       'Consulting', 'Nonprofit/Gov', 'PE/VC', 'Health Care',
       'Investment Banking', 'Other', 'Retail', 'Energy', 'CPG',
       'Real Estate', 'Media/Entertainment'], dtype=object)

In [12]:
# Valores posssiveis

dfAluno['admissao'].unique()

array(['Admit', nan, 'Waitlist'], dtype=object)

In [13]:
# Substituição de valores ausentes

dfAluno.fillna(value={"admissao": "deny"}, inplace=True)
contagem_target = dfAluno.value_counts("admissao")
contagem_target

admissao
deny        5194
Admit        900
Waitlist     100
Name: count, dtype: int64

In [14]:
# Lista ordenada da admissao

lista_admissao = ['deny', 'Waitlist', 'Admit']

In [15]:
# Distribuição da variavel
px.bar(contagem_target, color=contagem_target.index, category_orders={"admissao": lista_admissao})

In [16]:
contagem_target = dfAluno.value_counts("graduacao")
px.bar(contagem_target, color=contagem_target.index, category_orders={"graduacao": lista_admissao})

In [17]:
contagem_target = dfAluno.value_counts("raca")
px.bar(contagem_target, color=contagem_target.index, category_orders={"raca": lista_admissao})

In [18]:
contagem_target = dfAluno.value_counts("sexo")
px.bar(contagem_target, color=contagem_target.index, category_orders={"sexo": lista_admissao})

In [23]:
# Tabela de contigência

crosstab = pd.crosstab(dfAluno["international"], dfAluno["admissao"], margins=True)[lista_admissao].reset_index()

tabela = ff.create_table(crosstab)

In [24]:
tabela

In [27]:
# distribuicao de notas

px.histogram(dfAluno, x="gmat")

In [28]:
# distribuicao de media de notas

px.histogram(dfAluno, x="mediaNotas")

In [29]:
dfAluno.describe()

Unnamed: 0,mediaNotas,gmat,anosExperiencia
count,6194.0,6194.0,6194.0
mean,3.250714,651.092993,5.016952
std,0.151541,49.294883,1.032432
min,2.65,570.0,1.0
25%,3.15,610.0,4.0
50%,3.25,650.0,5.0
75%,3.35,680.0,6.0
max,3.77,780.0,9.0


In [36]:
# Boxplot

px.box(dfAluno, y="mediaNotas", x="admissao", category_orders={"admissao": lista_admissao})

In [35]:
px.box(dfAluno, y="gmat", x="admissao", category_orders={"admissao": lista_admissao})

In [37]:
px.box(dfAluno, y="anosExperiencia", x="admissao", category_orders={"admissao": lista_admissao})

In [49]:
# Teste de Qui-Quadrado de Person

valorEsperado, valorObservado,estatisticas  = pg.chi2_independence(dfAluno, 'admissao', 'graduacao')

In [41]:
valorEsperado

graduacao,Business,Humanities,STEM
admissao,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Admit,267.064902,360.494026,272.441072
Waitlist,29.673878,40.054892,30.27123
deny,1541.261221,2080.451082,1572.287698


In [42]:
valorObservado

graduacao,Business,Humanities,STEM
admissao,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Admit,270,367,263
Waitlist,22,41,37
deny,1546,2073,1575


In [47]:
estatisticas

Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,0.677178,2.0,0.712775,0.010456,0.104517
1,cressie-read,0.666667,0.675832,2.0,0.713255,0.010446,0.104403
2,log-likelihood,0.0,0.673176,2.0,0.714203,0.010425,0.104176
3,freeman-tukey,-0.5,0.671214,2.0,0.714904,0.01041,0.104008
4,mod-log-likelihood,-1.0,0.669277,2.0,0.715597,0.010395,0.103843
5,neyman,-2.0,0.665481,2.0,0.716956,0.010365,0.103519


In [52]:
# Teste de Qui-Quadrado de Person

valorEsperado, valorObservado,estatisticas  = pg.chi2_independence(dfAluno, 'admissao', 'mediaNotas')


Low count on observed frequencies.


Low count on expected frequencies.


divide by zero encountered in power


invalid value encountered in multiply


divide by zero encountered in divide



In [53]:
estatisticas

Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,746.737281,200.0,4.497179e-64,0.245518,1.0
1,cressie-read,0.666667,746.104804,200.0,5.675193e-64,0.245414,1.0
2,log-likelihood,0.0,819.195386,200.0,7.710609e-76,0.257154,1.0
3,freeman-tukey,-0.5,,200.0,,,
4,mod-log-likelihood,-1.0,inf,200.0,0.0,inf,
5,neyman,-2.0,,200.0,,,


In [56]:
# Teste de Qui-Quadrado de Person

valorEsperado, valorObservado,estatisticas  = pg.chi2_independence(dfAluno, 'admissao', 'anosExperiencia')


Low count on observed frequencies.


Low count on expected frequencies.


divide by zero encountered in power


invalid value encountered in multiply


divide by zero encountered in divide



In [57]:
estatisticas

Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,13.244879,16.0,0.654776,0.032698,0.286777
1,cressie-read,0.666667,13.407468,16.0,0.642764,0.032898,0.290515
2,log-likelihood,0.0,14.532575,16.0,0.559102,0.034251,0.316634
3,freeman-tukey,-0.5,,16.0,,,
4,mod-log-likelihood,-1.0,inf,16.0,0.0,inf,
5,neyman,-2.0,,16.0,,,


p-value > 0.05, ou seja, hipótese nul. Caso contrário, há relação entre as variáveis. Ou seja, a admissão no MBA tem relação com a nota.

### Treinamento do modelo

In [59]:
x = dfAluno.drop("admissao", axis=1)
y = dfAluno["admissao"]

In [63]:
# Pipeline
# OneHotEncoder
# Treinamento do modelo

# Variaveis categoricas 
categoricas = ["graduacao", "sexo", "raca", "setorExperiencia"]

# transformador de variaveis categoricas com OneHotEncoder
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categoricas)
    ])

dt_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', DecisionTreeClassifier())])



### Validação cruzada

In [66]:
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
reusltadoMetricas = cross_validate(dt_model, x, y, cv=cv_folds, scoring='accuracy', return_estimator=True)

In [67]:
reusltadoMetricas

{'fit_time': array([0.03816772, 0.03500581, 0.03533459, 0.03300285, 0.03300166]),
 'score_time': array([0.00599623, 0.00500274, 0.00499821, 0.00600052, 0.00700378]),
 'estimator': [Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('cat',
                                                    Pipeline(steps=[('onehot',
                                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                                    ['graduacao', 'sexo', 'raca',
                                                     'setorExperiencia'])])),
                  ('classifier', DecisionTreeClassifier())]),
  Pipeline(steps=[('preprocessor',
                   ColumnTransformer(transformers=[('cat',
                                                    Pipeline(steps=[('onehot',
                                                                     OneHotEncoder(handle_unknown='ignore'))]),
                             