## Proyecto Census Income

In [1]:
#Importamos las librerias principales
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

Cargamos los datos

In [2]:
# Load the Census Income ("adult") dataset from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer='adult.csv')

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,EducationNum,Marital Status,Occupation,Relationship,Race,Gender,Capital Gain,capital loss,Hours per Week,Native Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Análisis exploratorio de datos

In [4]:
# Size of the loaded frame as (rows, columns).
df.shape

(32561, 15)

In [5]:
# Inspect the data type of each column (int64 = numeric, object = text/categorical).
df.dtypes

Age                int64
Workclass         object
Final Weight       int64
Education         object
EducationNum       int64
Marital Status    object
Occupation        object
Relationship      object
Race              object
Gender            object
Capital Gain       int64
capital loss       int64
Hours per Week     int64
Native Country    object
Income            object
dtype: object

#### Tenemos 6 variables numéricas y 9 variables categóricas

In [6]:
# Column-level summary: non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Age             32561 non-null  int64 
 1   Workclass       32561 non-null  object
 2   Final Weight    32561 non-null  int64 
 3   Education       32561 non-null  object
 4   EducationNum    32561 non-null  int64 
 5   Marital Status  32561 non-null  object
 6   Occupation      32561 non-null  object
 7   Relationship    32561 non-null  object
 8   Race            32561 non-null  object
 9   Gender          32561 non-null  object
 10  Capital Gain    32561 non-null  int64 
 11  capital loss    32561 non-null  int64 
 12  Hours per Week  32561 non-null  int64 
 13  Native Country  32561 non-null  object
 14  Income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
# Explore the target variable: number of rows in each Income class.
df['Income'].value_counts()

Income
<=50K    24720
>50K      7841
Name: count, dtype: int64

In [8]:
# Check for missing values: count of nulls per column.
# NOTE(review): this only counts NaN/None; placeholder strings (e.g. '?') would
# not be reported here — verify the raw file before assuming there are no gaps.
df.isnull().sum()

Age               0
Workclass         0
Final Weight      0
Education         0
EducationNum      0
Marital Status    0
Occupation        0
Relationship      0
Race              0
Gender            0
Capital Gain      0
capital loss      0
Hours per Week    0
Native Country    0
Income            0
dtype: int64

### Procesamiento de datos

In [9]:
# Separate the target vector (y) from the feature matrix (X = every other column).
y = df['Income']
X = df.drop(columns=['Income'])

In [10]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the target classes are imbalanced (24720 vs 7841 rows); consider
# passing stratify=y if class proportions should match across both splits.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Shapes of the train and test feature matrices, as ((rows, cols), (rows, cols)).
X_train.shape, X_test.shape

((22792, 14), (9769, 14))

### Entrenamiento de modelo de clasificación Random Forest (ensamble de árboles de decisión)

In [12]:
# Encode the categorical (object) columns as integer codes.
# The encoder learns its mapping from the training split only and the same
# mapping is then applied to both splits.
import category_encoders as ce

encoder = ce.OrdinalEncoder()
encoder.fit(X_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [13]:
# Check the transformation: preview the first rows of the encoded training features.
X_train.head()

Unnamed: 0,Age,Workclass,Final Weight,Education,EducationNum,Marital Status,Occupation,Relationship,Race,Gender,Capital Gain,capital loss,Hours per Week,Native Country
19749,34,1,56460,1,9,1,1,1,1,1,0,2179,12,1
1216,48,1,243631,2,10,1,2,2,2,2,7688,0,40,1
27962,23,2,56402,2,10,1,3,2,1,2,0,0,30,1
23077,56,3,255406,1,9,2,4,3,1,1,0,0,40,1
10180,17,4,297246,3,7,3,5,4,1,1,0,0,9,1


In [15]:
# Import the random-forest classifier (an ensemble of decision trees).
from sklearn.ensemble import RandomForestClassifier

# Build the model: 10 trees, fixed seed for reproducible results.
# Note: despite its name, `tree` holds a RandomForestClassifier, not a single decision tree.
tree = RandomForestClassifier(
    n_estimators=10,
    random_state=42,
)

In [None]:
# Fit the model on the training features and labels.
tree.fit(X_train,y_train)

In [None]:
# Compute the model's predictions on the training split.
y_train_pred_tree = tree.predict(X_train)

In [None]:
# Compute the model's predictions on the held-out test split.
y_test_pred_tree = tree.predict(X_test)

### Evaluación del modelo de clasificación

In [None]:
# Evaluation metric
from sklearn.metrics import accuracy_score

# Accuracy on the training split and on the held-out test split;
# a large gap between the two suggests the model is overfitting.
train_accurace_tree = accuracy_score(y_true=y_train, y_pred=y_train_pred_tree)
test_accurace_tree = accuracy_score(y_true=y_test, y_pred=y_test_pred_tree)

print('El accuracy en train es: ', train_accurace_tree)
print('El accuracy en test es: ', test_accurace_tree)

In [None]:
# Feature importances of the fitted model: one row per feature, sorted from
# most to least important.
# Feature names are taken from X_train (the matrix the model was fitted on)
# rather than the pre-split frame X, so they are guaranteed to line up with
# the order of tree.feature_importances_.
feature_importances_df = pd.DataFrame(
    {
        "feature": list(X_train.columns),
        "importances": tree.feature_importances_,
    }
).sort_values("importances", ascending=False)

# Display the table
feature_importances_df

In [None]:
# Bar chart of the feature importances (seaborn and matplotlib are already
# imported in the notebook's first cell, so they are not re-imported here).
# Features go on the x-axis and their importance score on the y-axis; the axis
# labels below match that orientation (they were previously swapped).
fig, ax = plt.subplots()
sns.barplot(data=feature_importances_df, x="feature", y="importances", ax=ax)
ax.set_xlabel('Features')
ax.set_ylabel('Features Importances Score')
ax.set_title("Visualización de la importancia de cada Feature")
# Rotate the feature names so the 14 tick labels do not overlap.
plt.xticks(rotation=45, fontsize="x-large", horizontalalignment="right", fontweight='light')
plt.show()