In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Leer Datos

In [2]:
df = pd.read_csv("data_processed.csv")
df.head()

Unnamed: 0,sex,Age,Married,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_salary,incoming_own_farm,...,11 family members,12 family members,2 family members,3 family members,4 family members,5 family members,6 family members,7 family members,8 family members,9 family members
0,1,28,1,28912201,22861940,23399979,26692283,28203066,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,23,1,28912201,22861940,23399979,26692283,28203066,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,22,1,28912201,22861940,23399979,26692283,28203066,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,27,1,52667108,19698904,49647648,397715,44042267,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,59,0,82606287,17352654,23399979,80877619,74503502,1,0,...,0,0,0,0,0,0,1,0,0,0


In [3]:
print('\nEl numero de filas y columnas en el data set: ',df.shape)
print('')


El numero de filas y columnas en el data set:  (1409, 60)



## Genero datos de entrenamiento y test

In [4]:
X = df.drop('depressed',axis=1) # Variable independiente
y = df['depressed'] # Variable dependiente

# KNN 
## RFE

In [5]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [6]:
model = LogisticRegression(solver='lbfgs', max_iter=4000)

In [7]:
# Se aplica RFE, Se pide los 5 atributos top
rfe = RFE(estimator=model, n_features_to_select=5)
rfe = rfe.fit(X,y)

In [8]:
# Imprimo la seleccion de atributos
print(rfe.support_)
print(rfe.ranking_)
print(X.columns.values)

[False False False  True False  True  True False False False False False
  True  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False]
[ 7  9  6  1  5  1  1  3 36 34 20 16  1  1 25  2  4 14  8 53 43 30 17 41
 32 27 21 40 51 15 11 23 31 46 24 50 49 45 54 48 52 37 19 44 33 26 10 38
 35 47 55 22 13 29 12 39 18 42 28]
['sex' 'Age' 'Married' 'gained_asset' 'durable_asset' 'save_asset'
 'living_expenses' 'other_expenses' 'incoming_salary' 'incoming_own_farm'
 'incoming_business' 'incoming_no_business' 'incoming_agricultural'
 'farm_expenses' 'labor_primary' 'lasting_investment'
 'no_lasting_investmen' '0 children' '1 children' '10 children'
 '11 children' '2 children' '3 children' '4 children' '5 children'
 '6 children' '7 children' '8 children' '9 children' '1 education

In [9]:
# Los 5 atributos top son 'education.num' ,  'marital.status' , 'sex_ Male'
X_rfe = X[['gained_asset','save_asset','living_expenses','incoming_agricultural', 'farm_expenses']]

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_rfe,y,test_size=0.2,random_state=23)

In [11]:
print("Numero de datos de entrenamiento: ", len(X_train))
print("Numero de datos en test: ", len(X_test))

Numero de datos de entrenamiento:  1127
Numero de datos en test:  282


In [12]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

over_sampler = RandomOverSampler(random_state=42)
X_res, y_res = over_sampler.fit_resample(X_train, y_train)
print(f"Training target statistics: {Counter(y_res)}")
print(f"Testing target statistics: {Counter(y_test)}")

Training target statistics: Counter({0: 933, 1: 933})
Testing target statistics: Counter({0: 241, 1: 41})


## Aplicando KNN

In [13]:
leaf_size = list(range(1,20))
n_neighbors = list(range(1,10))
p=[1,2]
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

In [14]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_res, y_res)

KNeighborsClassifier()

In [15]:
from sklearn.model_selection import GridSearchCV
gsv = GridSearchCV(knn, hyperparameters, cv=10)

In [16]:
best_model = gsv.fit(X_res, y_res)

print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

In [17]:
best_knn = KNeighborsClassifier(leaf_size=1, n_neighbors=1, p=1)
best_knn.fit(X_res, y_res)

KNeighborsClassifier(leaf_size=1, n_neighbors=1, p=1)

In [18]:
# Realizamos la prediccion
y_pred = best_knn.predict(X_test)

In [19]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

acc_knn = metrics.accuracy_score(y_test,y_pred)
acc_knn #0.8439716312056738

0.7375886524822695

In [20]:
# Genero la matriz de confusion
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,y_pred))

[[204  37]
 [ 37   4]]


In [21]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       241
           1       0.10      0.10      0.10        41

    accuracy                           0.74       282
   macro avg       0.47      0.47      0.47       282
weighted avg       0.74      0.74      0.74       282

