<a href="https://colab.research.google.com/github/LabSWPP12023S2G2/TPInicial/blob/main/datarefUNC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importamos librerias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Leemos el dataset desde el repo
url = 'https://raw.githubusercontent.com/LabSWPP12023S2G2/TPInicial/main/datasetUNC.csv'
df = pd.read_csv(url, delimiter=';')

In [3]:
# Eliminamos posibles records que esten vacíos
df = df.dropna(axis=0)

In [4]:
# Vemos las variables del dataset
df.columns

Index(['SUB PERIODS', 'EDUCATION', 'PROVINCE', 'SEX', 'AGE',
       'MENTAL DISORDER HISTORY', 'SUIC ATTEMPT HISTORY',
       'LIVING WITH SOMEBODY', 'ECONOMIC INCOME', 'DEPRESSION', 'SUIC RISK',
       'ANXIETY STATE', 'ANXIETY TRAIT'],
      dtype='object')

In [5]:
# Vamos a refinar el dataset con las variables que deseamos
variables = ['ANXIETY STATE', 'DEPRESSION', 'SUIC RISK', 'AGE', 'SUIC ATTEMPT HISTORY', 'PROVINCE']

In [6]:
# Refinamos el dataset
df_refinado = df[variables]

In [7]:
# Dataset preparado
df_refinado

Unnamed: 0,ANXIETY STATE,DEPRESSION,SUIC RISK,AGE,SUIC ATTEMPT HISTORY,PROVINCE
0,54,21,37,30,ideation,CABA (Buenos Aires capital)
1,34,26,46,30,ideation,Tierra del Fuego
2,33,8,21,39,no,Jujuy
3,42,27,70,36,no,Jujuy
4,11,1,28,49,no,other
...,...,...,...,...,...,...
1095,51,41,80,28,yes,Córdoba
1096,31,10,19,39,no,Córdoba
1097,27,7,25,22,no,CABA (Buenos Aires capital)
1098,14,6,30,31,no,Córdoba


In [8]:
# Dividimos el dataset, uno para entrenar y otro para testear, 70/30
df_training, df_testing = train_test_split(df_refinado, test_size=0.3, random_state=42)

# Guardamos los datasets!
df_training.to_csv('training_datasetUNC.csv', index=False)
df_testing.to_csv('testing_datasetUNC.csv', index=False)

In [9]:
# Codificar la columna 'PROVINCE' numéricamente
province_label_encoder = LabelEncoder()
all_provinces = pd.concat([df_training['PROVINCE'], df_testing['PROVINCE']])
province_label_encoder.fit(all_provinces)
df_training['COD_PROV'] = province_label_encoder.transform(df_training['PROVINCE'])
df_testing['COD_PROV'] = province_label_encoder.transform(df_testing['PROVINCE'])

In [10]:
# Codificar la columna 'SUIC ATTEMPT HISTORY' numéricamente
suic_attempt_encoder = LabelEncoder()
all_suic_attempt = pd.concat([df_training['SUIC ATTEMPT HISTORY'], df_testing['SUIC ATTEMPT HISTORY']])
suic_attempt_encoder.fit(all_suic_attempt)
df_training['SUIC ATTEMPT_COD'] = suic_attempt_encoder.transform(df_training['SUIC ATTEMPT HISTORY'])
df_testing['SUIC ATTEMPT_COD'] = suic_attempt_encoder.transform(df_testing['SUIC ATTEMPT HISTORY'])

In [16]:
# Agrupar y calcular estadísticas por provincia
grouped_data = df_training.groupby('PROVINCE').median(numeric_only=True)

'''# Renombrar las columnas para mayor claridad
grouped_data.columns = ['median_ANXIETY STATE', 'std_ANXIETY STATE',
                        'median_DEPRESSION', 'std_DEPRESSION',
                        'median_SUIC RISK', 'std_SUIC RISK',
                        'median_AGE', 'std_AGE',
                        'median_SUIC ATTEMPT_COD', 'std_SUIC ATTEMPT_COD',]

# Resetear el índice para tener 'PROVINCE' como columna
grouped_data.reset_index(inplace=True)

# Guardar los resultados en un nuevo archivo CSV
grouped_data.to_csv('estadisticas_por_provincia.csv', index=False)
'''

# Calcular la mediana total de las variables de interés por provincia
grouped_data['median_combined'] = grouped_data.median(axis=1, numeric_only=True)

# Ordenar las provincias por la mediana total combinada
sorted_provinces = grouped_data['median_combined'].sort_values(ascending=False)

In [18]:
sorted_provinces

PROVINCE
San Luis                       32.00
Catamarca                      31.50
Río Negro                      28.00
Formosa                        26.50
Tucumán                        25.50
other                          24.00
Chubut                         23.50
Salta                          23.25
CABA (Buenos Aires capital)    22.50
Santa Fe                       22.00
Buenos Aires provincia         21.75
Tierra del Fuego               21.00
Otro                           20.75
Santiago del Estero            20.50
Mendoza                        20.25
Entre Ríos                     19.50
Jujuy                          18.50
Chaco                          18.50
Neuquén                        18.00
Santa Cruz                     18.00
Corrientes                     17.00
Córdoba                        16.75
Misiones                       16.75
La Pampa                       13.50
San Juan                       13.00
Name: median_combined, dtype: float64

In [11]:
df_training

Unnamed: 0,ANXIETY STATE,DEPRESSION,SUIC RISK,AGE,SUIC ATTEMPT HISTORY,PROVINCE,COD_PROV,SUIC ATTEMPT_COD
221,25,8,23,30,no,Córdoba,6,1
235,26,5,18,57,no,Buenos Aires provincia,0,1
433,54,25,73,22,ideation,Buenos Aires provincia,0,0
599,5,4,7,21,no,Santa Fe,21,1
305,46,14,36,30,ideation,Córdoba,6,0
...,...,...,...,...,...,...,...,...
466,16,4,18,47,no,Tucumán,24,1
121,28,20,51,49,ideation,Córdoba,6,0
1044,60,46,60,31,yes,Córdoba,6,2
1095,51,41,80,28,yes,Córdoba,6,2


In [12]:
# Definir características (X) y etiquetas (y) para entrenamiento y prueba
X_train = df_training.drop(['PROVINCE', 'COD_PROV', 'SUIC ATTEMPT HISTORY', 'SUIC ATTEMPT_COD'], axis=1)
y_train = df_training['COD_PROV']
X_test = df_testing.drop(['PROVINCE', 'COD_PROV', 'SUIC ATTEMPT HISTORY', 'SUIC ATTEMPT_COD'], axis=1)
y_test = df_testing['COD_PROV']

In [13]:
# Crear y entrenar el modelo de regresión logística
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Predecir etiquetas en los datos de prueba
y_pred = model.predict(X_test)

In [16]:
# Obtener las clases únicas de las etiquetas de prueba
unique_classes = df_training['COD_PROV'].unique()

In [18]:
# Evaluar el modelo con etiquetas específicas
print(classification_report(y_test, y_pred, labels=unique_classes, target_names=province_label_encoder.classes_, zero_division=1))

                             precision    recall  f1-score   support

     Buenos Aires provincia       0.24      0.59      0.34        61
CABA (Buenos Aires capital)       0.00      0.00      0.00        67
                  Catamarca       0.33      0.67      0.45        86
                      Chaco       1.00      0.00      0.00         1
                     Chubut       1.00      0.00      0.00         7
                 Corrientes       1.00      0.00      0.00         5
                    Córdoba       1.00      0.00      0.00         3
                 Entre Ríos       1.00      0.00      0.00        35
                    Formosa       1.00      0.00      0.00         4
                      Jujuy       1.00      0.00      0.00         7
                   La Pampa       1.00      0.00      0.00         3
                   La Rioja       1.00      0.00      0.00        13
                    Mendoza       1.00      0.00      0.00         5
                   Misiones      



In [None]:
y = df['SUIC RISK']

In [None]:
df_features = ['DEPRESSION', 'ANXIETY STATE']

In [None]:
x = df[df_features]

In [None]:
x.describe()

Unnamed: 0,DEPRESSION,ANXIETY STATE
count,1100.0,1100.0
mean,15.695455,31.775455
std,11.101032,14.473519
min,0.0,1.0
25%,8.0,21.0
50%,13.0,31.0
75%,22.0,42.0
max,60.0,66.0


In [None]:
x.head()

Unnamed: 0,DEPRESSION,ANXIETY STATE
0,21,54
1,26,34
2,8,33
3,27,42
4,1,11


In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
df_model = DecisionTreeRegressor(random_state=1)

In [None]:
df_model.fit(x, y)

In [None]:
print("Informar el riesgo de suicidio de las siguientes personas")
print(x.head())
print("El riesgo de suicidio es:")
print(df_model.predict(x.head()))

Informar el riesgo de suicidio de las siguientes personas
   DEPRESSION  ANXIETY STATE
0          21             54
1          26             34
2           8             33
3          27             42
4           1             11
El riesgo de suicidio es:
[37. 46. 27. 70. 19.]


In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
pred_suicrisk = df_model.predict(x)

In [None]:
mean_absolute_error(y, pred_suicrisk)

3.245757575757576

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train_x, val_x, train_y, val_y = train_test_split(x, y, random_state = 0)

In [None]:
df_model = DecisionTreeRegressor()

In [None]:
df_model.fit(train_x, train_y)

In [None]:
val_pred = df_model.predict(val_x)

In [None]:
print(mean_absolute_error(val_y, val_pred))

10.65139393939394
