<a href="https://colab.research.google.com/github/Neyder2502/salud-mental-analisis-datos/blob/main/analisis_csv_con_errores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EXPLORACIÓN Y PREPARACIÓN DEL DATASET

## LIBRERIAS

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, linregress
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## ARCHIVO CSV

In [None]:
# Colab-only helper: opens a browser file picker for uploading local files.
from google.colab import files

# `files.upload()` returns a dict keyed by the uploaded file names; the dict is
# empty when the dialog is cancelled or nothing was selected.
uploaded = files.upload()
if not uploaded:
    # Fail with an explicit message instead of an opaque IndexError further down.
    raise ValueError("No se subió ningún archivo; vuelve a ejecutar la celda y selecciona el CSV.")

# Only the first uploaded file is used; any additional uploads are ignored.
filename = next(iter(uploaded))
# NOTE(review): reading by name assumes Colab saved the upload into the
# current working directory under that same name.
df = pd.read_csv(filename)
print("Archivo cargado correctamente")

## ESTRUCTURA Y CALIDAD DE DATOS

In [None]:
# Preview the first rows (pandas default: 5) to sanity-check the loaded data.
df.head()

In [None]:
# Summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Exact column labels as read from the CSV (later cells reference them literally).
df.columns

In [None]:
# Number of missing (NaN) entries per column.
df.isna().sum()

## DETECCIÓN DE VALORES ATÍPICOS (valores inusuales o fuera de rango)



In [None]:
# One box plot per indicator, each drawn and shown as its own figure.
# Maps the column to plot -> the title shown above its figure.
boxplot_titles = {
    'Daily_Screen_Time(hrs)': 'Outliers - Horas de pantalla',
    'Sleep_Quality(1-10)': 'Outliers - Calidad de sueño',
    'Stress_Level(1-10)': 'Outliers - Estrés',
    'Happiness_Index(1-10)': 'Outliers - Felicidad',
}

for column_name, plot_title in boxplot_titles.items():
    df.boxplot(column=[column_name])
    plt.title(plot_title)
    plt.show()

In [None]:
# Report the rows whose value lies outside the accepted interval of each column.
# Each entry: (message to print, column name, lowest accepted, highest accepted).
range_checks = [
    ("Edades fuera de rango:", 'Age', 10, 80),
    ("\nHoras de pantalla fuera de rango:", 'Daily_Screen_Time(hrs)', 0, 24),
    ("\nCalidad de sueño fuera de 1-10:", 'Sleep_Quality(1-10)', 1, 10),
    ("\nEstrés fuera de 1-10:", 'Stress_Level(1-10)', 1, 10),
    ("\nFelicidad fuera de 1-10:", 'Happiness_Index(1-10)', 1, 10),
    ("\nDías sin redes fuera de rango:", 'Days_Without_Social_Media', 0, 30),
]

for message, column_name, lowest, highest in range_checks:
    values = df[column_name]
    print(message)
    # Both bounds are inclusive: only values strictly below/above them are listed.
    display(df[(values < lowest) | (values > highest)])

## LIMPIEZA

### ARREGLOS EN EL GENERO

In [None]:
# Normalise the free-text gender labels: trim surrounding whitespace and
# lowercase them so that spelling variants collapse to a single form.
df['Gender'] = df['Gender'].str.strip().str.lower()

# Map Spanish words and one-letter abbreviations onto 'male' / 'female'.
# Labels that are not listed here are left unchanged.
# No key with surrounding whitespace is needed: .str.strip() above has
# already removed it, so such a key could never match.
# NOTE(review): 'm' is mapped to 'female' (as in "mujer"); confirm the data
# never uses 'm' as an abbreviation for "male"/"masculino".
gender_aliases = {
    'hombre': 'male',
    'h': 'male',
    'm': 'female',
    'f': 'female',
    'mujer': 'female',
}
df['Gender'] = df['Gender'].replace(gender_aliases)

### CORREGIR VALORES FUERA DE RANGO

In [None]:
# Daily screen time outside the 0-24 hour interval is replaced with NaN.
screen_col = 'Daily_Screen_Time(hrs)'
screen_hours = df[screen_col]
invalid_screen_time = (screen_hours < 0) | (screen_hours > 24)
df.loc[invalid_screen_time, screen_col] = np.nan

In [None]:
# Columns compared against the 1-10 interval below.
cols_10 = [
    "Sleep_Quality(1-10)",
    "Stress_Level(1-10)",
    "Happiness_Index(1-10)",
]

# Scores below 1 or above 10 are replaced with NaN, one column at a time.
for scale_col in cols_10:
    scores = df[scale_col]
    off_scale = (scores < 1) | (scores > 10)
    df.loc[off_scale, scale_col] = np.nan

### RELLENAR FALTANTES

In [None]:
# Mean imputation: each numeric column's NaN entries are replaced by that
# column's mean. Columns absent from `column_means` (non-numeric ones) are
# left untouched.
column_means = df.mean(numeric_only=True)
df = df.fillna(value=column_means)

### ELIMINAR DUPLICADOS

In [None]:
# Remove rows that are exact copies of an earlier row across all columns,
# keeping the first occurrence of each (pandas' default policy, spelled out).
df = df.drop_duplicates(keep="first")

### CONVERTIR A TIPO NUMÉRICO

In [None]:
# Columns that must hold numeric values for the analysis.
numeric_cols = [
    "Age","Daily_Screen_Time(hrs)","Sleep_Quality(1-10)",
    "Stress_Level(1-10)","Days_Without_Social_Media",
    "Exercise_Frequency(week)","Happiness_Index(1-10)"
]

# Force a numeric dtype; entries that cannot be parsed become NaN
# (errors="coerce") instead of raising.
numeric_values = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

# This conversion runs after the notebook's mean-imputation step, so any NaN
# created by the coercion would otherwise remain in the cleaned data.
# Fill them with the column mean, the same policy used for the other gaps.
# A column that is entirely NaN has no mean and therefore stays NaN.
df[numeric_cols] = numeric_values.fillna(numeric_values.mean())


## DATASET LIMPIO

In [None]:
# Show the cleaned frame through the notebook's rich display (the cell's last
# expression) rather than print(), which only yields a plain-text dump.
df

## ESTADÍSTICAS Y DISTRIBUCIÓN DE VARIABLES

### Estadística

In [None]:
# Descriptive statistics of the numeric columns, rounded to three decimals
# for readability.
summary_stats = df.describe()
summary_stats.round(3)

### Distribución

In [None]:
# Number of rows per distinct age, most frequent first (NaN excluded by default).
df['Age'].value_counts()

In [None]:
# Number of rows per gender label, most frequent first.
df['Gender'].value_counts()

In [None]:
# Number of rows per distinct daily screen-time value, most frequent first.
df['Daily_Screen_Time(hrs)'].value_counts()

In [None]:
# Number of rows per distinct sleep-quality score, most frequent first.
df['Sleep_Quality(1-10)'].value_counts()

In [None]:
# Number of rows per distinct stress-level score, most frequent first.
df['Stress_Level(1-10)'].value_counts()

In [None]:
# Number of rows per distinct "days without social media" value, most frequent first.
df['Days_Without_Social_Media'].value_counts()

In [None]:
# Number of rows per distinct weekly exercise frequency, most frequent first.
df['Exercise_Frequency(week)'].value_counts()

In [None]:
# Number of rows per social media platform, most frequent first.
df['Social_Media_Platform'].value_counts()

In [None]:
# Number of rows per distinct happiness-index score, most frequent first.
df['Happiness_Index(1-10)'].value_counts()