# Exploración del dataset

## 1. Objetivos:
- Conocer en profundidad la información que contiene el dataset y los tipos de sus columnas.
- Descartar columnas que no aportan valor.
 

In [1]:
import pandas as pd

In [3]:
# Load the shop customer dataset; the first CSV column is used as the index.
DATA_PATH = '../data/shop_customer_data_2022.csv'
df = pd.read_csv(DATA_PATH, index_col=0)

# Preview the first rows of the frame.
df.head()

Unnamed: 0_level_0,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Male,32,15000,39,Healthcare,1,4
2,Male,24,35000,81,Engineer,3,3
3,Female,28,86000,6,Engineer,1,1
4,Female,23,59000,77,Lawyer,0,2
5,Female,31,38000,40,Entertainment,2,6


### Comprobar estado del Dataframe

In [4]:
# Count the missing (NA) values in each column.
na_flags = df.isna()
na_flags.sum()

Gender                     0
Age                        0
Annual Income ($)          0
Spending Score (1-100)     0
Profession                79
Work Experience            0
Family Size                0
dtype: int64

In [5]:
# Number of non-null records in each column.
non_null_counts = df.count()
non_null_counts

Gender                    2000
Age                       2000
Annual Income ($)         2000
Spending Score (1-100)    2000
Profession                1921
Work Experience           2000
Family Size               2000
dtype: int64

### Media del puntaje asignado por la tienda
-  Media general
-  Media de puntaje para mujeres
-  Media de puntaje para hombres

In [6]:
# Overall mean of the spending score across all customers.
spending_score = df['Spending Score (1-100)']
spending_score.mean()

50.9625

In [10]:
# Mean spending score restricted to the rows whose Gender is 'Female'.
is_female = df['Gender'].eq('Female')
dfWomen = df[is_female]
dfWomen['Spending Score (1-100)'].mean()

50.974704890387855

In [9]:
# Mean spending score restricted to the rows whose Gender is 'Male'.
is_male = df['Gender'].eq('Male')
dfMen = df[is_male]
dfMen['Spending Score (1-100)'].mean()

50.94471744471745

### Mediana

In [8]:
# Median of the Age column.
age_values = df['Age']
age_values.median()

49.0

In [7]:
# Median of the annual income column.
annual_income = df['Annual Income ($)']
annual_income.median()

106762.5

### Variables Cualitativas

In [50]:
# Number of records for each gender.
gender_values = df['Gender']
gender_values.value_counts()

Female    1186
Male       814
Name: Gender, dtype: int64

In [11]:
# Number of records for each profession (rows with a missing profession are not counted).
profession_values = df['Profession']
profession_values.value_counts()

Artist           598
Healthcare       328
Entertainment    229
Engineer         176
Doctor           158
Executive        152
Lawyer           138
Marketing         82
Homemaker         60
Name: Profession, dtype: int64

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Age,Annual Income ($),Spending Score (1-100),Work Experience,Family Size
count,2000.0,2000.0,2000.0,2000.0,2000.0
mean,50.209,107601.5205,50.9625,3.9765,3.7685
std,19.920611,48201.610445,27.934661,3.895964,1.970749
min,12.0,0.0,0.0,0.0,1.0
25%,32.0,71558.75,28.0,1.0,2.0
50%,49.0,106762.5,50.0,3.0,4.0
75%,67.0,147590.75,75.0,7.0,5.0
max,95.0,189974.0,100.0,17.0,9.0


### Renombrar columnas

In [13]:
# Shorten the two verbose column names. The result is reassigned to `df`
# instead of mutating with inplace=True, so the cell is a plain
# transformation and stays safe to re-run.
# NOTE(review): "Annual Inconme" looks like a misspelling of "Annual Income";
# it is kept unchanged here because later cells look the column up by this
# exact name. Rename it everywhere in one pass if it is corrected.
df = df.rename(
    columns={
        "Annual Income ($)": "Annual Inconme",
        "Spending Score (1-100)": "Spending Score",
    }
)

### Rango

In [14]:
# Range (max - min) of each variable, printed with its Spanish display label.
range_targets = (
    ("Edad", "Age"),
    ("Ingresos anuales", "Annual Inconme"),
    ("Puntaje de compra", "Spending Score"),
    ("Tamaño de la familia", "Family Size"),
)
for label, column in range_targets:
    spread = df[column].max() - df[column].min()
    print(f"Rango de la variable {label} es: {spread}")

Rango de la variable Edad es: 83
Rango de la variable Ingresos anuales es: 189974
Rango de la variable Puntaje de compra es: 100
Rango de la variable Tamaño de la familia es: 8


### Percentiles

In [15]:
# Variable: Edad
print(f'Percentil 0:   {df["Age"].quantile(0)}')
print(f'Percentil 10:  {df["Age"].quantile(0.1)}')
print(f'Percentil 25:  {df["Age"].quantile(0.25)}')
print(f'Percentil 50:  {df["Age"].quantile(0.5)}') #Median
print(f'Percentil 75:  {df["Age"].quantile(0.75)}')
print(f'Percentil 90:  {df["Age"].quantile(0.9)}')
print(f'Percentil 100: {df["Age"].quantile(1)}')

Percentil 0:   12.0
Percentil 10:  25.0
Percentil 25:  32.0
Percentil 50:  49.0
Percentil 75:  67.0
Percentil 90:  79.0
Percentil 100: 95.0


In [16]:
# Variable: Ingresos Anuales
print(f'Percentil 0:   {df["Annual Inconme"].quantile(0)}')
print(f'Percentil 10:  {df["Annual Inconme"].quantile(0.1)}')
print(f'Percentil 25:  {df["Annual Inconme"].quantile(0.25)}')
print(f'Percentil 50:  {df["Annual Inconme"].quantile(0.5)}') #Median
print(f'Percentil 75:  {df["Annual Inconme"].quantile(0.75)}')
print(f'Percentil 90:  {df["Annual Inconme"].quantile(0.9)}')
print(f'Percentil 100: {df["Annual Inconme"].quantile(1)}')

Percentil 0:   0.0
Percentil 10:  50773.200000000004
Percentil 25:  71558.75
Percentil 50:  106762.5
Percentil 75:  147590.75
Percentil 90:  172766.30000000002
Percentil 100: 189974.0


### Rango Intercuartílico
Diferencia entre el percentil 75 y el percentil 25.

In [17]:
# Variable: Annual income
# Interquartile range: 75th percentile minus 25th percentile.
income = df["Annual Inconme"]
income.quantile(0.75) - income.quantile(0.25)

76032.0