# Agregación de datos por categoría

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Category labels that the synthetic observations are sampled from.
# NOTE: `income` is rebound to a numeric array further down in this notebook,
# so this cell has to be re-run before regenerating the categorical samples.
gender = ["Male", "Female"]
income = ["Poor", "Middle Class", "Rich"]

In [None]:
# Number of synthetic observations to generate.
n = 500

# Draw one random category per observation. The sample size is taken from `n`
# (instead of a second hard-coded 500) so these lists always have the same
# length as the numeric columns that are also built from `n`.
gender_data = np.random.choice(gender, size=n).tolist()
income_data = np.random.choice(income, size=n).tolist()

In [None]:
# Preview a few generated gender labels (positions 1 through 9; position 0 is not shown).
gender_data[1:10]

In [None]:
# Preview a few generated economic-status labels (positions 1 through 9; position 0 is not shown).
income_data[1:10]

In [None]:
# If Z ~ N(0, 1) then m + s * Z ~ N(m, s); `normal` applies that shift and scale directly.
height = np.random.normal(loc=160, scale=30, size=n)
weight = np.random.normal(loc=65, scale=25, size=n)
age = np.random.normal(loc=30, scale=12, size=n)
# Income is uniform on [18000, 21500), i.e. 18000 + 3500 * U with U ~ Uniform[0, 1).
# NOTE(review): unlike the columns above this one is not normally distributed, and
# the name rebinds the earlier `income` list of category labels - confirm both are intended.
income = np.random.uniform(low=18000, high=18000 + 3500, size=n)

In [None]:
# Assemble the synthetic samples into a single table, one column per variable.
data = pd.DataFrame(
    data={
        "Gender": gender_data,
        "Economic Status": income_data,
        "Height": height,
        "Weight": weight,
        "Age": age,
        "Income": income,
    },
)

In [None]:
# Show the first rows of the synthetic table.
data.head()

## Agrupación de datos

In [None]:
# Group the rows by the value of the "Gender" column.
grouped_gender = data.groupby("Gender")

In [None]:
# Mapping from each group key to the index labels of the rows in that group.
grouped_gender.groups

In [None]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; print each pair,
# key first and its rows on the following lines.
for key, frame in grouped_gender:
    print(key, frame, sep="\n")

In [None]:
# Retrieve only the rows whose "Gender" is "Female".
grouped_gender.get_group("Female")

In [None]:
# Group by two keys at once: one group per (Gender, Economic Status) combination.
double_group = data.groupby(["Gender", "Economic Status"])

In [None]:
# Number of distinct (Gender, Economic Status) groups.
len(double_group)

In [None]:
# Each iteration yields the (Gender, Economic Status) key and the sub-frame of
# its rows; print the key first and the rows on the following lines.
for key, frame in double_group:
    print(key, frame, sep="\n")

## Operaciones sobre datos agrupados

In [None]:
# Per-group column totals.
double_group.sum()

In [None]:
# Per-group column means.
double_group.mean()

In [None]:
# Number of rows in each group.
double_group.size()

In [None]:
# Summary statistics for every column, computed separately for each group.
double_group.describe()

In [None]:
# Restrict the grouped object to the "Income" column only.
grouped_income = double_group["Income"]

In [None]:
# Summary statistics of "Income" for each group.
grouped_income.describe()

In [None]:
# Apply a different aggregation to each column, per (Gender, Economic Status) group.
# The aggregations are named by string rather than passed as np.sum / np.mean /
# np.std: pandas currently maps those NumPy callables to its own group methods
# and deprecates passing them (FutureWarning in pandas >= 2.1), so the string
# names keep the same results without the warning.
double_group.aggregate(
    {
        "Income": "sum",
        "Age": "mean",
        "Height": "std",  # pandas sample standard deviation (ddof=1)
    }
)

In [None]:
# Mix a built-in aggregation with a custom one, per group.
# "mean" is given by name instead of np.mean: passing the NumPy callable to a
# groupby aggregation is deprecated (FutureWarning in pandas >= 2.1) and the
# string keeps the same result.
double_group.aggregate(
    {
        "Age": "mean",
        # Mean divided by standard deviation of the group's heights. np.std
        # defaults to ddof=0, so this is the population standard deviation.
        "Height": lambda h: np.mean(h) / np.std(h),
    }
)

In [None]:
# Compute sum, mean and standard deviation of every column for each group.
# String names replace np.sum / np.mean / np.std, whose use in groupby
# aggregations is deprecated (FutureWarning in pandas >= 2.1); the resulting
# column labels ("sum", "mean", "std") and values are the same.
double_group.aggregate(["sum", "mean", "std"])

In [None]:
# Apply one custom aggregation (mean divided by standard deviation) to every
# column of every group. np.std defaults to ddof=0 (population standard deviation).
double_group.aggregate([lambda x: np.mean(x) / np.std(x)])

## Filtrado de datos

In [None]:
# Keep only the "Age" values that belong to groups whose ages add up to more than 2400.
double_group["Age"].filter(lambda x: x.sum()>2400)

## Transformación de variables

In [None]:
def zscore(x):
    """Standardize *x*: subtract its mean and divide by its standard deviation.

    Uses the ``mean()`` and ``std()`` methods of *x*; for a pandas Series,
    ``std()`` is the sample standard deviation (ddof=1).
    """
    return (x - x.mean()) / x.std()

In [None]:
# Standardize every non-key column within its own (Gender, Economic Status) group;
# the result has one row per original row.
z_group = double_group.transform(zscore)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Histogram of the group-standardized ages.
plt.hist(z_group["Age"])

In [None]:
def fill_na_mean(x):
    """Return *x* with its missing values replaced by the mean of *x*.

    Relies on the ``fillna()`` and ``mean()`` methods of *x* (e.g. a pandas
    Series, whose ``mean()`` skips missing values by default).
    """
    return x.fillna(x.mean())

In [None]:
# Within each group, replace missing values of every column with that column's group mean.
double_group.transform(fill_na_mean)

## Operaciones diversas muy útiles

In [None]:
# First row of each group, in the frame's current row order.
double_group.head(1)

In [None]:
# Last row of each group, in the frame's current row order.
double_group.tail(1)

In [None]:
# Row at zero-based position 32 within each group; groups with fewer rows contribute nothing.
double_group.nth(32)

In [None]:
# Row at zero-based position 82 within each group; groups with fewer rows contribute nothing.
double_group.nth(82)

In [None]:
# Sort ascending by age, breaking ties by income.
data_sorted = data.sort_values(["Age", "Income"])

In [None]:
# The ten rows with the lowest ages.
data_sorted.head(10)

In [None]:
# Group the age-sorted rows by gender; rows keep their sorted order inside each group.
age_grouped = data_sorted.groupby("Gender")

In [None]:
# First row per gender in the sorted order, i.e. the lowest age of each gender.
age_grouped.head(1)

In [None]:
# Last row per gender in the sorted order, i.e. the highest age of each gender.
age_grouped.tail(1)

# Conjunto de entrenamiento y conjunto de testing

In [None]:
import pandas as pd

In [None]:
# Load the customer-churn dataset, rebinding `data` (the synthetic table is no longer referenced by it).
# NOTE(review): the path looks like a Colab-mounted Google Drive location - adjust when running elsewhere.
data = pd.read_csv("/content/drive/My Drive/Curso Machine Learning con Python/datasets/customer-churn-model/Customer Churn Model.txt")

In [None]:
# Number of rows in the loaded dataset.
len(data)

## Dividir utilizando la distribución normal

In [None]:
# One standard-normal draw per row of the dataset.
a = np.random.randn(len(data))

In [None]:
# Histogram of the random draws.
plt.hist(a)

In [None]:
# Boolean mask over the rows. For standard-normal draws roughly 77% fall below 0.75,
# so the resulting split is only approximately 75/25.
check = (a<0.75) # This is NOT 75% of the data: it selects the draws whose value is < 0.75!

In [None]:
# Inspect the boolean mask.
check

In [None]:
# Plot how many rows are True vs False in the mask.
# NOTE(review): the original note attributes the need for this cast to "Python 3.7";
# presumably it is a plotting-library change rather than the language - confirm.
plt.hist(check.astype(int))# The boolean mask is cast to int before plotting

In [None]:
# Rows whose draw fell below the threshold form the training set; all remaining rows form the testing set.
training, testing = data[check], data[~check]

In [None]:
# Number of rows assigned to training.
len(training)

In [None]:
# Number of rows assigned to testing.
len(testing)

## Con la librería sklearn

In [None]:
from sklearn.model_selection import train_test_split# Ha cambiado en la 3.7 de Python

In [None]:
# Randomly hold out 20% of the rows for testing. No random_state is given,
# so the split differs from run to run.
train, test = train_test_split(data, test_size = 0.2)

In [None]:
# Number of rows in the training split.
len(train)

In [None]:
# Number of rows in the testing split.
len(test)

## Usando una función de shuffle

In [None]:
import numpy as np

In [None]:
# Show the first rows before shuffling.
data.head()

In [None]:
import sklearn

In [None]:
# Rebind `data` to a copy with its rows in random order.
data = sklearn.utils.shuffle(data)

In [None]:
# Positional split of the shuffled rows: the first 75% train, the remainder test.
cut_id = int(0.75 * len(data))
train_data = data[:cut_id]
# Start the test slice at cut_id (not cut_id + 1): the training slice ends just
# before cut_id, so skipping that position would drop one row from both subsets.
test_data = data[cut_id:]

In [None]:
# Number of rows in the training subset.
len(train_data)

In [None]:
# Number of rows in the testing subset.
len(test_data)