In [29]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

### 1. INFORMARSE DEL NEGOCIO

In [30]:
# Cargar conjunto de datos
data_cash_request = pd.read_csv("./project_dataset/extract - cash request - data analyst.csv")
data_fee = pd.read_csv("./project_dataset/extract - fees - data analyst - .csv")

In [31]:
# Primeras filas del DataFrame
data_cash_request.head()

Unnamed: 0,id,amount,status,created_at,updated_at,user_id,moderated_at,deleted_account_id,reimbursement_date,cash_request_received_date,money_back_date,transfer_type,send_at,recovery_status,reco_creation,reco_last_update
0,5,100.0,rejected,2019-12-10 19:05:21.596873+00,2019-12-11 16:47:42.40783+00,804.0,2019-12-11 16:47:42.405646+00,,2020-01-09 19:05:21.596363+00,,,regular,,,,
1,70,100.0,rejected,2019-12-10 19:50:12.34778+00,2019-12-11 14:24:22.900054+00,231.0,2019-12-11 14:24:22.897988+00,,2020-01-09 19:50:12.34778+00,,,regular,,,,
2,7,100.0,rejected,2019-12-10 19:13:35.82546+00,2019-12-11 09:46:59.779773+00,191.0,2019-12-11 09:46:59.777728+00,,2020-01-09 19:13:35.825041+00,,,regular,,,,
3,10,99.0,rejected,2019-12-10 19:16:10.880172+00,2019-12-18 14:26:18.136163+00,761.0,2019-12-18 14:26:18.128407+00,,2020-01-09 19:16:10.879606+00,,,regular,,,,
4,1594,100.0,rejected,2020-05-06 09:59:38.877376+00,2020-05-07 09:21:55.34008+00,7686.0,2020-05-07 09:21:55.320193+00,,2020-06-05 22:00:00+00,,,regular,,,,


In [32]:
# Información general del DataFrame
data_cash_request.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23970 entries, 0 to 23969
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          23970 non-null  int64  
 1   amount                      23970 non-null  float64
 2   status                      23970 non-null  object 
 3   created_at                  23970 non-null  object 
 4   updated_at                  23970 non-null  object 
 5   user_id                     21867 non-null  float64
 6   moderated_at                16035 non-null  object 
 7   deleted_account_id          2104 non-null   float64
 8   reimbursement_date          23970 non-null  object 
 9   cash_request_received_date  16289 non-null  object 
 10  money_back_date             16543 non-null  object 
 11  transfer_type               23970 non-null  object 
 12  send_at                     16641 non-null  object 
 13  recovery_status             333

In [33]:
# Estadísticas descriptivas
data_cash_request.describe()

Unnamed: 0,id,amount,user_id,deleted_account_id
count,23970.0,23970.0,21867.0,2104.0
mean,13910.966124,82.720818,32581.250789,9658.755228
std,7788.117214,26.528065,27618.565773,7972.743249
min,3.0,1.0,34.0,91.0
25%,7427.25,50.0,10804.0,3767.0
50%,14270.5,100.0,23773.0,6121.5
75%,20607.75,100.0,46965.0,16345.0
max,27010.0,200.0,103719.0,30445.0


In [34]:
# Contar las opciones distintas en la columna 'status'
def count_status_options(df, column_name):
    status_counts = df[column_name].value_counts()
    return status_counts

print(count_status_options(data_cash_request, 'status'))
print(count_status_options(data_cash_request, 'transfer_type'))
print(count_status_options(data_cash_request, 'recovery_status'))

status
money_back               16397
rejected                  6568
direct_debit_rejected      831
active                      59
transaction_declined        48
direct_debit_sent           34
canceled                    33
Name: count, dtype: int64
transfer_type
instant    13882
regular    10088
Name: count, dtype: int64
recovery_status
completed               2468
pending                  845
pending_direct_debit      16
cancelled                  1
Name: count, dtype: int64


In [35]:
# Para ver el primer valor no nulo de una columna del df
def first_non_null_value(column):
    return column.dropna().iloc[0]

first_val = first_non_null_value(data_cash_request['reco_creation'])

first_val

'2020-06-12 22:27:04.837525+00'

### 2. ANALASIS EXPLORATORIO

In [36]:
# i. Tipo de dato y clasificación
data_cash_request['amount'].dtypes

dtype('float64')

In [38]:
# Clasificación
def clasificacion_tipo(dtypes):
    clasificaciones = {}
    for column, dtype in dtypes.items():
        if np.issubdtype(dtype, np.number):
            clasificaciones[column] = 'nm'  # numérico
        elif dtype == 'object':
            clasificaciones[column] = 'n'   # nominal
        elif dtype == 'int64' and column.lower() == 'indice':
            clasificaciones[column] = 'i'   # índice
        else:
            clasificaciones[column] = 'Desconocido'
    return clasificaciones

clasificacion = clasificacion_tipo(data_cash_request[''].dtypes)
print(clasificacion)

KeyError: ''

In [None]:
# ii. Estadística descriptiva
estadisticas_descriptivas = data_cash_request['amount'].describe(include='all')
print(estadisticas_descriptivas)

count    23970.000000
mean        82.720818
std         26.528065
min          1.000000
25%         50.000000
50%        100.000000
75%        100.000000
max        200.000000
Name: amount, dtype: float64


In [None]:
# iii. Funciones de distribución
# Ejemplo con una variable numérica
numerical_data = data_cash_request['amount']

# Calcular y mostrar una distribución normal
mu, std = stats.norm.fit(numerical_data)
xmin, xmax = min(numerical_data), max(numerical_data)
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, std)

plt.hist(numerical_data, bins=5, density=True, alpha=0.6, color='g')
plt.plot(x, p, 'k', linewidth=2)
title = f"Fit results: mu = {mu:.2f},  std = {std:.2f}"
plt.title(title)
plt.show()

NameError: name 'plt' is not defined