# Preprocesado de datos

In [85]:
import pandas as pd
import numpy as np

In [86]:
# Both locations are relative paths, so they resolve against the kernel's
# working directory.
# Folder holding the raw CSV extracts read by the loading cell.
__RAW_DATA_PATH__ = "../data/raw"
# Folder that receives the preprocessed CSV (default target of export_to_csv).
__INTERIM_DATA_PATH__ = "../data/interim"

## Import data

In [87]:
# Every raw extract carries an "Unnamed: 0" column (presumably a saved index
# from an earlier export -- TODO confirm); it is discarded right after reading.

# Products table
products_df = pd.read_csv(f"{__RAW_DATA_PATH__}/products_df.csv").drop(columns=["Unnamed: 0"])
print(f"* products_df loaded           *  shape: {products_df.shape}")

# Commercial activity table
commercial_activity_df = pd.read_csv(
    f"{__RAW_DATA_PATH__}/commercial_activity_df.csv"
).drop(columns=["Unnamed: 0"])
print(f"* commercial_activity_dfloaded *  shape: {commercial_activity_df.shape}")

# Sociodemographic table
sociodemographic_df = pd.read_csv(
    f"{__RAW_DATA_PATH__}/sociodemographic_df.csv"
).drop(columns=["Unnamed: 0"])
print(f"* sociodemographic_df loaded   *  shape: {sociodemographic_df.shape}")

* products_df loaded           *  shape: (5962924, 17)
* commercial_activity_dfloaded *  shape: (5962924, 6)
* sociodemographic_df loaded   *  shape: (5962924, 8)


## Merge Dataframes

In [88]:
# Both joins match rows on the (pk_cid, pk_partition) key pair; an inner join
# keeps only the keys present on both sides.
merge_keys = ["pk_cid", "pk_partition"]

# Step 1: commercial activity + sociodemographic columns.
_df = commercial_activity_df.merge(sociodemographic_df, how="inner", on=merge_keys)
# Step 2: add the product columns.
df = _df.merge(products_df, how="inner", on=merge_keys)
print(f"* Dataframes merged *  shape: {df.shape}")

* Dataframes merged *  shape: (5962924, 27)


## Data Preprocessing

In [89]:
# Work on an independent deep copy so the merged `df` is not modified by the
# preprocessing steps that follow.
df_prep = df.copy(deep=True)

##### pk_partition

In [90]:
# Parse the partition column into pandas datetime values.
df_prep["pk_partition"] = pd.to_datetime(df_prep["pk_partition"])

##### entry_date

In [91]:
# Parsing entry_date directly raised an error because the data contains
# February 29th for 2015 and 2019, dates that do not exist (neither year is a
# leap year). Those values are moved to the closest valid date (the previous
# day) before the column is converted to datetime.
invalid_leap_days = {
    "2015-02-29": "2015-02-28",
    "2019-02-29": "2019-02-28",
}
df_prep["entry_date"] = pd.to_datetime(df_prep["entry_date"].replace(invalid_leap_days))

##### entry_channel

In [92]:
# Missing entry channels are grouped under the explicit "OTROS" label.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form acts on a temporary object and
# does not update df_prep once pandas Copy-on-Write is active.
df_prep["entry_channel"] = df_prep["entry_channel"].fillna("OTROS")
# dropna=False so any remaining NaN would show up in the counts.
df_prep["entry_channel"].value_counts(dropna=False)

KHE    3113947
KFC     890620
KHQ     590280
KAT     416084
KHK     230197
        ...   
KEJ          8
KHS          5
KDA          2
KFP          2
KDS          1
Name: entry_channel, Length: 69, dtype: int64

##### active_customer

In [93]:
# Store the flag as an integer column. astype(int) raises if the column holds
# NaN, so this assumes active_customer has no missing values.
df_prep["active_customer"] = df_prep["active_customer"].astype(int)

##### segment

In [94]:
# Rows without a segment get the explicit "00 - OTROS" category.
# Assigned back rather than fillna(inplace=True) on the column selection: the
# chained in-place form does not update df_prep under pandas Copy-on-Write.
df_prep["segment"] = df_prep["segment"].fillna("00 - OTROS")
# dropna=False so any remaining NaN would show up in the counts.
df_prep["segment"].value_counts(dropna=False)

03 - UNIVERSITARIO    3900166
02 - PARTICULARES     1830875
00 - OTROS             133944
01 - TOP                97939
Name: segment, dtype: int64

##### region_code

In [95]:
# -1 is a sentinel for rows without a region code; filling first lets the
# column be stored as plain integers.
df_prep["region_code"] = (
    df_prep["region_code"]
    .fillna(-1)
    .astype(int)
)
df_prep["region_code"].value_counts(dropna=False)

 28    1185757
 8      600362
 46     358142
 30     296856
 41     290447
 15     254891
 29     187911
 36     175850
 3      171358
 11     160894
 33     134562
 50     132983
 6      122587
 35     121982
 47     116948
 18     102789
 45     102328
 37      92538
 10      83283
 14      75405
 2       72071
 21      72037
 13      70092
 39      69589
 12      68709
 7       62436
 32      54639
 27      52783
 43      51782
 17      50749
 9       47746
 25      46887
 16      36094
 24      35426
 48      35256
 26      34790
 4       32680
 31      31241
 49      30965
 23      30955
 38      29001
 34      27545
 19      26762
 40      21658
 5       21300
 22      21112
 20      18833
 1       12999
 44      12616
 42       8500
 51       2896
 52       2638
-1        2264
Name: region_code, dtype: int64

##### gender

In [96]:
# Recode gender: missing -> "non-binary", "H" -> "female", "V" -> "male".
# NOTE(review): "non-binary" is used here as the bucket for missing values;
# confirm that label is intended rather than something like "unknown".
# The result is assigned back instead of chaining fillna/replace with
# inplace=True on the column selection, which does not update df_prep under
# pandas Copy-on-Write.
df_prep["gender"] = (
    df_prep["gender"]
    .fillna("non-binary")
    .replace({"H": "female", "V": "male"})
)
df_prep["gender"].value_counts(dropna=False)

female        3087502
male          2875397
non-binary         25
Name: gender, dtype: int64

##### deceased

In [97]:
# Encode the deceased flag as integers: "N" -> 0, "S" -> 1.
# - Assigned back instead of replace(inplace=True) on the column selection,
#   which does not update df_prep under pandas Copy-on-Write.
# - astype(int) makes the integer dtype explicit instead of relying on
#   replace() downcasting the column implicitly; the one-hot encoding step
#   selects columns by dtype, so this column must end up numeric.
#   astype(int) raises if anything other than the two codes (or 0/1) remains.
df_prep["deceased"] = df_prep["deceased"].replace({"N": 0, "S": 1}).astype(int)
df_prep["deceased"].value_counts()

0    5961849
1       1075
Name: deceased, dtype: int64

##### salary

In [98]:
# Missing salaries are replaced by the sentinel -1 (not a real salary value;
# consumers of this column presumably need to treat it as "unknown").
# Assigned back rather than fillna(inplace=True) on the column selection: the
# chained in-place form does not update df_prep under pandas Copy-on-Write.
df_prep["salary"] = df_prep["salary"].fillna(-1)
df_prep["salary"].value_counts(dropna=False)

-1.00         1512103
 451931.22        760
 288997.44        546
 135522.15        466
 128318.52        458
               ...   
 188185.68          1
 37292.10           1
 59427.63           1
 179561.43          1
 71905.29           1
Name: salary, Length: 258630, dtype: int64

##### payroll and payroll_account

In [99]:
# Rows missing either payroll field are removed rather than imputed.
payroll_cols = ["payroll", "payroll_account"]
df_prep = df_prep.dropna(subset=payroll_cols)

# With the NaN rows gone, payroll can be stored as an integer flag.
df_prep["payroll"] = df_prep["payroll"].astype(int)
df_prep["payroll"].value_counts(dropna=False)

0    5757076
1     205787
Name: payroll, dtype: int64

##### pension_plan

In [100]:
# Store pension_plan as an integer column. astype(int) raises on NaN, and
# pension_plan is not part of the dropna subset used for the payroll columns.
# NOTE(review): this presumably relies on its missing rows coinciding with the
# rows already dropped for payroll -- confirm.
df_prep["pension_plan"] = df_prep["pension_plan"].astype(int)

### ONE HOT ENCODING

In [102]:
# Separate deep copy for the encoding stage, so df_prep keeps its original
# (non-encoded) categorical columns.
df_prep_not_null = df_prep.copy(deep=True)

In [103]:
def one_hot_encode(dataframe: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Replace one column with its one-hot indicator columns.

    Args:
        dataframe: Source frame; it is not modified.
        column_name: Name of the column to encode.

    Returns:
        A new DataFrame without ``column_name`` and with one
        ``<column_name>_<value>`` indicator column (0/1, uint8) appended for
        each distinct non-null value of the encoded column.

    Raises:
        KeyError: If ``column_name`` is not a column of ``dataframe``.
    """
    # dtype is pinned explicitly: the get_dummies default changed from uint8
    # to bool in pandas 2.0, which would turn the exported 0/1 indicators into
    # True/False depending on the installed version.
    indicator_columns = pd.get_dummies(
        dataframe[column_name], prefix=column_name, dtype=np.uint8
    )
    remaining_columns = dataframe.drop(columns=[column_name])
    return pd.concat([remaining_columns, indicator_columns], axis=1)

In [104]:
# Every column that is neither numeric nor datetime is treated as categorical
# and one-hot encoded, one column at a time.
non_categorical_kinds = [np.number, "datetime"]
cat_columns = df_prep_not_null.select_dtypes(exclude=non_categorical_kinds).columns.tolist()
print(f"columns to encode: {cat_columns}")

print(f"df_prep_not_null before one-hot encoding: {df_prep_not_null.shape}")
for cat_col in cat_columns:
    # Each pass swaps the categorical column for its indicator columns.
    df_prep_not_null = one_hot_encode(df_prep_not_null, cat_col)

print(f"df_prep_not_null after one-hot encoding:  {df_prep_not_null.shape}")

columns to encode: ['entry_channel', 'segment', 'country_id', 'gender']
df_prep_not_null before one-hot encoding: (5962863, 27)
df_prep_not_null after one-hot encoding:  (5962863, 140)


## Export Dataframe

In [107]:
def export_to_csv(dataframe: pd.DataFrame, filename: str, path: str = __INTERIM_DATA_PATH__):
    """Write ``dataframe`` to ``<path>/<filename>.csv`` and print the location.

    Args:
        dataframe: Frame to export. ``to_csv`` is called with its defaults, so
            the index is written as well.
        filename: File name without the ``.csv`` extension.
        path: Target folder; defaults to the interim data folder.
    """
    destination = f"{path}/{filename}.csv"
    dataframe.to_csv(destination)
    print(f"Dataframe exported to {destination}")

In [108]:
# Persist the encoded frame in the default interim folder as
# easy_money_preprocessed_df.csv.
export_to_csv(df_prep_not_null, "easy_money_preprocessed_df")

Dataframe exported to ../data/interim/easy_money_preprocessed_df.csv
