In [1]:
import pandas as pd


In [2]:
demo = pd.read_csv("df_final_demo.txt", delimiter=",") 

In [3]:
demo.head()

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0


In [4]:
print(demo.shape)

(70609, 9)


demo.columns = (
    demo.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("-", "_")
)

In [5]:
demo.isnull().sum()

client_id            0
clnt_tenure_yr      14
clnt_tenure_mnth    14
clnt_age            15
gendr               14
num_accts           14
bal                 14
calls_6_mnth        14
logons_6_mnth       14
dtype: int64

In [14]:
#Check duplicates client_id   --------------------------   CLIENT_ID -------------------------
duplicated = demo['client_id'].duplicated().sum()
print(f"Duplicates clients: {duplicated}")

Duplicates clients: 0


In [15]:
#Cleaning the clnt_tenure_yr column ----------------------- clnt_tenure_yr--------------------

# Convert to integers and correct out-of-range values (0–50)
demo['clnt_tenure_yr'] = demo['clnt_tenure_yr'].round().astype('Int64')
demo.loc[~demo['clnt_tenure_yr'].between(0, 50), 'clnt_tenure_yr'] = pd.NA

In [16]:
demo['clnt_tenure_yr'].max()

np.int64(50)

In [18]:
#Cleaing the clnt_tenure_mnth column ----------------------------clnt_tenure_mnth-----------------------------------------

# Round and convert to integers
demo['clnt_tenure_mnth'] = demo['clnt_tenure_mnth'].round().astype('Int64')

# If there are months >= 12, convert the excess to years
overflow = demo['clnt_tenure_mnth'] // 12
demo['clnt_tenure_yr'] += overflow
demo['clnt_tenure_mnth'] = demo['clnt_tenure_mnth'] % 12

In [20]:
#Cleaning the clnt_age column -------------------------------clnt_age------------------------------------

# Fill the single null with the median
median_age = demo['clnt_age'].median()
demo['clnt_age'] = demo['clnt_age'].fillna(median_age).round().astype('Int64')

# Nullify ages outside of 18–100
demo.loc[~demo['clnt_age'].between(18, 100), 'clnt_age'] = pd.NA

In [22]:
#Cleaning the gender column --------------------------------------gendr-----------------------------------------
#use dictionary for map
map_gender = {
    'm': 'male',
    'male': 'male',
    'f': 'female',
    'female': 'female'
}
#convert to object and nulls are unknow:
demo['gendr'] = (
    demo['gendr']
        .astype(str)
        .str.strip()
        .str.lower()
        .map(map_gender)
        .fillna('unknown')
)

In [23]:
print(demo)

       client_id  clnt_tenure_yr  clnt_tenure_mnth  clnt_age    gendr  \
0         836976              12                 1        60  unknown   
1        2304905              14                10        58  unknown   
2        1439522              10                 4        32  unknown   
3        1562045              32                 6        49     male   
4        5126305              24                 1        33   female   
...          ...             ...               ...       ...      ...   
70604    7993686               8                 8        38  unknown   
70605    8981690              24                 4        31     male   
70606     333913              32                 6        62   female   
70607    1573142              42                 3        68     male   
70608    5602139              42                 2        60   female   

       num_accts         bal  calls_6_mnth  logons_6_mnth  
0            2.0    45105.30           6.0            9.0  
1  

In [24]:
## The data look very clean. We need only to change the datatype of columns: num_accts, calls_6_mnth, logons_6_mnth
import numpy as np
demo["num_accts"] = demo["num_accts"].fillna(0).astype(int)
demo['num_accts'] = demo['num_accts'].apply(np.int64)
demo["calls_6_mnth"] = demo["calls_6_mnth"].fillna(0).astype(int)
demo['calls_6_mnth'] = demo['calls_6_mnth'].apply(np.int64)
demo["logons_6_mnth"] = demo["logons_6_mnth"].fillna(0).astype(int)
demo['logons_6_mnth'] = demo['logons_6_mnth'].apply(np.int64)

In [25]:
print(demo)

       client_id  clnt_tenure_yr  clnt_tenure_mnth  clnt_age    gendr  \
0         836976              12                 1        60  unknown   
1        2304905              14                10        58  unknown   
2        1439522              10                 4        32  unknown   
3        1562045              32                 6        49     male   
4        5126305              24                 1        33   female   
...          ...             ...               ...       ...      ...   
70604    7993686               8                 8        38  unknown   
70605    8981690              24                 4        31     male   
70606     333913              32                 6        62   female   
70607    1573142              42                 3        68     male   
70608    5602139              42                 2        60   female   

       num_accts         bal  calls_6_mnth  logons_6_mnth  
0              2    45105.30             6              9  
1  

In [30]:
from pathlib import Path


Path("data/interim").mkdir(parents=True, exist_ok=True)

demo.to_csv("data/interim/demo_clean.csv", index=False)

print("✔️ Archivo guardado en data/interim/demo_clean.csv")

✔️ Archivo guardado en data/interim/demo_clean.csv
