In [1]:
import pandas as pd
from rapidfuzz import process # for categorical cleaning

# Read the customer churn table from the project's data directory
df = pd.read_csv(filepath_or_buffer="../data/clean_customer_churn.csv")

# Print the schema summary, then show the first five rows as the cell output
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
 11  id               10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,id
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,2
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,3
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0,4
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,5


In [2]:
# Rebuild a clean 0..n-1 index and derive a 1-based customer counter from it
df = (
    df
    .reset_index(drop=True)
    .assign(Customer_index=lambda frame: frame.index + 1)
)

In [3]:
# Missing values per column, columns with the most gaps listed first
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
id                 0
Customer_index     0
dtype: int64

In [4]:
def standardized_text(x):
    """Normalize a categorical label to trimmed Title Case.

    Parameters
    ----------
    x : object
        Raw cell value. Non-missing values are coerced with ``str``.

    Returns
    -------
    object
        The stripped, title-cased string, or ``x`` itself when it is a
        missing scalar (None, NaN, pd.NA, NaT).
    """
    # Pass missing values through untouched: str(nan).title() would yield the
    # literal text "Nan", hiding the gap from any later isna() check.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return x
    return str(x).strip().title()

def autocorrect(x, valid_values, threshold=80):
    """Replace ``x`` with its closest entry in ``valid_values`` when similar enough.

    Parameters
    ----------
    x : object
        Value to look up.
    valid_values : iterable
        Candidate values passed to ``process.extractOne``.
    threshold : int, default 80
        Minimum score the best candidate must reach to be used.

    Returns
    -------
    object
        The best-scoring candidate if its score is at least ``threshold``;
        otherwise ``x`` unchanged.
    """
    best = process.extractOne(x, valid_values)
    if not best:
        # No candidate was returned, so leave the value as it is
        return x
    candidate, score = best[0], best[1]
    return candidate if score >= threshold else x

# Expected categories for every text column that gets cleaned
cleaning_rules = dict(
    Geography=["France", "Spain", "Germany"],
    Gender=["Male", "Female"],
)

# Normalize each configured column, then snap near-misses onto its expected values
for col, valid_vals in cleaning_rules.items():
    normalized = df[col].apply(standardized_text)
    df[col] = normalized.apply(lambda x: autocorrect(x, valid_vals))

In [5]:
# Summary statistics for the continuous / count-like columns
numeric_cols = ["Age", "Tenure", "CreditScore", "Balance", "EstimatedSalary"]
df[numeric_cols].describe()

Unnamed: 0,Age,Tenure,CreditScore,Balance,EstimatedSalary
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,38.9218,5.0128,650.5288,76485.889288,100090.239881
std,10.487806,2.892174,96.653299,62397.405202,57510.492818
min,18.0,0.0,350.0,0.0,11.58
25%,32.0,3.0,584.0,0.0,51002.11
50%,37.0,5.0,652.0,97198.54,100193.915
75%,44.0,7.0,718.0,127644.24,149388.2475
max,92.0,10.0,850.0,250898.09,199992.48


In [6]:
# Age band
# Bins are left-closed ([18, 25), [25, 35), ...) so every label matches its range.
# With pd.cut's default right-closed bins, age 18 fell outside (18, 25] and became
# NaN, and boundary ages such as 25 were placed in the band below ("18-24").
# The last bin is open-ended so "65+" has no upper limit; ages under 18 stay NaN.
df["Age_band"] = pd.cut(
    df["Age"],
    bins=[18, 25, 35, 45, 55, 65, float("inf")],
    right=False,
    labels=["18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
)

# Tenure band
# Right-closed intervals: the -1 lower edge lets a tenure of 0 fall into "0-1"
tenure_edges = [-1, 1, 3, 5, 7, 10]
tenure_labels = ["0-1", "2-3", "4-5", "6-7", "8-10"]
df["Tenure_band"] = pd.cut(
    df["Tenure"], bins=tenure_edges, labels=tenure_labels, right=True
)

# Balance flag: 1 when the account holds a positive balance, else 0
has_balance = df["Balance"].gt(0)
df["Balance_flag"] = has_balance.astype(int)

# Multi-product flag: 1 when the customer holds two or more products, else 0
holds_multiple = df["NumOfProducts"].ge(2)
df["Multi_product_flag"] = holds_multiple.astype(int)

In [7]:
# Persist the cleaned and feature-enriched table without the index column.
# NOTE(review): this is the same path the notebook loads from, so the input file
# is overwritten — confirm that is intended.
df.to_csv(path_or_buf="../data/clean_customer_churn.csv", index=False)