In [None]:
import pandas as pd
from sklearn import preprocessing
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [1042]:
def meta_data_info(churnDataFrame: pd.DataFrame) -> None:
    """Print a short structural summary of the frame.

    Three lines are written to stdout: the number of columns, the number of
    rows, and the column labels rendered as a Python list.

    Args:
        churnDataFrame: Frame to describe. It is only read, never modified.
    """
    columnLabels = list(churnDataFrame.columns)
    summaryLines = (
        f'Column Count: {len(columnLabels)}',
        f'Row Count: {len(churnDataFrame)}',
        f'Data Columns: {columnLabels}',
    )
    for summaryLine in summaryLines:
        print(summaryLine)

In [1043]:
# Remove Error Rows

def remove_errors(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose ``avg_frequency_login_days`` is the literal ``'Error'``.

    Prints how many rows matched out of the original row count.

    Args:
        churnDataFrame: Frame containing an ``avg_frequency_login_days`` column.

    Returns:
        The rows that did not match, with their original index labels.
    """
    loginFrequency = churnDataFrame['avg_frequency_login_days']
    totalRows = len(churnDataFrame)
    droppedRows = len(churnDataFrame[loginFrequency == 'Error'])
    keptRows = churnDataFrame[loginFrequency != 'Error']
    print(f'{droppedRows} Errors Rows Removed out of {totalRows}')
    return keptRows

In [1044]:
# Duplicate removal function

def remove_duplicates(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Keep only the first row for each ``security_no`` value.

    Prints the number of rows that were dropped.

    Args:
        churnDataFrame: Frame containing a ``security_no`` column.

    Returns:
        The de-duplicated frame (first occurrence of each ``security_no`` kept).
    """
    deduplicated = churnDataFrame.drop_duplicates(subset='security_no', keep='first')
    droppedCount = len(churnDataFrame) - len(deduplicated)
    print(f"Number of duplicates removed: {droppedCount}")
    return deduplicated

In [1045]:
# Churn and Non Churn Distribution

def churn_distrubtion(churnDataFrame: pd.DataFrame) -> None:
    """Print the percentage split of ``churn_risk_score`` as a one-row table.

    The share of each distinct score is computed with
    ``value_counts(normalize=True)``, scaled to percent and rounded to two
    decimals. Score ``1`` is labelled "Percentage At Risk" and score ``0``
    "Percentage Not At Risk"; any other score keeps its raw value as the
    column header.

    Args:
        churnDataFrame: Frame containing a ``churn_risk_score`` column.
    """
    classShares = churnDataFrame['churn_risk_score'].value_counts(normalize=True)
    percentTable = (classShares * 100).round(2).to_frame().T
    # The "at risk" header was previously misspelled in the printed report.
    headerLabels = {1: 'Percentage At Risk', 0: 'Percentage Not At Risk'}
    print(percentTable.rename(columns=headerLabels))

In [1046]:
def numeric_transform(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Coerce the numeric columns to clean dtypes and impute far outliers.

    Integer-like columns (``days_since_last_login``, ``age``,
    ``avg_frequency_login_days``) are parsed with ``pd.to_numeric`` (unparseable
    entries become missing), made non-negative with ``abs``, rounded to whole
    numbers and stored as nullable ``Int64``. Float columns (``avg_time_spent``,
    ``avg_transaction_value``, ``points_in_wallet``) get the same treatment but
    are rounded to two decimals and stored as ``float64``.

    After conversion, values lying more than four times the 10th-90th
    percentile spread outside that percentile band are replaced by the median
    of the remaining values. Progress is reported on stdout.

    Args:
        churnDataFrame: Frame containing all six columns listed above. The
            columns are overwritten on this frame.

    Returns:
        The same frame object that was passed in.
    """
    intColsTransform = ['days_since_last_login', 'age', 'avg_frequency_login_days']
    floatColsTransform = ['avg_time_spent', 'avg_transaction_value', 'points_in_wallet']

    for col in intColsTransform:
        print(f'Processing Integer values in "{col}"...')
        parsed = pd.to_numeric(churnDataFrame[col], errors='coerce')
        churnDataFrame[col] = abs(parsed).round(0).astype('Int64')
        _impute_far_outliers(churnDataFrame, col, medianAsInteger=True)

    print("-------------------")

    for col in floatColsTransform:
        print(f'Processing float values in "{col}"...')
        parsed = pd.to_numeric(churnDataFrame[col], errors='coerce')
        churnDataFrame[col] = abs(parsed).round(2).astype('float64')
        _impute_far_outliers(churnDataFrame, col)

    return churnDataFrame


def _impute_far_outliers(churnDataFrame: pd.DataFrame, col: str, medianAsInteger: bool = False) -> None:
    """Overwrite far outliers in ``col`` with the median of the other values, in place."""
    values = churnDataFrame[col]

    # The band is built from the 10th and 90th percentiles (not quartiles) and
    # widened by four times their spread, so only extreme values are flagged.
    lowPercentile = values.quantile(0.10)
    highPercentile = values.quantile(0.90)
    spread = highPercentile - lowPercentile
    lower_bound = lowPercentile - 4 * spread
    upper_bound = highPercentile + 4 * spread

    # Missing entries compare as unknown on nullable columns; treat them as
    # "not an outlier" and use a plain positional mask so duplicate index
    # labels cannot pull extra rows in.
    outsideBand = (values < lower_bound) | (values > upper_bound)
    isFarOutlier = outsideBand.fillna(False).astype(bool).to_numpy()
    far_outliers = values[isFarOutlier]

    if far_outliers.empty:
        print(f'No far outliers found in {col}...')
        return

    print(f'{len(far_outliers)} far outliers found in {col}: {sorted(set(far_outliers))}...')
    medianVal = values[~isFarOutlier].median()
    if medianAsInteger and not pd.isna(medianVal):
        # A median over an even number of integers can end in .5, which a
        # nullable Int64 column cannot store; round it to a whole number.
        medianVal = int(round(medianVal))
    churnDataFrame.loc[isFarOutlier, col] = medianVal
    print(f'The median value of {medianVal} imputed for all outliers')

In [1047]:
def process_nulls(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Report and fill missing values with simple constant imputation.

    For every column that has at least one missing value the count is printed.
    Missing values are then filled as follows:

    * object-dtype columns (other than the yes/no columns below) -> ``'Unknown'``
    * ``int64``/``float64`` columns -> ``0``, then rounded to two decimals
    * ``used_special_discount``, ``offer_application_preference`` and
      ``past_complaint`` -> ``'No'``; any of these that is absent from the
      frame is skipped

    Args:
        churnDataFrame: Frame to impute. Columns are overwritten on this frame.

    Returns:
        The same frame object that was passed in.
    """
    # Count missing values once and reuse the result for reporting and selection.
    nullCounts = churnDataFrame.isna().sum()
    missingDataCols = list(nullCounts[nullCounts > 0].index)
    for col in missingDataCols:
        print(f"{col} contains: {nullCounts[col]} null values")

    # Yes/no style columns get their own default; only touch the ones that
    # actually exist so a reduced input frame does not raise KeyError.
    yesNoCandidates = ['used_special_discount', 'offer_application_preference', 'past_complaint']
    boolCols = [col for col in yesNoCandidates if col in churnDataFrame.columns]

    # Split the remaining columns with missing data by dtype family.
    stringCols = [
        col for col in churnDataFrame.select_dtypes(include='object').columns
        if col in missingDataCols and col not in boolCols
    ]
    numericCols = [
        col for col in churnDataFrame.select_dtypes(include=['int64', 'float64']).columns
        if col in missingDataCols
    ]

    # Constant imputation per family.
    for col in stringCols:
        churnDataFrame[col] = churnDataFrame[col].fillna('Unknown')
    for col in numericCols:
        churnDataFrame[col] = churnDataFrame[col].fillna(0).round(2)
    for col in boolCols:
        churnDataFrame[col] = churnDataFrame[col].fillna('No')
    print('Null records processed...')
    return churnDataFrame

In [1048]:
def date_transformation(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Split ``joining_date`` into ``join_year``, ``join_month`` and ``join_day``.

    The column is parsed with ``pd.to_datetime``; the three components are
    stored as nullable ``Int64`` and ``joining_date`` itself is removed from
    the result. A status line is printed either way.

    Args:
        churnDataFrame: Input frame. It is left unmodified.

    Returns:
        A new frame with the derived columns when ``joining_date`` is present;
        otherwise the input frame itself, unchanged.
    """
    if 'joining_date' not in churnDataFrame.columns:
        # Report the column name that was actually looked up.
        print('joining_date field not in data...')
        return churnDataFrame

    joinedOn = pd.to_datetime(churnDataFrame['joining_date'])
    # Build the result on a fresh frame so the caller's frame is not left
    # half-transformed (derived columns added but source column still there).
    transformed = churnDataFrame.drop(columns=['joining_date']).assign(
        join_year=joinedOn.dt.year.astype('Int64'),
        join_month=joinedOn.dt.month.astype('Int64'),
        join_day=joinedOn.dt.day.astype('Int64'),
    )
    print('joining_date field transformed...')
    return transformed

def time_transformation(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Split ``last_visit_time`` into hour, minute and second columns.

    The column is parsed with ``pd.to_datetime`` using the ``%H:%M:%S`` format;
    ``last_visit_hour``, ``last_visit_min`` and ``last_visit_sec`` are stored as
    nullable ``Int64`` and ``last_visit_time`` itself is removed from the
    result. A status line is printed either way.

    Args:
        churnDataFrame: Input frame. It is left unmodified.

    Returns:
        A new frame with the derived columns when ``last_visit_time`` is
        present; otherwise the input frame itself, unchanged.
    """
    if 'last_visit_time' not in churnDataFrame.columns:
        print('last_visit_time field not in data...')
        return churnDataFrame

    visitedAt = pd.to_datetime(churnDataFrame['last_visit_time'], format='%H:%M:%S')
    # Build the result on a fresh frame so the caller's frame is not left
    # half-transformed (derived columns added but source column still there).
    transformed = churnDataFrame.drop(columns=['last_visit_time']).assign(
        last_visit_hour=visitedAt.dt.hour.astype('Int64'),
        last_visit_min=visitedAt.dt.minute.astype('Int64'),
        last_visit_sec=visitedAt.dt.second.astype('Int64'),
    )
    print('last_visit_time field transformed...')
    return transformed

In [1049]:
def special_characters(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Replace placeholder values made of ``!``, ``?`` or ``#`` characters.

    Every column whose string form contains one of those characters is
    processed:

    * ``joined_through_referral`` (when ``referral_id`` is available) is
      rebuilt for every row from ``referral_id``: ``'No'`` where the id is
      ``'xxxxxxxx'``, ``'Yes'`` otherwise.
    * In any other column, cells consisting solely of those characters
      (``'?'``, ``'??'``, ``'#'`` ...) become ``'Unknown'``. Cells that merely
      contain one of them alongside other text, empty strings and missing
      values are left as they are.

    The selected columns and, per column, the number of special characters
    found are printed.

    Args:
        churnDataFrame: Frame to clean. Columns are overwritten on this frame.

    Returns:
        The same frame object that was passed in.
    """
    specialChars = r'[!?#]'
    # Matches a whole cell that is nothing but special characters.
    placeholderOnly = rf'(?:{specialChars})+'

    cols_with_special = [
        col for col in churnDataFrame.columns
        if churnDataFrame[col].astype(str).str.contains(specialChars, regex=True, na=False).any()]
    print(f'Columns containing special character values: {cols_with_special}')

    for col in cols_with_special:
        # Work on the string form so non-string cells (e.g. missing values)
        # cannot break the string accessors.
        asText = churnDataFrame[col].astype(str)
        SpecialCharCount = int(asText.str.count(specialChars).sum())
        if col == 'joined_through_referral' and 'referral_id' in churnDataFrame.columns:
            # NOTE(review): this overwrites every row, including rows that
            # already held a clean 'Yes'/'No' -- same outcome as the previous
            # row-wise rule; confirm that deriving the flag purely from
            # referral_id is intended.
            hasNoReferral = churnDataFrame['referral_id'].astype(str) == 'xxxxxxxx'
            churnDataFrame[col] = hasNoReferral.map({True: 'No', False: 'Yes'})
        else:
            isPlaceholder = asText.str.fullmatch(placeholderOnly, na=False)
            churnDataFrame[col] = churnDataFrame[col].mask(isPlaceholder, 'Unknown')
        print(f'Processed {SpecialCharCount} records of special charcters in "{col}" column.....')
    return churnDataFrame

In [1050]:
def feedback_transform(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Encode the free-text ``feedback`` column as integer codes.

    Each mapping entry is printed first. The column is then stripped of
    surrounding whitespace, known phrases are replaced by their code
    (``1``, ``0`` or ``3``), unknown values are passed through unchanged, and
    the column is cast to nullable ``Int64``.

    Args:
        churnDataFrame: Frame containing a ``feedback`` column. The column is
            overwritten on this frame.

    Returns:
        The same frame object that was passed in.
    """
    feedBackMapping = {
        'Products always in Stock': 1,
        'User Friendly Website': 1,
        'Poor Customer Service': 0,
        'Poor Product Quality': 0,
        'Reasonable Price': 1,
        'Quality Customer Care': 1,
        'Too many ads': 0,
        'Poor Website': 0,
        'No reason specified': 3,
    }
    for phrase, code in feedBackMapping.items():
        print(f'Apply the following mapping: {phrase} : {code}')

    target = 'feedback'
    churnDataFrame[target] = churnDataFrame[target].str.strip()
    # dict.get with the value itself as fallback leaves unmapped entries untouched.
    churnDataFrame[target] = churnDataFrame[target].map(lambda value: feedBackMapping.get(value, value))
    churnDataFrame[target] = churnDataFrame[target].astype('Int64')
    return churnDataFrame

In [1051]:
def label_encode(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Label-encode every object-dtype column with scikit-learn.

    For each object column the distinct values are printed before and after
    encoding. A fresh ``LabelEncoder`` is fitted per column and is not kept.

    Args:
        churnDataFrame: Frame to encode. Columns are overwritten on this frame.

    Returns:
        The same frame object that was passed in.
    """
    # Snapshot the column names first: dtypes change as the loop rewrites columns.
    objectColumns = churnDataFrame.select_dtypes(include='object').columns.tolist()
    for col in objectColumns:
        print(f'Label Encoding values in {col}...')
        print(f'Unique values in {col}:{list(set(churnDataFrame[col]))}')
        encoder = preprocessing.LabelEncoder()
        encodedValues = encoder.fit_transform(churnDataFrame[col])
        churnDataFrame[col] = encodedValues
        print(f'Categorical values mapped in {col} to integer values: {list(set(churnDataFrame[col]))}')
    return churnDataFrame

In [1052]:
def remove_features(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """Drop the ``security_no`` and ``referral_id`` columns.

    Args:
        churnDataFrame: Frame containing both columns.

    Returns:
        A frame without those two columns.
    """
    droppedColumns = ['security_no', 'referral_id']
    trimmed = churnDataFrame.drop(columns=droppedColumns)
    print('Removed "security_no" and "referral_id" columns.....')
    return trimmed

In [1053]:
def execute_transform_pipeline(rawFileName):
    """Read a raw churn CSV and run every cleaning stage over it, in order.

    Each stage prints a banner, runs its step(s) and prints a blank line.
    Reporting stages only inspect the frame; transforming stages replace the
    working frame with whatever the step returns.

    Args:
        rawFileName: Path (or anything ``pd.read_csv`` accepts) of the raw file.

    Returns:
        The frame produced by the final stage (label encoding).
    """
    data = pd.read_csv(rawFileName)
    print('Executing Transformation Pipeline...')
    print()

    # (banner, steps run under that banner, whether the steps return a new frame)
    stages = (
        ("----- Raw Churn Meta Data -----", (meta_data_info,), False),
        ("----- Removing Errenous Rows -----", (remove_errors,), True),
        ("----- Removing Duplicate Rows -----", (remove_duplicates,), True),
        ("----- Distribution (%) of Customers at risk of Churn -----", (churn_distrubtion,), False),
        ("----- Transform Numeric Values -----", (numeric_transform,), True),
        ("----- Impute Values for Null Records -----", (process_nulls,), True),
        ("----- Transform Date and Time Fields -----", (date_transformation, time_transformation), True),
        ("----- Impute Values for Special Characters -----", (special_characters,), True),
        ("----- Transform Feedback Column -----", (feedback_transform,), True),
        ("----- Removing Unnecessary Features -----", (remove_features,), True),
        ("----- Encode Categorical Variables -----", (label_encode,), True),
    )

    for banner, steps, returnsFrame in stages:
        print(banner)
        for step in steps:
            outcome = step(data)
            if returnsFrame:
                data = outcome
        print()

    return data

In [1054]:
# Discover the raw CSV files in the working directory. Sorting fixes the
# processing order, so which frame ends up in `churn_clean` no longer depends
# on the order the filesystem happens to list files in.
csvFiles = sorted(str(file) for file in Path('.').glob('*.csv'))
if not csvFiles:
    # Fail loudly instead of hitting an undefined (or stale, from an earlier
    # kernel run) `churn_clean` further down.
    raise FileNotFoundError("No '*.csv' files found in the current working directory.")

# Keep every cleaned frame, keyed by file name, so earlier files are not
# silently discarded when more than one CSV is present.
cleanedFrames = {}
for fileName in csvFiles:
    cleanedFrames[fileName] = execute_transform_pipeline(fileName)

# `churn_clean` stays the result for the last file processed, as before.
churn_clean = cleanedFrames[csvFiles[-1]]
churn_clean

Executing Transformation Pipeline...

----- Raw Churn Meta Data -----
Column Count: 23
Row Count: 36992
Data Columns: ['age', 'gender', 'security_no', 'region_category', 'membership_category', 'joining_date', 'joined_through_referral', 'referral_id', 'preferred_offer_types', 'medium_of_operation', 'internet_option', 'last_visit_time', 'days_since_last_login', 'avg_time_spent', 'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount', 'offer_application_preference', 'past_complaint', 'complaint_status', 'feedback', 'churn_risk_score']

----- Removing Errenous Rows -----
3522 Errors Rows Removed out of 36992

----- Removing Duplicate Rows -----
Number of duplicates removed: 0

----- Distribution (%) of Customers at risk of Churn -----
churn_risk_score  Percenrage At Risk  Percentage Not At Risk
proportion                     54.07                   45.93

----- Transform Numeric Values -----
Processing Integer values in "days_since_last_login"...
1

Unnamed: 0,age,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,days_since_last_login,avg_time_spent,...,past_complaint,complaint_status,feedback,churn_risk_score,join_year,join_month,join_day,last_visit_hour,last_visit_min,last_visit_sec
0,18,0,3,3,0,1,3,2,17,300.63,...,0,1,1,0,2017,8,17,16,8,2
1,32,0,0,4,1,1,1,1,16,306.34,...,1,2,1,0,2017,8,28,12,38,13
2,44,0,1,2,1,1,1,2,14,516.16,...,1,3,0,1,2016,11,11,22,53,21
3,37,1,0,2,1,1,1,1,11,53.27,...,1,4,0,1,2016,10,29,15,57,50
4,31,0,0,2,0,0,2,1,20,113.13,...,1,2,0,1,2017,9,12,15,46,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36985,12,0,3,4,0,1,1,0,13,418.38,...,1,4,1,0,2016,10,25,3,30,17
36986,27,1,1,3,1,0,1,1,13,135.83,...,0,1,3,0,2015,9,7,5,29,19
36987,46,0,2,0,0,0,1,2,2,650.68,...,1,0,3,1,2017,9,21,4,14,5
36988,29,0,1,0,0,3,2,2,13,638.12,...,0,1,0,1,2016,6,27,23,18,31
