In [1]:
# Import libraries
import pandas as pd
import os
from sklearn import preprocessing
from pathlib import Path
import uuid
import warnings
warnings.filterwarnings('ignore')

In [2]:
def meta_data_info(churnDataFrame: pd.DataFrame) -> None:
    """
    Prints metadata of the raw churn data set: the column count, the row count
    and the per-column summary produced by ``DataFrame.info``.
    :param churnDataFrame: churn data set.
    :returns: None.
    """
    print(f'Column Count: {len(churnDataFrame.columns)}')
    print(f'Row Count: {len(churnDataFrame)}')
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() would append a stray "None" line to the log.
    churnDataFrame.info()

In [3]:
def remove_errors(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Drops every row whose "avg_frequency_login_days" value is the literal string
    'Error' and logs how many rows were dropped.
    :param churnDataFrame: churn data set.
    :returns: churn data set without the 'Error' rows (original index labels kept).
    """
    loginFrequency = churnDataFrame['avg_frequency_login_days']
    totalRows = churnDataFrame.shape[0]
    removedRows = int(loginFrequency.eq('Error').sum())
    keptRows = churnDataFrame[loginFrequency.ne('Error')]
    print(f'{removedRows} Errors Rows Removed out of {totalRows}')
    return keptRows

In [4]:
def remove_duplicates(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Keeps only the first row for each "security_no" value and logs how many
    repeated rows were discarded.
    :param churnDataFrame: churn data set.
    :returns: churn data set with repeated "security_no" rows removed.
    """
    isRepeat = churnDataFrame.duplicated(subset='security_no', keep='first')
    dedupedFrame = churnDataFrame[~isRepeat]
    numDuplicates = churnDataFrame.shape[0] - dedupedFrame.shape[0]
    print(f"Number of duplicates removed: {numDuplicates}")
    return dedupedFrame

In [5]:
def churn_distrubtion(churnDataFrame: pd.DataFrame) -> None:
    """
    Prints the percentage split of the "churn_risk_score" values (1 = at risk,
    0 = not at risk) as a one-row table. When the column is absent, which may be
    the case for future data sets, a notice is printed instead.
    :param churnDataFrame: churn data set.
    :returns: None.
    """
    if 'churn_risk_score' not in churnDataFrame.columns:
        print('"churn_risk_score" filed not present in data set')
        return

    columnLabels = {1: 'Percenrage At Risk', 0: 'Percentage Not At Risk'}
    shares = churnDataFrame['churn_risk_score'].value_counts(normalize=True)
    percentages = (shares * 100).round(2)
    # Transpose so each score becomes a column of a single-row summary table.
    summary = percentages.to_frame().T.rename(columns=columnLabels)
    print(summary)

In [6]:
def _impute_far_outliers(churnDataFrame: pd.DataFrame, col: str, asInteger: bool) -> None:
    """
    Replaces far outliers of one already-numeric column with the median of the
    remaining values. The frame is modified in place.
    :param churnDataFrame: churn data set.
    :param col: name of the numeric column to inspect.
    :param asInteger: round the median to a whole number before imputing.
    :returns: None.
    """
    values = churnDataFrame[col]
    # The fences are built from the 10th/90th percentiles (not the quartiles)
    # and widened by four times their spread, so only extreme values qualify.
    lowQuantile = values.quantile(0.10)
    highQuantile = values.quantile(0.90)
    spread = highQuantile - lowQuantile
    lower_bound = lowQuantile - 4 * spread
    upper_bound = highQuantile + 4 * spread

    # Comparisons on nullable columns yield <NA> for missing values; a missing
    # value is not an outlier, so treat it as False.
    isOutlier = ((values < lower_bound) | (values > upper_bound)).fillna(False).astype(bool)
    if not isOutlier.any():
        print(f'No far outliers found in {col}...')
        return

    outliers = values[isOutlier]
    print(f'{len(outliers)} far outliers found in {col}: {sorted(set(outliers))}...')
    medianVal = values[~isOutlier].median()
    if asInteger and not pd.isna(medianVal):
        # The median of an even number of integers can be fractional (e.g. 10.5),
        # which cannot be stored in an Int64 column and would raise on assignment.
        medianVal = int(round(float(medianVal)))
    churnDataFrame.loc[isOutlier, col] = medianVal
    print(f'The median valuse of {medianVal} imputed for all outliers')


def numeric_transform(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the known integer and float columns. Each column is coerced to numeric
    (unparseable entries become null), made non-negative and rounded (0 decimals
    and "Int64" for integer columns, 2 decimals and "float64" for float columns).
    Far outliers, i.e. values beyond four times the 10th-90th percentile spread
    outside those percentiles, are treated as invalid records and replaced by the
    median of the remaining values.
    The passed frame is modified in place and also returned.
    :param churnDataFrame: churn data set.
    :returns: processed churn data set with erroneous values imputed with the median and rounded.
    """
    intColsTransform = ['days_since_last_login', 'age', 'avg_frequency_login_days']
    floatColsTransform = ['avg_time_spent', 'avg_transaction_value', 'points_in_wallet']

    for col in intColsTransform:
        print(f'Processing Integer values in "{col}"...')
        numericValues = pd.to_numeric(churnDataFrame[col], errors='coerce')
        churnDataFrame[col] = numericValues.abs().round(0).astype('Int64')
        _impute_far_outliers(churnDataFrame, col, asInteger=True)

    for col in floatColsTransform:
        print(f'Processing float values in "{col}"...')
        numericValues = pd.to_numeric(churnDataFrame[col], errors='coerce')
        churnDataFrame[col] = numericValues.abs().round(2).astype('float64')
        _impute_far_outliers(churnDataFrame, col, asInteger=False)

    return churnDataFrame

In [7]:
def process_nulls(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Identify fields containing null values and impute values based on field data type:
    string fields receive 'Unknown', numeric fields receive 0 (floats are rounded to
    two decimals) and the known Yes/No fields receive 'No'.
    The passed frame is modified in place and also returned.
    :param churnDataFrame: churn data set.
    :returns: processed churn data set with null values imputed based on field type
    """
    # Identify columns containing null values
    print("Identify columns containing null records...")
    nullCounts = churnDataFrame.isna().sum()
    missingDataCols = nullCounts[nullCounts > 0].index.tolist()
    for col in missingDataCols:
        print(f"{col} contains: {nullCounts[col]} null values")

    # Yes/No fields are stored as text but imputed with 'No' rather than 'Unknown'.
    boolCols = ['used_special_discount', 'offer_application_preference', 'past_complaint']
    # Only touch the Yes/No fields that exist, so a data set lacking one of them
    # does not abort the run with a KeyError.
    presentBoolCols = [col for col in boolCols if col in churnDataFrame.columns]
    # is_numeric_dtype also recognises the nullable "Int64"/"Float64" dtypes, which
    # select_dtypes(include=['int64', 'float64']) would silently skip.
    numericCols = [col for col in missingDataCols
                   if pd.api.types.is_numeric_dtype(churnDataFrame[col])]
    stringCols = [col for col in missingDataCols
                  if col not in boolCols
                  and col not in numericCols
                  and (pd.api.types.is_object_dtype(churnDataFrame[col])
                       or pd.api.types.is_string_dtype(churnDataFrame[col]))]

    print()
    print("Processing null values in string columns...")
    for col in stringCols:
        colNullCount = churnDataFrame[col].isna().sum()
        print(f"processing null {colNullCount} records in {col}...")
        churnDataFrame[col] = churnDataFrame[col].fillna('Unknown')
    print("Processing null values in numeric columns...")
    for col in numericCols:
        colNullCount = churnDataFrame[col].isna().sum()
        print(f"processing null {colNullCount} records in {col}...")
        filledValues = churnDataFrame[col].fillna(0)
        if pd.api.types.is_float_dtype(filledValues):
            # Rounding is only meaningful for floats; integers are left untouched.
            filledValues = filledValues.round(2)
        churnDataFrame[col] = filledValues
    print("Processing null values in boolean columns...")
    for col in presentBoolCols:
        colNullCount = churnDataFrame[col].isna().sum()
        print(f"processing null {colNullCount} records in {col}...")
        churnDataFrame[col] = churnDataFrame[col].fillna('No')
    print()
    print('Null records processed...')
    return churnDataFrame

In [8]:
def date_transformation(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Convert "joining_date" field to a datetime data type and split it into the
    separate integer fields "join_year", "join_month" and "join_day". The original
    "joining_date" field is dropped from the returned data set. When the field is
    absent the data set is returned unchanged.
    :param churnDataFrame: churn data set.
    :returns: processed churn data set with date field split into separate fields.
    """
    if 'joining_date' not in churnDataFrame.columns:
        # The log must name the field that was actually looked up ("joining_date").
        print('joining_date field not in data...')
        return churnDataFrame

    churnDataFrame['joining_date'] = pd.to_datetime(churnDataFrame['joining_date'])
    joinDates = churnDataFrame['joining_date'].dt
    # Nullable Int64 keeps the parts integral even if a date could not be parsed.
    churnDataFrame['join_year'] = joinDates.year.astype('Int64')
    churnDataFrame['join_month'] = joinDates.month.astype('Int64')
    churnDataFrame['join_day'] = joinDates.day.astype('Int64')
    churnDataFrame = churnDataFrame.drop(columns=['joining_date'])
    print('joining_date field transformed...')
    return churnDataFrame

def time_transformation(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Parses "last_visit_time" (expected as HH:MM:SS) into datetime values and splits
    it into the integer fields "last_visit_hour", "last_visit_min" and
    "last_visit_sec". The original field is dropped from the returned data set.
    When the field is absent the data set is returned unchanged.
    :param churnDataFrame: churn data set.
    :returns: processed churn data set with time field split into separate fields.
    """
    if 'last_visit_time' not in churnDataFrame.columns:
        print('last_visit_time field not in data...')
        return churnDataFrame

    visitTimes = pd.to_datetime(churnDataFrame['last_visit_time'], format='%H:%M:%S')
    churnDataFrame['last_visit_time'] = visitTimes
    # New column name -> datetime component it is filled from.
    timeParts = (
        ('last_visit_hour', 'hour'),
        ('last_visit_min', 'minute'),
        ('last_visit_sec', 'second'),
    )
    for newCol, component in timeParts:
        churnDataFrame[newCol] = getattr(visitTimes.dt, component).astype('Int64')
    trimmedFrame = churnDataFrame.drop(columns=['last_visit_time'])
    print('last_visit_time field transformed...')
    return trimmedFrame

In [9]:
def special_characters(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Identifies columns holding values that consist solely of the special characters
    "!", "?" or "#" (erroneous placeholders) and imputes replacements. Only the
    erroneous records are changed; valid values are left as they are.
    "joined_through_referral" is handled separately: an erroneous record becomes
    'No' when its "referral_id" is the placeholder 'xxxxxxxx' and 'Yes' otherwise.
    Every other affected column receives 'Unknown'.
    The passed frame is modified in place and also returned.
    :param churnDataFrame: churn data set.
    :returns: processed churn data set with erroneous special characters removed and values imputed.
    """
    specialChars = r'[!?#]'
    # A record is erroneous when the WHOLE value is made of special characters;
    # str.match anchors at the start and \Z anchors at the end.
    wholeValuePattern = rf'(?:{specialChars})+\Z'

    # Column name -> boolean mask marking its erroneous records.
    erroneousMasks = {}
    for col in churnDataFrame.columns:
        isErroneous = churnDataFrame[col].astype(str).str.match(wholeValuePattern)
        if isErroneous.any():
            erroneousMasks[col] = isErroneous
    cols_with_special = list(erroneousMasks)
    print(f'Columns containing special character values: {cols_with_special}')

    for col, isErroneous in erroneousMasks.items():
        SpecialCharCount = int(isErroneous.sum())
        # Exact name comparison: `col in 'joined_through_referral'` would be a
        # substring test and also match names such as 'referral'.
        if col == 'joined_through_referral' and 'referral_id' in churnDataFrame.columns:
            hasNoReferral = churnDataFrame['referral_id'] == 'xxxxxxxx'
            inferredVals = hasNoReferral.map({True: 'No', False: 'Yes'})
            churnDataFrame[col] = churnDataFrame[col].mask(isErroneous, inferredVals)
        else:
            churnDataFrame[col] = churnDataFrame[col].mask(isErroneous, 'Unknown')
        print(f'Processed {SpecialCharCount} records of special charcters in "{col}" column.....')
    return churnDataFrame

In [10]:
def feedback_transform(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms feedback column into the following numeric classifications:
    0: Negative
    1: Positive
    2: Neutral
    Results in this field are finite and can be transformed with hardcoded mappings.
    Text that is not part of the mapping (for example an 'Unknown' placeholder
    imputed for a missing record) is logged and classified as neutral instead of
    aborting the run. Missing (null) feedback stays null.
    The passed frame is modified in place and also returned.
    :param churnDataFrame: churn data set.
    :returns: processed churn data set with feedback values mapped to pre defined integers.
    """
    neutralCode = 2

    # Transforming feedback values to positive, negative or neutral integer mapping
    feedBackMapping = {
        'Products always in Stock': 1,
        'User Friendly Website': 1,
        'Poor Customer Service': 0,
        'Poor Product Quality': 0,
        'Reasonable Price': 1,
        'Quality Customer Care': 1,
        'Too many ads': 0,
        'Poor Website': 0,
        'No reason specified': neutralCode,
    }
    # Display value mappings to log output
    for k, v in feedBackMapping.items():
        print(f'Mapping value: "{k}", to integer value: {v}')

    cleanedFeedback = churnDataFrame['feedback'].str.strip()
    feedbackCodes = cleanedFeedback.map(feedBackMapping)
    # Text present but absent from the mapping would otherwise reach
    # astype('Int64') as a raw string and raise.
    isUnmapped = cleanedFeedback.notna() & feedbackCodes.isna()
    if isUnmapped.any():
        unknownVals = sorted(set(cleanedFeedback[isUnmapped]))
        print(f'Unrecognised feedback values mapped to neutral ({neutralCode}): {unknownVals}')
        feedbackCodes = feedbackCodes.mask(isUnmapped, neutralCode)
    churnDataFrame['feedback'] = feedbackCodes.astype('Int64')
    return churnDataFrame

In [11]:
def label_encode(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Perform label encoding on all categorical (object dtype) fields, converting
    discrete values to integers for tree based modelling downstream. The value
    mapping of every categorical variable is displayed in the log output with the
    labels and their integer codes listed in matching order.
    The passed frame is modified in place and also returned.
    :param churnDataFrame: churn data set.
    :returns: processed churn data set with categorical variables converted to integers
    """
    categoricalCols = list(churnDataFrame.select_dtypes(include='object').columns)
    for col in categoricalCols:
        print(f'Label Encoding values in {col}...')
        labelEncoder = preprocessing.LabelEncoder()
        churnDataFrame[col] = labelEncoder.fit_transform(churnDataFrame[col])
        # classes_ lists the labels in code order (position i is encoded as i),
        # so logging it next to 0..n-1 shows the real mapping; two unordered
        # sets would pair labels and codes arbitrarily.
        categoricalVals = labelEncoder.classes_.tolist()
        encodedVals = list(range(len(categoricalVals)))
        print(f'Categorical values mapped from {categoricalVals} to integer values: {encodedVals}')
    return churnDataFrame

In [12]:
def remove_features(churnDataFrame: pd.DataFrame) -> pd.DataFrame:
    """
    Drops the "security_no" and "referral_id" fields and, when present, moves the
    "churn_risk_score" field to the right-most column position.
    :param churnDataFrame: churn data set.
    :returns: processed churn data set with certain fields removed and columns re-ordered.
    """
    trimmedFrame = churnDataFrame.drop(columns=['security_no', 'referral_id'])
    print('Removed "security_no" and "referral_id" columns.....')
    if 'churn_risk_score' not in trimmedFrame.columns:
        return trimmedFrame

    # Every other column keeps its relative order; the target goes last.
    orderedCols = [name for name in trimmedFrame.columns if name != 'churn_risk_score']
    orderedCols.append('churn_risk_score')
    trimmedFrame = trimmedFrame.reindex(columns=orderedCols)
    print('Re-Indexed "churn_risk_score" field')
    return trimmedFrame

In [13]:
def execute_transform_pipeline() -> None:
    """
    Function to execute the cleaning and transformation pipeline. Every "*.csv"
    file in the current working directory is processed in a loop (in sorted file
    name order) by calling all cleaning functions sequentially. Each cleaned data
    set is written to the "clean" folder as "churn_clean_<n>.csv", where <n> is the
    zero-based position of the input file, so several inputs do not overwrite
    each other.
    Logging outputs will be printed below for visibility.
    :returns: None
    """
    # Extract a list of file names to be processed; sorted so the numbering of
    # the output files is reproducible between runs.
    csvFiles = sorted(str(file) for file in Path('.').glob('*.csv'))
    # enumerate() supplies the running file number. A counter initialised inside
    # the loop restarts at 0 for every file and all outputs collide on one name.
    for count, sourceFile in enumerate(csvFiles):
        data = pd.read_csv(sourceFile)

        # Creating random ID to simulate a real data transformation pipeline in log output
        stringID = str(uuid.uuid4())
        print("Executing Data Trasnformation Pipeline...")
        print()
        print(f"Run ID: {stringID}")
        print()

        print("----- Raw Churn Meta Data -----")
        meta_data_info(data)
        print()

        print("----- Removing Errenous Rows -----")
        data = remove_errors(data)
        print()

        print("----- Removing Duplicate Rows -----")
        data = remove_duplicates(data)
        print()

        print("----- Distribution (%) of Customers at risk of Churn -----")
        churn_distrubtion(data)
        print()

        print("----- Transform Numeric Values -----")
        data = numeric_transform(data)
        print()

        print("----- Impute Values for Null Records -----")
        data = process_nulls(data)
        print()

        print("----- Transform Date and Time Fields -----")
        data = date_transformation(data)
        data = time_transformation(data)
        print()

        print("----- Impute Values for Special Characters -----")
        data = special_characters(data)
        print()

        print("----- Transform Feedback Column -----")
        data = feedback_transform(data)
        print()

        print("----- Removing Unnecessary Features -----")
        data = remove_features(data)
        print()

        print("----- Encode Categorical Variables -----")
        data = label_encode(data)
        print()

        print(f"Data Transformation Run ID: {stringID} Complete...")
        fileName = f"churn_clean_{count}.csv"
        cleanFolderPath = f"clean/{fileName}"
        os.makedirs("clean", exist_ok=True)
        data.to_csv(cleanFolderPath, index=False)
        print(f"Transformed data set: {fileName} written to 'Clean' folder...")

In [14]:
# Execute transformation pipeline: processes every "*.csv" file found in the
# current working directory and writes the cleaned output to the "clean" folder.
execute_transform_pipeline()

Executing Data Trasnformation Pipeline...

Run ID: 67c08f0b-a190-415b-8cad-80d3506e1aaa

----- Raw Churn Meta Data -----
Column Count: 23
Row Count: 36992
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36992 entries, 0 to 36991
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           36992 non-null  int64  
 1   gender                        36992 non-null  object 
 2   security_no                   36992 non-null  object 
 3   region_category               31564 non-null  object 
 4   membership_category           36992 non-null  object 
 5   joining_date                  36992 non-null  object 
 6   joined_through_referral       36992 non-null  object 
 7   referral_id                   36992 non-null  object 
 8   preferred_offer_types         36704 non-null  object 
 9   medium_of_operation           36992 non-null  object 
 10  internet_option        