# Churn Rate Cleaning and Preprocessing

In [1]:
# imports
import re
import warnings

import pandas as pd


## Load Data

In [2]:
# Read the raw metadata export into a dataframe
df = pd.read_csv(filepath_or_buffer='data/metadata_dataset.csv')
# List the raw column names exactly as they appear in the file
list(df.columns)

["[1]{'Version'}[1]{'ContractItem'}[1]{'CancellationStatusCode'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ContractItem'}[1]{'CustomerContractLifeCycleStatusCode'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ContractItem'}[1]{'Description'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ContractItem'}[1]{'InternalID'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ContractItem'}[1]{'ProductCategory'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ContractItem'}[1]{'ProductDescription'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ContractItem'}[1]{'SupportEndDate'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ContractItem'}[1]{'SupportStartDate'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ContractItem'}[1]{'ValidityStatusCode'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ContractItem'}[1]{'ConcurrentSessions'}[1]{'Content'}",
 "[1]{'Version'}[1]{'Description'}[1]{'Content'}",
 "[1]{'Version'}[1]{'EndDateTime'}[1]{'Content'}",
 "[1]{'Version'}[1]{'StartDateTime'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ItemCount'}[1]{'Content'}",
 "[1]{'Version'}[1]{'ItemList

## Rename columns for readability and consistency

In [3]:
# Strip the noise from the raw column names, one pattern after another:
#   1. the "[1]" index markers
#   2. the trailing {'Content'} segment
#   3. the leading {'Version'} segment
#   4. any remaining braces and single quotes
for pattern in (r"\[1\]", r"\{'Content'\}", r"\{'Version'\}", r'[{}\']'):
    df.columns = df.columns.str.replace(pattern, "", regex=True)

# convert camel to snake case
def camel_to_snake(name):
    """Return `name` converted from CamelCase to lower-case snake_case.

    Two substitution passes insert the underscores, then the result is
    lower-cased:

    1. An underscore is placed before every capitalised word (one uppercase
       letter followed by one or more lowercase letters) that is preceded by
       any character.
    2. An underscore is placed at every remaining boundary where a lowercase
       letter or digit is directly followed by an uppercase letter
       (e.g. the "lI" in "InternalID").
    """
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    with_all_breaks = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
    return with_all_breaks.lower()

# Run every column name through the snake-case converter
df.columns = list(map(camel_to_snake, df.columns))

# Show the renamed columns
print(df.columns.tolist())

['contract_item_cancellation_status_code', 'contract_item_customer_contract_life_cycle_status_code', 'contract_item_description', 'contract_item_internal_id', 'contract_item_product_category', 'contract_item_product_description', 'contract_item_support_end_date', 'contract_item_support_start_date', 'contract_item_validity_status_code', 'contract_item_concurrent_sessions', 'description', 'end_date_time', 'start_date_time', 'item_count', 'item_list_cancellation_status_code', 'item_list_customer_contract_life_cycle_status_code', 'item_list_validity_status_code', 'sap_internal_id', 'contract_label', 'customer_earliest_start', 'customer_latest_end', 'customer_label', 'sla', 'product_category', 'service_level_regex', 'otrs_version', 'system_type', 'feature_add_ons', 'cancellation_date', 'customer_country', 'concat_volume', 'concat_currency', 'cancellation_date_orca', 'xml_key', 'end_customer_id', 'otrs_system_id']


## Convert all time columns to datetime

In [4]:
# Convert columns with 'start', 'end', 'date' in the column name to datetime
date_columns = df.columns[df.columns.str.contains('start|_end|end_date|date', case=False)]

# Silence the "Could not infer format" UserWarning only while these columns are
# parsed; catch_warnings() restores the previous filters on exit, so the
# suppression does not leak into later cells.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning, message=".*Could not infer format.*")
    for col in date_columns:
        # errors='coerce' turns values that cannot be parsed into NaT instead of raising
        df[col] = pd.to_datetime(df[col], errors='coerce')

## Convert all numerical columns to numeric if they are not already

In [5]:
# concat_volume is stored as text containing both dots and commas.
# Remove the dots first, then turn the commas into dots, and parse the result
# as a number (presumably the dots are thousands separators and the comma is
# the decimal mark - TODO confirm against the raw file).
df['concat_volume'] = pd.to_numeric(
    df['concat_volume']
    .str.replace('.', '', regex=False)
    .str.replace(',', '.', regex=False)
)


## Get a general overview of the data types and missing values

In [6]:
# summarise the dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 36 columns):
 #   Column                                                  Non-Null Count  Dtype         
---  ------                                                  --------------  -----         
 0   contract_item_cancellation_status_code                  208 non-null    object        
 1   contract_item_customer_contract_life_cycle_status_code  208 non-null    object        
 2   contract_item_description                               208 non-null    object        
 3   contract_item_internal_id                               208 non-null    int64         
 4   contract_item_product_category                          208 non-null    object        
 5   contract_item_product_description                       208 non-null    object        
 6   contract_item_support_end_date                          208 non-null    datetime64[ns]
 7   contract_item_support_start_date                        208 no

## Clean the ID columns

### Check for duplicates in ID columns

In [7]:
# count the duplicated values in every column whose name contains '_id'
for col in df.columns:
    if '_id' in col:
        print(f'{col}: {df[col].duplicated().sum()} duplicates')

# 'xml_key' does not contain '_id', so the loop above skips it; check it explicitly.
# The outer quotes are double quotes because reusing the same quote character
# inside an f-string expression is a SyntaxError before Python 3.12.
print(f"xml_key: {df['xml_key'].duplicated().sum()} duplicates")

contract_item_internal_id: 204 duplicates
sap_internal_id: 0 duplicates
end_customer_id: 33 duplicates
otrs_system_id: 0 duplicates
xml_key: 0 duplicates


### Drop ID columns with no duplicates

In [8]:
# column count before dropping
print(len(df.columns))

# collect every '_id' column that contains no duplicated value
unique_id_columns = []
for col in df.columns:
    if '_id' in col and not df[col].duplicated().any():
        unique_id_columns.append(col)

# drop those columns together with xml_key
df = df.drop(columns=unique_id_columns + ['xml_key'])

# column count after dropping
print(len(df.columns))

36
33


## Save to CSV

In [9]:
# display the cleaned dataframe as a final check before it is written to disk
df

Unnamed: 0,contract_item_cancellation_status_code,contract_item_customer_contract_life_cycle_status_code,contract_item_description,contract_item_internal_id,contract_item_product_category,contract_item_product_description,contract_item_support_end_date,contract_item_support_start_date,contract_item_validity_status_code,contract_item_concurrent_sessions,...,service_level_regex,otrs_version,system_type,feature_add_ons,cancellation_date,customer_country,concat_volume,concat_currency,cancellation_date_orca,end_customer_id
0,Not Canceled,In Process,OTRS GOLD,10,Contracts Managed OTRS,OTRS GOLD,2023-12-06 23:00:00,2022-12-06 23:00:00,Active,50.0,...,Gold,7.0.48,managed,"['SaaSPortalConnector', 'OTRSCalendarResourceP...",NaT,GERMANY,17495.0,Euro,NaT,CFHbGks3
1,Not Canceled,In Process,OTRS On-Premise PLATINUM,10,Contracts On-Premise OTRS,OTRS On-Premise PLATINUM,2023-12-11 23:00:00,2022-12-11 23:00:00,Active,300.0,...,Platinum,7.0.23,auto,"['GeneralCatalog', 'OTRSSystemConfigurationHis...",NaT,SPAIN,37800.0,Euro,NaT,laVvIOXe
2,Not Canceled,In Process,OTRS On-Premise GOLD,10,Contracts On-Premise OTRS,OTRS On-Premise GOLD,2023-12-31 23:00:00,2022-12-31 23:00:00,Active,50.0,...,Gold,7.0.22,auto,"['OTRSHideShowDynamicFields', 'OTRSEscalationS...",2017-05-16 08:38:00,GERMANY,5995.0,Euro,NaT,Yc6VmmVi
3,Not Canceled,In Process,OTRS On-Premise GOLD (Testsystem),10,Contracts On-Premise OTRS,OTRS On-Premise GOLD,2023-12-31 23:00:00,2022-12-31 23:00:00,Active,50.0,...,Gold,7.0.12,auto,"['GeneralCatalog', 'OTRSSystemConfigurationHis...",2017-05-16 08:38:00,GERMANY,5995.0,Euro,NaT,Yc6VmmVi
4,Not Canceled,In Process,OTRS On-Premise SILVER,10,Contracts On-Premise OTRS,OTRS On-Premise SILVER,2023-12-31 23:00:00,2022-12-31 23:00:00,Active,10.0,...,Silver,7.0.12,auto,"['OTRSAdvancedEscalations', 'AkquinetAssetSear...",2017-05-16 08:38:00,GERMANY,3995.0,Euro,NaT,Yc6VmmVi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,Cancellation Requested,In Process,OTRS Contract - Annual Support ENTRY ADV,20,Contracts On-Premise Old,Old Basic,2024-08-15 22:00:00,2023-08-15 22:00:00,Active,,...,Silver,5.0.1,manual,"['FAQ', 'OTRSMasterSlave']",NaT,GERMANY,4995.0,Euro,2023-06-29 06:36:00,5gw4WtWX
204,Not Canceled,Completed,OTRS On-Premise GOLD,10,Contracts On-Premise OTRS,OTRS On-Premise GOLD,2023-01-11 23:00:00,2022-10-11 22:00:00,Expired,50.0,...,Gold,8.0.36,auto,"['OTRSCloneDB', 'Survey', 'OTRSConfigurationMa...",NaT,UNITED STATES OF AMERICA,15047.0,US-Dollar,NaT,U00PI7xM
205,Cancellation Requested,In Process,OTRS On-Premise PLATINUM,20,Contracts On-Premise OTRS,OTRS On-Premise PLATINUM,2024-10-16 22:00:00,2023-10-16 22:00:00,Active,200.0,...,Platinum,7.0.40,auto,"['OTRSMasterSlave', 'OTRSDynamicFieldDatabase'...",NaT,AUSTRIA,25995.0,Euro,NaT,Fs3qolwK
206,Not Canceled,In Process,OTRS GOLD,40,Contracts Managed OTRS,OTRS GOLD,2024-10-26 22:00:00,2023-10-26 22:00:00,Active,50.0,...,Gold,2023.1.1,managed,"['OTRSStatePreselectionResponseTemplates', 'OT...",NaT,GERMANY,15995.0,Euro,NaT,JuFKu1yu


In [10]:
# Write the cleaned dataframe to disk.
# index=True is the pandas default, spelled out here because it also writes the
# row index as an unnamed first column.
# NOTE(review): confirm downstream readers expect that extra column.
df.to_csv("data/data_after_processing.csv", index=True)