# Churn Rate Analysis and Insights

In [16]:
# imports
import pandas as pd


## Load Data

In [17]:
# Load data
df = pd.read_csv('data/data_after_processing.csv')
# show column names
df.columns.tolist()

['Unnamed: 0',
 'contract_item_cancellation_status_code',
 'contract_item_customer_contract_life_cycle_status_code',
 'contract_item_description',
 'contract_item_internal_id',
 'contract_item_product_category',
 'contract_item_product_description',
 'contract_item_support_end_date',
 'contract_item_support_start_date',
 'contract_item_validity_status_code',
 'contract_item_concurrent_sessions',
 'description',
 'end_date_time',
 'start_date_time',
 'item_count',
 'item_list_cancellation_status_code',
 'item_list_customer_contract_life_cycle_status_code',
 'item_list_validity_status_code',
 'contract_label',
 'customer_earliest_start',
 'customer_latest_end',
 'customer_label',
 'sla',
 'product_category',
 'service_level_regex',
 'otrs_version',
 'system_type',
 'feature_add_ons',
 'cancellation_date',
 'customer_country',
 'concat_volume',
 'concat_currency',
 'cancellation_date_orca',
 'end_customer_id']

## Analyze the ID column

In [18]:
# show the rows where the end_customer_id is duplicated, sorted by end_customer_id
df[df.end_customer_id.duplicated(keep=False)].sort_values('end_customer_id')

Unnamed: 0.1,Unnamed: 0,contract_item_cancellation_status_code,contract_item_customer_contract_life_cycle_status_code,contract_item_description,contract_item_internal_id,contract_item_product_category,contract_item_product_description,contract_item_support_end_date,contract_item_support_start_date,contract_item_validity_status_code,...,service_level_regex,otrs_version,system_type,feature_add_ons,cancellation_date,customer_country,concat_volume,concat_currency,cancellation_date_orca,end_customer_id
8,8,Not Canceled,Completed,OTRS On-Premise GOLD,10,Contracts On-Premise OTRS,OTRS On-Premise GOLD,2023-01-25 23:00:00,2022-01-25 23:00:00,Expired,...,Gold,8.0.27,auto,"['OTRSCIsInCustomerFrontend', 'OTRSConfigurati...",,GERMANY,14995.0,Euro,,1ybA3NXK
9,9,Not Canceled,Completed,OTRS On-Premise SILVER,10,Contracts On-Premise OTRS,OTRS On-Premise SILVER,2023-01-25 23:00:00,2022-01-25 23:00:00,Expired,...,Silver,8.0.27,auto,"['OTRSCIsInCustomerFrontend', 'OTRSConfigurati...",,GERMANY,3995.0,Euro,,1ybA3NXK
89,89,Not Canceled,In Process,OTRS On-Premise GOLD,20,Contracts On-Premise OTRS,OTRS On-Premise GOLD,2024-11-06 23:00:00,2023-11-06 23:00:00,Active,...,Gold 100 CA,7.0.29,auto,"['OTRSReady2AdoptProcesses', 'OTRSAdvancedEsca...",,GERMANY,9495.0,Euro,,3FGHqzJj
90,90,Not Canceled,In Process,OTRS On-Premise GOLD,20,Contracts On-Premise OTRS,OTRS On-Premise GOLD,2024-11-06 23:00:00,2023-11-06 23:00:00,Active,...,Gold 100 CA,8.0.27,auto,"['OTRSConfigurationManagement', 'OTRSReady2Ado...",,GERMANY,9495.0,Euro,,3FGHqzJj
74,74,Not Canceled,In Process,OTRS GOLD,20,Contracts Managed OTRS,OTRS GOLD,2024-09-18 22:00:00,2023-09-18 22:00:00,Active,...,Gold,2023.1.1,managed,"['OTRSReady2AdoptWebServices', 'SaaSPortalConn...",2023-05-03 07:27:00,GERMANY,17495.0,Euro,2023-05-03 07:27:00,41W93rIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,170,Canceled,Completed,OTRS On-Premise PLATINUM,10,Contracts On-Premise OTRS,OTRS On-Premise PLATINUM,2023-09-30 22:00:00,2022-09-30 22:00:00,Active,...,Platinum,7.0.38,auto,"['ImportExport', 'OTRSSystemConfigurationHisto...",2021-11-19 07:49:00,GERMANY,34995.0,EUR,2023-04-28 09:57:00,vGFcrZOZ
69,69,Not Canceled,In Process,OTRS SILVER,20,Contracts Managed OTRS,OTRS SILVER,2024-08-31 22:00:00,2023-08-31 22:00:00,Active,...,Silver,2023.1.1,managed,"['OTRSPrimarySecondary', 'OTRSSystemConfigurat...",,GERMANY,5495.0,Euro,,wuvJX3hp
70,70,Not Canceled,In Process,Storage Extension 1TB,30,Other Contract Related Business,Storage Extension 1TB,2024-08-31 22:00:00,2023-09-27 22:00:00,Active,...,Gold,2023.1.1,managed,"['SaaSPortalConnector', 'OTRSAutomation', 'OTR...",,GERMANY,17495.0,Euro,,wuvJX3hp
31,31,Not Canceled,Completed,OTRS TITANIUM,10,Contracts Managed OTRS,OTRS TITANIUM,2023-04-10 22:00:00,2022-04-10 22:00:00,Expired,...,Titanium,2023.1.1,managed,"['OTRSHideShowDynamicFields', 'SaaSPortalConne...",,GERMANY,23995.0,Euro,,zrp7aSpq


## Analyze value counts for categorical columns

In [19]:
# List of columns to include in the analysis (status-related columns + contract_label and customer_label)
status_columns = df.columns[df.columns.str.contains('status', case=False)]
additional_columns = ['contract_label', 'customer_label']

# Combine both sets of columns
all_columns_to_check = list(status_columns) + additional_columns

# Show value counts for each column
for col in all_columns_to_check:
    if col in df.columns:  # Check if the column exists in the dataframe
        print(f"Value counts for {col}:")
        print(df[col].value_counts())
        print("\n")

Value counts for contract_item_cancellation_status_code:
contract_item_cancellation_status_code
Not Canceled              121
Canceled                   72
Cancellation Requested     15
Name: count, dtype: int64


Value counts for contract_item_customer_contract_life_cycle_status_code:
contract_item_customer_contract_life_cycle_status_code
In Process    113
Completed      93
Released        2
Name: count, dtype: int64


Value counts for contract_item_validity_status_code:
contract_item_validity_status_code
Active         132
Expired         70
Not Started      6
Name: count, dtype: int64


Value counts for item_list_cancellation_status_code:
item_list_cancellation_status_code
Not Canceled                      114
Canceled                           69
Partial Cancellation Requested     11
Cancellation Requested              9
Partially Canceled                  5
Name: count, dtype: int64


Value counts for item_list_customer_contract_life_cycle_status_code:
item_list_customer_contract_

## Count unique combinations of status columns

In [20]:
# Select the relevant columns for the analysis
columns_to_analyze = list(status_columns) + additional_columns

# Filter the DataFrame to include only these columns
filtered_df = df[columns_to_analyze]
combination_counts = filtered_df.value_counts().reset_index()
combination_counts.columns = columns_to_analyze + ["count"]

# Display unique combinations across the selected columns
unique_combinations = combination_counts.drop_duplicates()
# Show the unique combinations
unique_combinations

Unnamed: 0,contract_item_cancellation_status_code,contract_item_customer_contract_life_cycle_status_code,contract_item_validity_status_code,item_list_cancellation_status_code,item_list_customer_contract_life_cycle_status_code,item_list_validity_status_code,contract_label,customer_label,count
0,Not Canceled,In Process,Active,Not Canceled,In Process,Active,active,active,90
1,Canceled,Completed,Expired,Canceled,Completed,Expired,cancelled,cancelled,45
2,Canceled,Completed,Active,Canceled,Completed,Active,cancelled,cancelled,19
3,Not Canceled,Completed,Expired,Not Canceled,In Process,Active,active,active,14
4,Cancellation Requested,In Process,Active,Cancellation Requested,In Process,Active,cancelled,cancelled,6
5,Cancellation Requested,In Process,Active,Partial Cancellation Requested,In Process,Active,cancelled,cancelled,4
6,Not Canceled,Completed,Expired,Partial Cancellation Requested,In Process,Active,cancelled,cancelled,4
7,Not Canceled,In Process,Not Started,Not Canceled,In Process,Active,active,active,4
8,Canceled,Completed,Expired,Canceled,Completed,Expired,cancelled,active,3
9,Cancellation Requested,In Process,Active,Cancellation Requested,In Process,Active,cancelled,active,3


## Check for logical inconsistencies in the status columns

In [21]:
# show the values of all the columns that have status in the name
status_columns = df.columns[df.columns.str.contains('status', case=False)]
for col in status_columns:
    print(f"Value counts for {col}:")
    print(df[col].value_counts())
    print("\n")

Value counts for contract_item_cancellation_status_code:
contract_item_cancellation_status_code
Not Canceled              121
Canceled                   72
Cancellation Requested     15
Name: count, dtype: int64


Value counts for contract_item_customer_contract_life_cycle_status_code:
contract_item_customer_contract_life_cycle_status_code
In Process    113
Completed      93
Released        2
Name: count, dtype: int64


Value counts for contract_item_validity_status_code:
contract_item_validity_status_code
Active         132
Expired         70
Not Started      6
Name: count, dtype: int64


Value counts for item_list_cancellation_status_code:
item_list_cancellation_status_code
Not Canceled                      114
Canceled                           69
Partial Cancellation Requested     11
Cancellation Requested              9
Partially Canceled                  5
Name: count, dtype: int64


Value counts for item_list_customer_contract_life_cycle_status_code:
item_list_customer_contract_

In [22]:
# Flag logical errors for contract items
df['contract_item_logical_error'] = (
    # Contract item canceled but lifecycle status is In Process
    ((df['contract_item_cancellation_status_code'] == 'Canceled') & (df['contract_item_customer_contract_life_cycle_status_code'] == 'In Process')) |
    # Active contract item with lifecycle status Completed
    ((df['contract_item_validity_status_code'] == 'Active') & (df['contract_item_customer_contract_life_cycle_status_code'] == 'Completed')) |
    # 'Expired' contract item still in 'In Process' lifecycle
    ((df['contract_item_validity_status_code'] == 'Expired') & (df['contract_item_customer_contract_life_cycle_status_code'] == 'In Process')) |
    # Cancellation requested but validity is Not Started
    ((df['contract_item_cancellation_status_code'] == 'Cancellation Requested') & (df['contract_item_validity_status_code'] == 'Not Started')) |
    # Subcontract is cancelled but contract label is active
    ((df['contract_item_cancellation_status_code'] == 'Canceled') & (df['contract_label'] == 'active')) |
    # Subcontract is active but contract label is canceled
    ((df['contract_item_cancellation_status_code'] == 'Not Canceled') & (df['contract_label'] == 'cancelled')) 
    
)

# Flag logical errors for item list
df['item_list_logical_error'] = (
    # Item list canceled but lifecycle status is In Process
    ((df['item_list_cancellation_status_code'] == 'Canceled') & (df['item_list_customer_contract_life_cycle_status_code'] == 'In Process')) |
    # 'Expired' item list still in 'In Process' lifecycle
    ((df['item_list_validity_status_code'] == 'Expired') & (df['item_list_customer_contract_life_cycle_status_code'] == 'In Process')) |
    # Active item list with lifecycle status Completed
    ((df['item_list_validity_status_code'] == 'Active') & (df['item_list_customer_contract_life_cycle_status_code'] == 'Completed')) |
    # Partially Canceled item list still in 'In Process' lifecycle
    ((df['item_list_cancellation_status_code'] == 'Partially Canceled') & (df['item_list_customer_contract_life_cycle_status_code'] == 'In Process')) |
    # Cancellation requested but validity is Not Started
    ((df['item_list_cancellation_status_code'] == 'Cancellation Requested') & (df['item_list_validity_status_code'] == 'Not Started')) |
    # Item list canceled but contract label is active
    ((df['item_list_cancellation_status_code'] == 'Canceled') & (df['contract_label'] == 'active')) |
    # Item list active but contract label is canceled
    ((df['item_list_cancellation_status_code'] == 'Not Canceled') & (df['contract_label'] == 'cancelled'))
)

# Flag logical errors for customer label
df['customer_logical_error'] = (
    # Cancelled customer but has an active contract
    ((df['customer_label'] == 'cancelled') & (df['contract_item_validity_status_code'] == 'Active') & (df['contract_item_cancellation_status_code'] == 'Not Canceled') & (df['contract_label'] == 'active')) |
    # Active customer but all contracts are cancelled
    ((df['customer_label'] == 'active') & (df['contract_item_validity_status_code'] == 'Expired') & (df['contract_item_cancellation_status_code'] == 'Canceled') & (df['contract_label'] == 'cancelled'))
)


## Check for time logic errors

In [23]:
# Check for logical errors in date columns
df['date_logical_error'] = (
    # 1. End date should not be before start date
    ((df['end_date_time'] < df['start_date_time']) & df['end_date_time'].notnull() & df['start_date_time'].notnull()) |
    ((df['contract_item_support_end_date'] < df['contract_item_support_start_date']) & 
     df['contract_item_support_end_date'].notnull() & df['contract_item_support_start_date'].notnull()) |
    # 2. Customer latest end date should not be before the earliest start date
    ((df['customer_latest_end'] < df['customer_earliest_start']) & 
     df['customer_latest_end'].notnull() & df['customer_earliest_start'].notnull())
)


In [24]:
# Combine errors from all sources
df['logical_error'] = df['contract_item_logical_error'] | df['item_list_logical_error'] | df['customer_logical_error'] | df['date_logical_error']

# View rows with logical errors
df_with_errors = df[df['logical_error'] == True]

# Display the problematic rows
df_for_review = df_with_errors[['contract_item_cancellation_status_code', 'contract_item_customer_contract_life_cycle_status_code', 'contract_item_validity_status_code', 'contract_item_support_start_date', 'contract_item_support_end_date', 'item_list_cancellation_status_code', 'item_list_customer_contract_life_cycle_status_code', 'item_list_validity_status_code', 'start_date_time', 'end_date_time', 'contract_label', 'customer_label', 'customer_earliest_start', 'customer_latest_end', 'contract_item_logical_error', 'item_list_logical_error', 'customer_logical_error', 'date_logical_error', 'logical_error']]
df_for_review

Unnamed: 0,contract_item_cancellation_status_code,contract_item_customer_contract_life_cycle_status_code,contract_item_validity_status_code,contract_item_support_start_date,contract_item_support_end_date,item_list_cancellation_status_code,item_list_customer_contract_life_cycle_status_code,item_list_validity_status_code,start_date_time,end_date_time,contract_label,customer_label,customer_earliest_start,customer_latest_end,contract_item_logical_error,item_list_logical_error,customer_logical_error,date_logical_error,logical_error
68,Not Canceled,In Process,Active,2023-08-23 22:00:00,2024-08-23 22:00:00,Not Canceled,In Process,Active,2023-08-23 22:00:00,2024-08-23 22:00:00,active,cancelled,2022-08-23 22:00:00,2024-08-23 22:00:00,False,False,True,False,True
82,Not Canceled,In Process,Active,2023-10-08 22:00:00,2024-10-08 22:00:00,Not Canceled,In Process,Active,2023-10-08 22:00:00,2024-10-08 22:00:00,active,cancelled,2014-03-01 00:00:00,2024-10-08 22:00:00,False,False,True,False,True
112,Not Canceled,Completed,Expired,2022-01-06 23:00:00,2023-01-06 23:00:00,Not Canceled,Completed,Expired,2021-01-06 23:00:00,2023-01-06 23:00:00,cancelled,cancelled,2013-04-01 00:00:00,2023-01-06 23:59:00,True,True,False,False,True
114,Canceled,Completed,Active,2022-01-21 23:00:00,2023-01-21 23:00:00,Canceled,Completed,Active,2020-01-21 23:00:00,2023-01-21 23:00:00,cancelled,cancelled,2016-10-09 00:00:00,2023-01-21 23:59:00,True,True,False,False,True
115,Canceled,Completed,Active,2022-01-31 23:00:00,2023-01-31 23:00:00,Canceled,Completed,Active,2022-01-31 23:00:00,2023-01-31 23:00:00,cancelled,cancelled,2021-02-01 00:00:00,2023-01-31 23:59:00,True,True,False,False,True
122,Not Canceled,Completed,Expired,2022-03-15 23:00:00,2023-03-15 23:00:00,Not Canceled,Completed,Expired,2022-03-15 23:00:00,2023-03-15 23:00:00,cancelled,cancelled,2021-03-16 00:00:00,2023-03-15 23:59:00,True,True,False,False,True
130,Canceled,Completed,Active,2022-04-19 22:00:00,2023-04-19 22:00:00,Canceled,Completed,Active,2021-04-19 22:00:00,2023-04-19 22:00:00,cancelled,active,2016-11-30 23:00:00,2024-11-30 23:00:00,True,True,False,False,True
131,Canceled,Completed,Active,2022-04-19 22:00:00,2023-04-19 22:00:00,Canceled,Completed,Active,2022-04-19 22:00:00,2023-04-19 22:00:00,cancelled,cancelled,2022-04-19 22:00:00,2023-04-19 23:59:00,True,True,False,False,True
132,Canceled,Completed,Active,2022-04-26 22:00:00,2023-04-26 22:00:00,Canceled,Completed,Active,2019-04-26 22:00:00,2023-04-26 22:00:00,cancelled,cancelled,2015-11-01 00:00:00,2023-05-31 22:00:00,True,True,False,False,True
136,Canceled,Completed,Active,2022-04-30 22:00:00,2023-04-30 22:00:00,Canceled,Completed,Active,2022-04-30 22:00:00,2023-04-30 22:00:00,cancelled,cancelled,2007-05-01 00:00:00,2023-04-30 23:59:00,True,True,False,False,True


### Define Churn

In [25]:
# Define churn based on some conditions
df['churn'] = (
    # Check if the customer is labeled 'cancelled'
    (df['customer_label'] == 'cancelled') |
    # Additional conditions based on contract states
    ((df['contract_label'] == 'cancelled') & 
     (df['contract_item_validity_status_code'].isin(['Expired', 'Not Started'])))
).astype(int)  # Convert boolean values to 0 and 1

df[['customer_label', 'contract_label', 'contract_item_validity_status_code', 'churn']]

Unnamed: 0,customer_label,contract_label,contract_item_validity_status_code,churn
0,active,active,Active,0
1,active,active,Active,0
2,active,active,Active,0
3,active,active,Active,0
4,active,active,Active,0
...,...,...,...,...
203,cancelled,cancelled,Active,1
204,cancelled,cancelled,Expired,1
205,active,cancelled,Active,0
206,cancelled,cancelled,Active,1


## Save to CSV

In [26]:
df

Unnamed: 0.1,Unnamed: 0,contract_item_cancellation_status_code,contract_item_customer_contract_life_cycle_status_code,contract_item_description,contract_item_internal_id,contract_item_product_category,contract_item_product_description,contract_item_support_end_date,contract_item_support_start_date,contract_item_validity_status_code,...,concat_volume,concat_currency,cancellation_date_orca,end_customer_id,contract_item_logical_error,item_list_logical_error,customer_logical_error,date_logical_error,logical_error,churn
0,0,Not Canceled,In Process,OTRS GOLD,10,Contracts Managed OTRS,OTRS GOLD,2023-12-06 23:00:00,2022-12-06 23:00:00,Active,...,17495.0,Euro,,CFHbGks3,False,False,False,False,False,0
1,1,Not Canceled,In Process,OTRS On-Premise PLATINUM,10,Contracts On-Premise OTRS,OTRS On-Premise PLATINUM,2023-12-11 23:00:00,2022-12-11 23:00:00,Active,...,37800.0,Euro,,laVvIOXe,False,False,False,False,False,0
2,2,Not Canceled,In Process,OTRS On-Premise GOLD,10,Contracts On-Premise OTRS,OTRS On-Premise GOLD,2023-12-31 23:00:00,2022-12-31 23:00:00,Active,...,5995.0,Euro,,Yc6VmmVi,False,False,False,False,False,0
3,3,Not Canceled,In Process,OTRS On-Premise GOLD (Testsystem),10,Contracts On-Premise OTRS,OTRS On-Premise GOLD,2023-12-31 23:00:00,2022-12-31 23:00:00,Active,...,5995.0,Euro,,Yc6VmmVi,False,False,False,False,False,0
4,4,Not Canceled,In Process,OTRS On-Premise SILVER,10,Contracts On-Premise OTRS,OTRS On-Premise SILVER,2023-12-31 23:00:00,2022-12-31 23:00:00,Active,...,3995.0,Euro,,Yc6VmmVi,False,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,203,Cancellation Requested,In Process,OTRS Contract - Annual Support ENTRY ADV,20,Contracts On-Premise Old,Old Basic,2024-08-15 22:00:00,2023-08-15 22:00:00,Active,...,4995.0,Euro,2023-06-29 06:36:00,5gw4WtWX,False,False,False,False,False,1
204,204,Not Canceled,Completed,OTRS On-Premise GOLD,10,Contracts On-Premise OTRS,OTRS On-Premise GOLD,2023-01-11 23:00:00,2022-10-11 22:00:00,Expired,...,15047.0,US-Dollar,,U00PI7xM,True,False,False,False,True,1
205,205,Cancellation Requested,In Process,OTRS On-Premise PLATINUM,20,Contracts On-Premise OTRS,OTRS On-Premise PLATINUM,2024-10-16 22:00:00,2023-10-16 22:00:00,Active,...,25995.0,Euro,,Fs3qolwK,False,False,False,False,False,0
206,206,Not Canceled,In Process,OTRS GOLD,40,Contracts Managed OTRS,OTRS GOLD,2024-10-26 22:00:00,2023-10-26 22:00:00,Active,...,15995.0,Euro,,JuFKu1yu,True,True,False,False,True,1


In [27]:
df.to_csv("data/data_with_churn.csv")