In [14]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [15]:
df = pd.read_csv('../data/cancaled_db.csv')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881659 entries, 0 to 881658
Data columns (total 11 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   age                       881659 non-null  float64
 1   gender                    881659 non-null  object 
 2   time_as_client            881659 non-null  float64
 3   use_frequency             881659 non-null  float64
 4   callcenter_calls          881659 non-null  float64
 5   days_of_delay             881659 non-null  float64
 6   plan                      881659 non-null  object 
 7   contract_duration         881659 non-null  object 
 8   total_spent               881659 non-null  float64
 9   last_interaction_monthly  881659 non-null  float64
 10  canceled                  881659 non-null  float64
dtypes: float64(8), object(3)
memory usage: 74.0+ MB


In [17]:
df.head()

Unnamed: 0,age,gender,time_as_client,use_frequency,callcenter_calls,days_of_delay,plan,contract_duration,total_spent,last_interaction_monthly,canceled
0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [18]:
def categorical_to_binary(column):

    category_array = df[column].unique()

    # Making an array with the exact number of categories
    binary_array = list(range(len(df[column].unique())))

    # Making a loop to pair each category with its binary representative
    categorical_binary_dict = {}

    for index in binary_array:
        categorical_binary_dict.update({category_array[index] : binary_array[index]})

    return categorical_binary_dict


In [19]:
for column in df:

    if df[column].dtype == 'object':

        dict = categorical_to_binary(column)

        df[column] = df[column].map(dict)

In [20]:
df.head()

Unnamed: 0,age,gender,time_as_client,use_frequency,callcenter_calls,days_of_delay,plan,contract_duration,total_spent,last_interaction_monthly,canceled
0,30.0,0,39.0,14.0,5.0,18.0,0,0,932.0,17.0,1.0
1,65.0,0,49.0,1.0,10.0,8.0,1,1,557.0,6.0,1.0
2,55.0,0,14.0,4.0,6.0,18.0,1,2,185.0,3.0,1.0
3,58.0,1,38.0,21.0,7.0,7.0,0,1,396.0,29.0,1.0
4,23.0,1,32.0,20.0,5.0,8.0,1,1,617.0,20.0,1.0


# Random forest

In [21]:
columns_to_x = df.columns[0:-1]

In [22]:
X = df[columns_to_x]
y = df['canceled']

Defining train and test varibles

In [23]:
seed = 42

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

Creating the model

In [25]:
model = RandomForestClassifier(random_state=seed)
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)

In [29]:
accuracy = accuracy_score(y_test, y_pred)

In [37]:
print(accuracy)

0.9999432887961346


In [39]:
matrix = confusion_matrix(y_test, y_pred)

In [40]:
tp = matrix[0][0]
tn = matrix[0][1]
fp = matrix[1][0]
fn = matrix[1][1]

In [41]:
print(f'True positives: {tp}')
print(f'True negatives: {tn}')
print(f'False positives: {fp}')
print(f'False negatives: {fn}')

True positives: 76274
True negatives: 0
False positives: 10
False negatives: 100048
