# CHURN ANALYSIS

In [113]:
import pandas as pd
import os

DATA_PATH = '../data'
FAKE_FILENAME = 'exchanges_ML.csv'
exchanges_ML = pd.read_csv(os.path.join(DATA_PATH, FAKE_FILENAME))

In [115]:
exchanges_ML.head()


Unnamed: 0.1,Unnamed: 0,user_id,nb_year,has_renewed,is_new_user,referral,promotion,payment3x,payement2,payment3,...,exchanges_host,nb_guests_host,nights_host,exchange_type_host,home_host,residence_host,capacity_host,diff_capacity_host,finalized_host,book_diff_host
0,0,1,1,0,0,0,0,0,0,0,...,32.0,4.0,6.0,NON_RECIPROCAL,Home,primary,8.0,4.0,2.0,56.0
1,1,9,2,1,0,0,0,0,0,0,...,380.0,4.0,5.0,NON_RECIPROCAL,apartment,primary,7.0,3.0,8.0,43.0
2,2,10,1,1,1,0,0,0,0,0,...,53.0,3.0,11.0,NON_RECIPROCAL,apartment,primary,12.0,9.0,0.0,121.0
3,3,40,1,1,0,0,0,0,0,0,...,4.0,3.0,5.0,NON_RECIPROCAL,Home,primary,5.0,2.0,0.0,104.0
4,4,67,1,1,1,0,0,0,0,0,...,53.0,3.0,8.0,NON_RECIPROCAL,apartment,primary,6.0,3.0,1.0,144.0


In [117]:
exchanges_ML.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70726 entries, 0 to 70725
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           70726 non-null  int64  
 1   user_id              70726 non-null  int64  
 2   nb_year              70726 non-null  int64  
 3   has_renewed          70726 non-null  int64  
 4   is_new_user          70726 non-null  int64  
 5   referral             70726 non-null  int64  
 6   promotion            70726 non-null  int64  
 7   payment3x            70726 non-null  int64  
 8   payement2            70726 non-null  int64  
 9   payment3             70726 non-null  int64  
 10  abscence_year        70726 non-null  int64  
 11  exchanges_guest      70726 non-null  float64
 12  nb_guests_guest      70726 non-null  float64
 13  nights_guest         70726 non-null  float64
 14  exchange_type_guest  70726 non-null  object 
 15  home_guest           70726 non-null 

In [119]:
exchanges_ML.isnull().sum()

Unnamed: 0             0
user_id                0
nb_year                0
has_renewed            0
is_new_user            0
referral               0
promotion              0
payment3x              0
payement2              0
payment3               0
abscence_year          0
exchanges_guest        0
nb_guests_guest        0
nights_guest           0
exchange_type_guest    0
home_guest             0
residence_guest        0
capacity_guest         0
diff_capacity_guest    0
finalized_guest        0
book_diff_guest        0
exchanges_host         0
nb_guests_host         0
nights_host            0
exchange_type_host     0
home_host              0
residence_host         0
capacity_host          0
diff_capacity_host     0
finalized_host         0
book_diff_host         0
dtype: int64

## PREPROCESSING

In [122]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [234]:
exchanges_ML.drop(columns=['Unnamed: 0'], inplace=True)

In [236]:
exchanges_ML.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70726 entries, 0 to 70725
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user_id              70726 non-null  int64  
 1   nb_year              70726 non-null  int64  
 2   has_renewed          70726 non-null  int64  
 3   is_new_user          70726 non-null  int64  
 4   referral             70726 non-null  int64  
 5   promotion            70726 non-null  int64  
 6   payment3x            70726 non-null  int64  
 7   payement2            70726 non-null  int64  
 8   payment3             70726 non-null  int64  
 9   abscence_year        70726 non-null  int64  
 10  exchanges_guest      70726 non-null  float64
 11  nb_guests_guest      70726 non-null  float64
 12  nights_guest         70726 non-null  float64
 13  exchange_type_guest  70726 non-null  object 
 14  home_guest           70726 non-null  object 
 15  residence_guest      70726 non-null 

In [238]:
target = 'has_renewed'

In [240]:
features = [
    'nb_year', 'has_renewed', 'is_new_user', 'referral', 'promotion', 
    'payment3x', 'payement2', 'payment3', 'abscence_year', 'exchanges_guest', 
    'nb_guests_guest', 'nights_guest', 'exchange_type_guest', 'home_guest', 
    'residence_guest', 'capacity_guest', 'diff_capacity_guest', 'finalized_guest', 
    'book_diff_guest', 'exchanges_host', 'nb_guests_host', 'nights_host', 
    'exchange_type_host', 'home_host', 'residence_host', 'capacity_host', 
    'diff_capacity_host', 'finalized_host', 'book_diff_host']

In [242]:
X = exchanges_ML[features]
y = exchanges_ML[target]

In [244]:
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(exclude=['object']).columns

In [246]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [248]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [250]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [254]:
if y.dtype == 'object':
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

In [256]:
y_train

51138    0
63498    1
66048    1
8250     0
66559    1
        ..
49100    1
20609    1
21440    1
50057    1
5192     0
Name: has_renewed, Length: 56580, dtype: int64

## TRAINING

In [264]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)


In [266]:
y_pred = model.predict(X_test)

In [268]:
accuracy_test = model.score(X_test, y_test)

print(f"test data accuracy = {(accuracy_test)}")

test data accuracy = 1.0


In [270]:
accuracy_train = model.score(X_train, y_train)

In [272]:
accuracy_train

1.0

In [214]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [216]:
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## EVALUATE THE MODEL

In [219]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5470
           1       1.00      1.00      1.00      8676

    accuracy                           1.00     14146
   macro avg       1.00      1.00      1.00     14146
weighted avg       1.00      1.00      1.00     14146

[[5470    0]
 [   0 8676]]


In [220]:
import seaborn as sns
import matplotlib.pyplot as plt

In [221]:
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(report_df)

              precision  recall  f1-score  support
0                   1.0     1.0       1.0   5470.0
1                   1.0     1.0       1.0   8676.0
accuracy            1.0     1.0       1.0      1.0
macro avg           1.0     1.0       1.0  14146.0
weighted avg        1.0     1.0       1.0  14146.0
