# Descriptions
- Prediksi yang dilakukan adalah klasifikasi untuk target churn (1-Yes atau 0-No) dan Regresi untuk target tenure.
- Prediksi dengan klasifikasi menggunakan model machine learning
- Prediksi dengan regresi menggunakan model machine learning

# Import modules

In [40]:
import pandas as pd
import sys
import numpy as np
from sklearn import svm
from sklearn.model_selection import cross_val_score, cross_val_predict

# Classification metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report, precision_recall_curve
                            
# Regression metrics
from sklearn.metrics import max_error, mean_absolute_error, median_absolute_error, r2_score

# Initialize and load dataset
Terdapat tiga dataset, antara lain:
- EDA_df, yaitu dataset yang merupakan hasil dari eksplorasi terhadap fitur-fitur, yang telah bersih dari outlier, missing, duplikat, dan mismatch.
- feature_engineering_telco_customer_churn, yaitu dataset yang merupakan hasil dari feature engineering yang meliputi feature transformation, feature extraction, dan feature selection.
- PCA_telco_customer_churn.csv, yaitu dataset hasil reduksi fitur menggunakan metode PCA (Principal Component Analysis).

## EDA

In [2]:
# Load the EDA-stage train and test splits from their relative CSV paths.
EDA_train_df, EDA_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train/EDA_train.csv', '../datasets/test/EDA_test.csv')
)

## Feature Engineering

In [3]:
# Load the feature-engineering train and test splits from their relative CSV paths.
feature_engineering_train_df, feature_engineering_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/train/feature_engineering_train.csv',
        '../datasets/test/feature_engineering_test.csv',
    )
)

## PCA

In [4]:
# Load the PCA train and test splits from their relative CSV paths.
PCA_train_df, PCA_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train/PCA_train.csv', '../datasets/test/PCA_test.csv')
)

# Build model

## Support Vector Classifier

In [5]:
# Lift the column display limit so the preview shows every column.
pd.options.display.max_columns = None

# Preview the first five rows of the EDA training split.
EDA_train_df.head(n=5)

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,paperless_billing,payment_method,monthly_charges,total_charges,churn
0,4223-BKEOR,0,0,0,0,21,0,0,0,1,0,1,0,0,1,1,0,3,64.85,1361.85,0
1,6035-RIIOM,0,0,0,0,54,0,1,1,0,1,0,0,1,1,2,1,0,97.2,5248.8,0
2,3797-VTIDR,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,2,23.45,23.45,1
3,2568-BRGYX,1,0,0,0,4,0,0,1,0,0,0,0,0,0,0,1,2,70.2,280.8,1
4,2775-SEFEE,1,0,0,0,0,0,1,0,1,1,0,1,0,0,2,1,0,61.9,0.0,0


### Target "churn"

#### Predicted target

In [6]:
# Predictor columns for the churn classifier, shared by the train and test
# splits so both are guaranteed to use the same features in the same order.
# NOTE(review): 'online_backup' is present in the dataframe preview but is not
# selected here -- confirm the omission is intentional.
churn_feature_columns = [
    'gender', 'senior_citizen', 'partner', 'dependents', 'tenure',
    'phone_service', 'multiple_lines', 'internet_service', 'online_security',
    'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
    'contract', 'paperless_billing', 'payment_method', 'monthly_charges',
    'total_charges',
]

# Training split: features and the 'churn' label.
X_train_churn = EDA_train_df[churn_feature_columns]
y_train_churn = EDA_train_df['churn']

# Test split: same feature columns and label.
X_test_churn = EDA_test_df[churn_feature_columns]
y_test_churn = EDA_test_df['churn']

In [33]:
# Raise the summarisation threshold to the largest int so NumPy prints arrays
# in full instead of eliding the middle with "...".
np.set_printoptions(threshold=sys.maxsize)

In [8]:
# Fit a support vector classifier on the churn training split (fit() returns
# the fitted estimator, so construction and fitting are chained).
# NOTE(review): the features are passed unscaled; the preview shows 0/1 flags
# next to 'total_charges' values in the thousands. scikit-learn recommends
# scaling inputs for SVMs -- consider a StandardScaler pipeline.
svm_clf = svm.SVC(gamma='auto').fit(X_train_churn, y_train_churn)

# Predicted churn labels for the same rows the model was trained on.
svm_clf.predict(X_train_churn)

array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

#### Actual target

In [9]:
# Actual churn labels of the training split as a flat NumPy array, for
# side-by-side comparison with the predictions.
np.asarray(y_train_churn).ravel()

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,

#### Confusion Matrix

In [10]:
# Out-of-fold predictions from 3-fold cross-validation on the training split.
y_train_churn_pred = cross_val_predict(svm_clf, X_train_churn, np.ravel(y_train_churn), cv=3)

# Build the confusion matrix once and index it, instead of recomputing it for
# every printed cell. Rows are actual labels, columns are predicted labels.
churn_confusion = confusion_matrix(np.ravel(y_train_churn), y_train_churn_pred)
print(f'True positive  : {churn_confusion[1, 1]}')
print(f'True negative  : {churn_confusion[0, 0]}')
print(f'False positive : {churn_confusion[0, 1]}')
print(f'False negative : {churn_confusion[1, 0]}')

True positive  : 375
True negative  : 3944
False positive : 194
False negative : 1121


#### Accuracy Score

In [11]:
# Accuracy of the fitted classifier on the held-out test split.
y_test_churn_pred = svm_clf.predict(X_test_churn)
accuracy_score(y_test_churn, y_test_churn_pred)

0.7771469127040455

#### Cross Validation Score

In [12]:
# Per-fold accuracy from 3-fold cross-validation on the training split.
cross_val_score(
    estimator=svm_clf,
    X=X_train_churn,
    y=np.ravel(y_train_churn),
    scoring="accuracy",
    cv=3,
)

array([0.76677316, 0.76783813, 0.76517572])

#### Precision

In [13]:
# Out-of-fold predictions from 3-fold cross-validation on the training split.
y_train_pred = cross_val_predict(svm_clf, X_train_churn, np.ravel(y_train_churn), cv=3)

# Score the predictions computed in this cell, so the expensive
# cross-validation above is actually used and the cell does not rely on
# a prediction variable left over from an earlier cell.
# 'weighted' averages the per-class precision by class support.
precision_score(np.ravel(y_train_churn), y_train_pred, average='weighted')

0.7469127591302924

#### Recall

In [14]:
# Support-weighted recall of the cross-validated training predictions
# (y_train_churn_pred is produced by the confusion-matrix cell).
recall_score(
    y_true=np.ravel(y_train_churn),
    y_pred=y_train_churn_pred,
    average='weighted',
)

0.7665956691515797

#### F-1 Score

In [15]:
# Support-weighted F1 score of the cross-validated training predictions
# (y_train_churn_pred is produced by the confusion-matrix cell).
f1_score(
    y_true=np.ravel(y_train_churn),
    y_pred=y_train_churn_pred,
    average='weighted',
)

0.725962030491707

#### Classification Report

In [16]:
# Per-class precision/recall/F1 of the fitted classifier on the test split.
churn_test_report = classification_report(y_test_churn, svm_clf.predict(X_test_churn))
print(churn_test_report)

              precision    recall  f1-score   support

           0       0.80      0.94      0.86      1036
           1       0.66      0.33      0.44       373

    accuracy                           0.78      1409
   macro avg       0.73      0.63      0.65      1409
weighted avg       0.76      0.78      0.75      1409



### Target "tenure"

#### Predicted target

In [17]:
# Predictor columns for the tenure regressor, shared by the train and test
# splits so both use the same features in the same order.
# NOTE(review): in the previewed rows total_charges equals
# tenure * monthly_charges, so the target is nearly derivable from two of
# these features -- confirm that this is intended.
# NOTE(review): 'online_backup' is present in the dataframe preview but is not
# selected here -- confirm the omission is intentional.
tenure_feature_columns = [
    'gender', 'senior_citizen', 'partner', 'dependents', 'phone_service',
    'multiple_lines', 'internet_service', 'online_security',
    'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies',
    'contract', 'paperless_billing', 'payment_method', 'monthly_charges',
    'total_charges', 'churn',
]

# Training split: features and the 'tenure' target.
X_train_tenure = EDA_train_df[tenure_feature_columns]
y_train_tenure = EDA_train_df['tenure']

# Test split: same feature columns and target.
X_test_tenure = EDA_test_df[tenure_feature_columns]
y_test_tenure = EDA_test_df['tenure']

In [30]:
# Fit a support vector regressor on the tenure training split (fit() returns
# the fitted estimator, so construction and fitting are chained).
# NOTE(review): the features are passed unscaled; scikit-learn recommends
# scaling inputs for SVMs -- consider a StandardScaler pipeline.
svm_regr = svm.SVR(gamma='auto').fit(X_train_tenure, y_train_tenure)

# Predicted tenure for the same rows the model was trained on.
svm_regr.predict(X_train_tenure)

array([29.00250895, 31.38280047,  5.48586803, 24.05982495, 29.25196946,
       24.26256612, 31.28409059, 35.06537367, 28.24325117, 20.56137827,
       29.34835908, 31.37388269, 31.78140229, 31.39439207, 31.56315997,
       31.49547877, 31.53273997, 32.08572794, 35.79360139, 31.39628812,
       25.57988292, 34.00402033, 31.46283705, 31.85737399, 31.49569423,
       27.17747429, 29.36513322, 28.38479027, 31.75522866, 31.36734274,
       24.03753989, 29.23037338, 29.23470218, 25.26638675, 31.43084464,
       32.23402005, 31.36527596, 30.90001885, 32.02721535, 31.38960985,
       33.28429854, 33.6719313 ,  4.48842547, 31.82808977, 29.28537224,
       31.37764626,  1.96379699,  1.31113488, 31.36524942, 31.57909915,
       29.17219642, 28.80241743, 31.37382453, 28.93689199, 28.75839773,
       32.25985521, 31.64167773, 13.66328438,  1.18651154, 31.52368783,
       31.61173962, 29.36482661, 28.2378195 , 28.66782427, 27.01886595,
       31.3656925 , 29.31870878, 13.1494573 , 30.9000722 , 25.91

#### Max Error

In [42]:
# Largest absolute residual of the regressor on the training split.
# NOTE(review): this scores the data the model was fitted on; the tenure test
# split is not used by any visible metric cell -- consider scoring
# X_test_tenure / y_test_tenure.
tenure_train_pred = svm_regr.predict(X_train_tenure)
max_error(y_train_tenure, tenure_train_pred)

40.63476785599201

#### Mean Absolute Error

In [43]:
# Mean absolute residual of the regressor, measured on the training split.
mean_absolute_error(
    y_true=y_train_tenure,
    y_pred=svm_regr.predict(X_train_tenure),
)

17.02765177491923

#### Median Absolute Error

In [44]:
# Median absolute residual of the regressor, measured on the training split.
median_absolute_error(
    y_true=y_train_tenure,
    y_pred=svm_regr.predict(X_train_tenure),
)

16.146258569831563

#### R2 Score

In [45]:
# Coefficient of determination (R^2) of the regressor, measured on the
# training split.
r2_score(
    y_true=y_train_tenure,
    y_pred=svm_regr.predict(X_train_tenure),
)

0.27074428723687805

## Decision Tree

## Random Forest