In [33]:
import pandas as pd
from sklearn.ensemble import IsolationForest as IF
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [34]:
data = pd.read_csv('../data/churn.csv')
data.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [35]:
data.dtypes

State              object
Account Length      int64
Area Code           int64
Phone              object
Int'l Plan         object
VMail Plan         object
VMail Message       int64
Day Mins          float64
Day Calls           int64
Day Charge        float64
Eve Mins          float64
Eve Calls           int64
Eve Charge        float64
Night Mins        float64
Night Calls         int64
Night Charge      float64
Intl Mins         float64
Intl Calls          int64
Intl Charge       float64
CustServ Calls      int64
Churn?             object
dtype: object

In [36]:
data.isna().sum()

State             0
Account Length    0
Area Code         0
Phone             0
Int'l Plan        0
VMail Plan        0
VMail Message     0
Day Mins          0
Day Calls         0
Day Charge        0
Eve Mins          0
Eve Calls         0
Eve Charge        0
Night Mins        0
Night Calls       0
Night Charge      0
Intl Mins         0
Intl Calls        0
Intl Charge       0
CustServ Calls    0
Churn?            0
dtype: int64

In [37]:
data.drop(['State','Phone'],axis=1,inplace=True)

In [38]:
data.nunique()

Account Length     212
Area Code            3
Int'l Plan           2
VMail Plan           2
VMail Message       46
Day Mins          1667
Day Calls          119
Day Charge        1667
Eve Mins          1611
Eve Calls          123
Eve Charge        1440
Night Mins        1591
Night Calls        120
Night Charge       933
Intl Mins          162
Intl Calls          21
Intl Charge        162
CustServ Calls      10
Churn?               2
dtype: int64

In [39]:
data['Churn?'].value_counts()

False.    2850
True.      483
Name: Churn?, dtype: int64

In [40]:
bin_cols = data.nunique()[data.nunique()==2].keys().tolist()
num_cols = []
for col_name in data.columns:
    if data[col_name].dtypes != 'object':
        num_cols.append(col_name)

le = LabelEncoder()
data_bin = data[bin_cols]
for col in bin_cols:
    data_bin[col] = le.fit_transform(data_bin[col])
    
# scaling numerical features
ss = StandardScaler()
data_num = ss.fit_transform(data[num_cols])
data_num = pd.DataFrame(data_num,columns=num_cols)

new_data = pd.concat([data_bin,data_num],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [41]:
new_data['Churn?'].value_counts()

0    2850
1     483
Name: Churn?, dtype: int64

In [42]:
new_data['Churn?']=[1 if x == 0 else -1 for x in new_data['Churn?'] ]
# 1 for inliers, -1 for outliers.

In [43]:
new_data['Churn?'].value_counts()

 1    2850
-1     483
Name: Churn?, dtype: int64

In [44]:
X = new_data.drop('Churn?',axis=1)
y = new_data['Churn?']

In [48]:
y_pred = IF().fit_predict(X)

In [49]:
print("Confusion_Matrix:")
print(str(confusion_matrix(y, y_pred)))
print(classification_report(y, y_pred))
print("Accuracy:" + str(accuracy_score(y, y_pred)))
print("AUC_ROC:" + str(roc_auc_score(y, y_pred)))
# worse result than supervised solution
# maybe needs more feature engineering
# maybe the dataset is not so imbalanced

Confusion_Matrix:
[[ 133  350]
 [ 352 2498]]
              precision    recall  f1-score   support

          -1       0.27      0.28      0.27       483
           1       0.88      0.88      0.88      2850

    accuracy                           0.79      3333
   macro avg       0.58      0.58      0.58      3333
weighted avg       0.79      0.79      0.79      3333

Accuracy:0.7893789378937894
AUC_ROC:0.5759267734553776


In [None]:
# reference:
# https://www.jianshu.com/p/5af3c66e0410
# https://www.cnblogs.com/bonelee/p/7776711.html

+ Many people use the terms anomaly detection and novelty detection interchangeably, but they are not exactly the same. In anomaly detection, the algorithm is trained on a dataset that may contain outliers, and the goal is typically to identify these outliers (within the training set), as well as outliers among new instances. In novelty detection, the algorithm is trained on a dataset that is presumed to be “clean,” and the objective is to detect novelties strictly among new instances. Some algorithms work best for anomaly detection (e.g., Isolation Forest), while others are better suited for novelty detection (e.g., one-class SVM).