In [1]:
import numpy as np  
import pandas as pd
from sklearn.ensemble import IsolationForest  

In [2]:
# Size of the synthetic training set and the fraction of it that is anomalous.
n_samples = 10000
outliers_fraction = 0.25
# Truncate only once and derive the inlier count by subtraction, so that the
# two counts always add up to n_samples even when outliers_fraction * n_samples
# is not a whole number.
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

In [3]:
# Seeded generator so the synthetic data is identical on every run.
rng = np.random.RandomState(123)
# One 2-D Gaussian blob (standard normal scaled by 0.3) with n_inliers // 2
# rows; floor division keeps the row count an integer.
half_inliers = n_inliers // 2
X = rng.randn(half_inliers, 2) * 0.3

In [None]:
# Inliers: two copies of the blob, one shifted by +2 and one by -2 on both axes.
inlier_points = np.concatenate([X + 2, X - 2], axis=0)
# Outliers: points drawn uniformly from [-6, 6) on each axis.
outliers = rng.uniform(low=-6, high=6, size=(n_outliers, 2))

# Training set: inlier rows first, followed by the outlier rows.
X_train = np.concatenate([inlier_points, outliers], axis=0)

array([[ 1.67431082,  2.29920363],
       [ 2.08489355,  1.54811159],
       [ 1.82641992,  2.49543096],
       ...,
       [-4.33496129,  3.65191256],
       [ 1.26733042, -1.55240204],
       [-3.73895569, -5.78858711]])

In [7]:


# Fit an Isolation Forest on the training set.
# The `behaviour` argument is deliberately not passed: it was deprecated in
# scikit-learn 0.22 (where "new" became the only behaviour) and removed in
# 0.24, so passing it raises a TypeError on current releases.
clf = IsolationForest(contamination=outliers_fraction, random_state=2018, n_jobs=-1)
# fit_predict returns 1 for samples judged normal and -1 for samples judged anomalous.
y_pred_train = clf.fit_predict(X_train)
# Vectorised label mapping: 1 -> '正常' (normal), anything else -> '异常' (anomaly).
pred = np.where(y_pred_train == 1, '正常', '异常')

# decision_function score: the further below 0, the more likely the sample is an anomaly.
scores_pred = clf.decision_function(X_train)
dict_ = {'anomaly_score': scores_pred, 'y_pred': y_pred_train, 'result': pred}
scores = pd.DataFrame(dict_)
# Seed the preview so the displayed rows are the same on every run.
scores.sample(5, random_state=2018)

Unnamed: 0,anomaly_score,y_pred,result
9686,-0.245039,-1,异常
1976,0.074305,1,正常
5050,0.060877,1,正常
1789,0.020527,1,正常
4153,0.081342,1,正常
