# Imports



In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
!pip install openml;



In [None]:
%%capture 
!pip install scikit-fuzzy

In [None]:
import math
import pandas as pd
import openml
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import numpy as np
from typing import List

In [None]:
import skfuzzy as fuzz
import numpy as np
from skfuzzy import control as ctrl

In [None]:
import abc

#  New Implementation
 

## abstract class

In [None]:
class Fuzzifier():
  """
  Fuzzifier class: given a DataFrame of crisp data, build the corresponding
  fuzzified data.

  Every column of `x` becomes a scikit-fuzzy `ctrl.Antecedent` whose universe
  is the column itself, so each membership function is evaluated at exactly
  the observed data points (one membership degree per row).
  ----------
  Parameters
  ----------
  x : dataframe-like
      Universe variables. Contains the features data (must be numerical).
      Required. The object is stored by reference, not copied.
  modalities : dict, optional
      Maps a feature name to its modality specification. A specification is
      either a dict of modality name -> {membership function name: parameters}
      or {'auto': n} to generate n membership functions automatically.
      Features that are not listed get 3 automatic membership functions.
      example:  {'age': {'old': {'trimf': [0,1,2]}, 'young': {'gaussmf': [0,1]}}}
      example2: {'age': {'auto' : 3}}
  Methods
  -------
  fuzzify()                 build one fuzzy variable per column of `x`
  toDataFrame()             membership degrees as a '<feature>;<modality>' table
  interpolate_new_entry(xx) membership degrees of rows that were not in `x`
  view()                    plot the membership functions of every variable
  """
  def __init__(self, x, modalities = None):
    self.x = x
    # A fresh dict per instance: a literal `{}` default would be one object
    # shared by every Fuzzifier created without an explicit argument.
    self.modalities = {} if modalities is None else modalities
    self.variables = OrderedDict()
    self.df = pd.DataFrame()


  def __getitem__(self, key):
    """
    Calling `fuzzifier['label']` will return the 'label' fuzzy variable.

    Raises ValueError (listing the available names) when no such variable
    has been created by `fuzzify`.
    """
    try:
      return self.variables[key]
    except KeyError:
      # Build a list of available fuzzy variables for an informative message
      options = '[' + ''.join("'" + str(name) + "'," for name in self.variables) + ']'
      raise ValueError("Fuzzy Variable '{0}' does not exist.\n "
                       "Available options: {1}".format(key, options)) from None


  def fuzzify(self):
    """
    The fuzzifying procedure, takes no params.

    Creates one fuzzy variable per column of `self.x`, stores it in
    `self.variables` and invalidates the DataFrame cached by `toDataFrame`.
    """
    # TODO: verify params passed and raise errors if any problem

    for variable_name in self.x.columns:
      # The column's own values are used as the variable's universe
      fuzzy_var = ctrl.Antecedent(self.x[variable_name], variable_name)

      if variable_name in self.modalities:
        # User-specified modalities for this feature
        self.__fuzzify_numerical_in_modalities(variable_name, fuzzy_var)
      else:
        # Nothing specified for this feature: 3 automatic membership functions
        fuzzy_var.automf(3)

      self.variables[variable_name] = fuzzy_var

    # If fuzzify is called again, toDataFrame must rebuild its result
    self.df = pd.DataFrame()


  def __fuzzify_numerical_in_modalities(self, variable_name, fuzzyVar):
    """
    Attach to `fuzzyVar` the membership functions described by
    `self.modalities[variable_name]`.

    Raises ValueError when the specification is empty.
    """
    modalities = self.modalities[variable_name]

    first_key = next(iter(modalities), None)
    if first_key is None:
      raise ValueError(
          "No modalities given for variable '{0}'.".format(variable_name))

    # {'auto': n}: generate n membership functions automatically
    if first_key == 'auto':
      fuzzyVar.automf(modalities['auto'])
      return

    # Otherwise each item is (modality name, {membership function: parameters})
    for modality_name, membership_data in modalities.items():
      fuzzyVar[modality_name] = self.__getMembership(fuzzyVar.universe, membership_data)


  def __getMembership(self, x, membershipFunctionData):
    """
    Evaluate the membership function described by `membershipFunctionData`
    (a one-entry mapping {name: parameters}) over the universe `x`.

    Supported names are 'trimf', 'trapmf' and 'gaussmf' (parameters: mean,
    sigma). Raises ValueError for any other name instead of returning None,
    so a misspelt specification is reported where it is read.
    """
    membership_name = next(iter(membershipFunctionData))

    if membership_name not in ('trimf', 'trapmf', 'gaussmf'):
      raise ValueError(
          "Unsupported membership function '{0}'. "
          "Expected one of: 'trimf', 'trapmf', 'gaussmf'.".format(membership_name))

    params = membershipFunctionData[membership_name]
    if membership_name == 'trimf':
      return fuzz.trimf(x, params)
    if membership_name == 'trapmf':
      return fuzz.trapmf(x, params)
    return fuzz.gaussmf(x, params[0], params[1])


  def toDataFrame(self):
    """
    After fuzzification, return a DataFrame of membership degrees with one
    column per (variable, modality) pair, named '<variable>;<modality>', and
    the same index as `self.x`. The result is cached until `fuzzify` is
    called again.
    """
    # Only generate the table once
    if not self.df.empty:
      return self.df

    columns = OrderedDict()
    for fuzzyVar_name, fuzzy_var in self.variables.items():
      for modality, term in fuzzy_var.terms.items():
        columns['{0};{1}'.format(fuzzyVar_name, modality)] = term.mf

    # Built in one go rather than by inserting the columns one at a time
    self.df = pd.DataFrame(columns, index = self.x.index)
    return self.df


  def interpolate_new_entry(self, xx):
    """
    Membership degrees for rows that were not part of the fitted data.

    For every fuzzified variable the membership function known at the
    observed values of `self.x` is linearly interpolated at the values found
    in `xx` (a DataFrame holding at least the fuzzified columns). Values
    outside the observed range take the membership degree of the nearest
    observed extreme. The result has the index of `xx` and the same
    '<variable>;<modality>' columns as `toDataFrame`.
    """
    new_df = pd.DataFrame({}, index = xx.index)
    for fuzzyVar_name, fuzzy_var in self.variables.items():
      universe = np.asarray(self.x.loc[:, fuzzyVar_name], dtype = float)
      # The universe is the raw data column, hence unsorted, while np.interp
      # requires increasing sample points.
      order = np.argsort(universe, kind = 'stable')
      sorted_universe = universe[order]
      queries = np.asarray(xx.loc[:, fuzzyVar_name], dtype = float)

      for modality, term in fuzzy_var.terms.items():
        membership = np.asarray(term.mf, dtype = float)[order]
        new_df['{0};{1}'.format(fuzzyVar_name, modality)] = np.interp(
            queries, sorted_universe, membership)
    return new_df


  def view(self):
    """
    Show the membership functions of each variable, one figure per variable.
    """
    for fuzzyVar_name, fuzzy_var in self.variables.items():
      _fig, ax = plt.subplots()
      labels = []

      for modality in fuzzy_var.terms:
        labels.append(modality)
        sns.lineplot(ax = ax, x = fuzzy_var.universe, y = fuzzy_var[modality].mf)

      ax.legend(labels = labels)
      plt.title(fuzzyVar_name)
      plt.show()

In [None]:
class Node:
    """
    One vertex of an isolation tree.

    Attributes
    ----------
    value : payload of the node (the forest classes in this file store the
        sample subset that reached the node)
    children : list of child nodes, or None
    is_leaf : True when the node has no split
    split_value : threshold used to split at this node, or None
    split_attribut : index of the attribute used to split, or None
    """

    def __init__(self, value, children=None,is_leaf=False,split_value=None,split_attribut=None):
      # Payload and kind first, then the split description, then the links.
      self.value = value
      self.is_leaf = is_leaf
      self.split_attribut, self.split_value = split_attribut, split_value
      self.children = children

In [None]:
class ICustomIForest:
  """
  ICustomIForest abstract class: interface used to generate an Isolation Forest model.

  Subclasses provide the splitting and scoring hooks (`_random_split`,
  `_cost`, `_path_length`, `_anomalie_score`, `score_samples`); this class
  provides tree/forest construction and thresholded prediction.
  ----------
  Parameters
  ----------
  number_of_trees : int
      number of trees generated in the forest
  sample_size : int
      number of rows drawn (without replacement) to grow each tree;
      default 256. Capped at the number of training rows.
  contamination : float
      expected proportion of outliers, in [0, 1]; default 0.1. `predict` uses
      it to place the decision threshold at the (1 - contamination)
      percentile of the scores of the set being predicted.
  Methods
  -------
  fit(set)                 grow the forest and return it
  score_samples(test_set)  anomaly score per row (implemented by subclasses)
  predict(test_set)        0 (inlier) / 1 (outlier) per row
  """

  def __init__(self, number_of_trees: int,sample_size: int = 256,contamination: float = 0.1 ):
    self.number_of_trees = number_of_trees
    self.sample_size = sample_size
    self.forest: List["Node"] = None
    self.contamination = contamination
    self.threshold = None

  def fit(self,set):
    """Grow the forest from the 2-D array `set`, store it and return it."""
    self.forest = self._iForest(set)
    return self.forest

  def _generate_threshold(self,scores):
    """Store the (1 - contamination) percentile of `scores` as the threshold."""
    self.threshold = np.percentile(scores,(1-self.contamination)*100)

  # Training stage
  @staticmethod
  def _random_split(sample_set):
    """
    Hook: split `sample_set` at random.

    Must return `(partitions, split_value, split_attribut)`, with
    `partitions` set to None when no split is possible.
    """
    raise NotImplementedError("_random_split must be implemented by a subclass")

  def _iTree(self,sample_set,current_height: int,height_limit):
    """
    Recursively grow one isolation tree and return its root node.

    A leaf is produced when the height limit is reached, when at most one
    row is left, or when `_random_split` reports that no split is possible.
    """
    if current_height >= height_limit or sample_set.shape[0] <= 1:
      return Node(sample_set,is_leaf= True)

    partitions,random_split_value,random_attribut = type(self)._random_split(sample_set)
    if partitions is None:
      return Node(sample_set,is_leaf= True)

    node = Node(sample_set,is_leaf= False,split_value=random_split_value,split_attribut=random_attribut)
    node.children = [
        self._iTree(partition,current_height+1,height_limit)
        for partition in partitions
    ]
    return node

  def _iForest(self,training_set)-> List["Node"] :
    """
    Grow `number_of_trees` trees, each on its own random sub-sample of rows.

    Raises ValueError when `training_set` has no rows.
    """
    row_count = training_set.shape[0]
    if row_count == 0:
      raise ValueError("Cannot grow an isolation forest from an empty training set.")

    # Sampling without replacement cannot draw more rows than exist.
    effective_size = min(self.sample_size, row_count)
    # log2 is exact for powers of two, unlike log(x, 2).
    height_limit = math.ceil(math.log2(effective_size))

    forest = []
    for _ in range(self.number_of_trees):
      picked_rows = np.random.choice(row_count, size=effective_size, replace=False)
      forest.append(self._iTree(training_set[picked_rows,:],0,height_limit))
    return forest

  # Evaluation stage
  @staticmethod
  def _cost(size: int):
    """Hook: path-length adjustment for a leaf holding `size` rows."""
    raise NotImplementedError("_cost must be implemented by a subclass")

  @staticmethod
  def _path_length(instance,tree: "Node",current_path_length):
    """Hook: path length of `instance` in `tree`."""
    raise NotImplementedError("_path_length must be implemented by a subclass")

  @staticmethod
  def _anomalie_score(estimated_path,set_size: int):
    """Hook: anomaly score derived from an average path length."""
    raise NotImplementedError("_anomalie_score must be implemented by a subclass")

  def score_samples(self,test_set):
    """Hook: return one anomaly score per row of `test_set`."""
    raise NotImplementedError("score_samples must be implemented by a subclass")

  def predict(self, test_set) :
    """
    Label each row of `test_set`: 0 when its score is at or below the
    threshold, 1 otherwise.

    The threshold is recomputed on every call from the scores of `test_set`
    itself and stored in `self.threshold`.
    """
    scores = np.asarray(self.score_samples(test_set))
    self._generate_threshold(scores)
    return np.where(scores <= self.threshold, 0, 1)

## alpha-cut implementation

### Fuzzifier API (implemented in the scikit-fuzzy notebook)

In [None]:
from collections import OrderedDict
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.api.types import is_string_dtype
from sklearn.preprocessing import OneHotEncoder

In [None]:
class AlphaCutIForestModel(ICustomIForest):
  """
  Isolation forest with binary, axis-parallel random splits.

  Each split draws one attribute at random and a threshold uniformly between
  that attribute's minimum and maximum in the current sample. Anomaly scores
  are 2 ** (-mean path length / c(sample_size)), so shorter paths give
  scores closer to 1.
  """
  def __init__(self, number_of_trees: int,sample_size: int = 256,contamination: float = 0.1 ):
    super().__init__(number_of_trees,sample_size,contamination)

  @staticmethod
  def _random_split(sample_set, splitable_attributs = None):
    """
    Split `sample_set` in two on a randomly chosen attribute.

    Parameters
    ----------
    sample_set : 2-D array, one row per instance
    splitable_attributs : optional list of column indices that may be used;
        defaults to every column. The list is not modified.

    Returns
    -------
    ([left_partition, right_partition], split_value, attribute_index), or
    (None, None, None) when no candidate attribute separates the rows.
    """
    # TODO: implement unsplitable_attributs
    if splitable_attributs is None:
      candidates = list(range(sample_set.shape[1]))
    else:
      # Work on a copy so the caller's list is left untouched
      candidates = list(splitable_attributs)

    while candidates:
      random_attribut = candidates[np.random.choice(len(candidates), size=1, replace=False)[0]]
      column = sample_set[:,random_attribut]
      lowest = np.amin(column)
      highest = np.amax(column)
      random_split_value = np.random.uniform(lowest, highest)
      left_partition = sample_set[np.where(column <= random_split_value)]
      right_partition = sample_set[np.where(column > random_split_value)]
      if left_partition.shape[0] and right_partition.shape[0]:
        return [left_partition, right_partition],random_split_value,random_attribut
      # One side is empty (e.g. a constant column): try another attribute
      candidates.remove(random_attribut)

    return None, None, None

  def fit(self,set):
    """Grow the forest from the 2-D array `set`, store it and return it."""
    return super().fit(set)

  @staticmethod
  def _cost(size: int):
    """
    Path-length adjustment for a leaf holding `size` rows:
    2 * (ln(size - 1) + Euler's constant - (1 - 1/size)), and 0 when the
    leaf holds at most one row.
    """
    if size <= 1:
      return 0
    return 2*(math.log(size-1)+np.euler_gamma-(1-(1/size)))

  @staticmethod
  def _path_length(instance,tree: "Node",current_path_length):
    """
    Number of edges `instance` traverses from `tree` down to a leaf, added
    to `current_path_length`, plus the `_cost` adjustment for the rows
    stored in that leaf.
    """
    node = tree
    depth = current_path_length
    while not node.is_leaf:
      # children[0] holds the rows <= split value, children[1] the others
      branch = 0 if instance[node.split_attribut] <= node.split_value else 1
      node = node.children[branch]
      depth += 1
    return depth + AlphaCutIForestModel._cost(node.value.shape[0])

  @staticmethod
  def _anomalie_score(estimated_path,set_size: int):
    """Return 2 ** (-estimated_path / c(set_size))."""
    return math.pow(2,-(estimated_path/AlphaCutIForestModel._cost(set_size)))

  def score_samples(self,test_set):
    """
    Anomaly score of every row of the 2-D array `test_set`.

    Returns a 1-D array of scores, or None when the model has not been
    fitted (no forest).
    """
    if not self.forest:
      return None

    forest_size = len(self.forest)
    scores = []
    for i in range(test_set.shape[0]):
      instance = test_set[i,:]
      total_path = sum(
          AlphaCutIForestModel._path_length(instance,tree,0)
          for tree in self.forest
      )
      # Normalise by the sub-sample size used to grow each tree, not by the
      # number of trees: c(n) describes a tree built on n rows.
      scores.append(
          AlphaCutIForestModel._anomalie_score(total_path/forest_size,self.sample_size)
      )
    return np.array(scores)

In [None]:
# Load the mulcross dataset (OpenML dataset id 40897) and fuzzify its features.
dataset = openml.datasets.get_dataset(40897)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute
)
mulcross = pd.DataFrame(X, columns=attribute_names)
# No modalities are passed, so every column gets the Fuzzifier's automatic
# membership functions.
fuzzifier = Fuzzifier(mulcross)
fuzzifier.fuzzify()
fuzzy_set = fuzzifier.toDataFrame() 
X_fuzzy = fuzzy_set
# The label column is attached only after fuzzification, so it is not among
# the fuzzified features. NOTE: the Fuzzifier keeps a reference to this same
# DataFrame, so its `x` now carries the "class" column too.
mulcross["class"] = y
#fuzzy_set["class"] = y
mulcross.head()

Unnamed: 0,V1,V2,V3,V4,class
0,-0.20395,0.363011,1.013766,0.187131,0
1,-0.761118,2.436424,0.681846,0.654366,0
2,-0.209979,1.131098,-0.28218,-0.20221,0
3,0.836812,0.650342,-0.4269,-0.305281,0
4,0.454204,1.560128,-0.204841,0.219233,0


In [None]:
# Preview the fuzzified features: one '<feature>;<modality>' column per membership function.
fuzzy_set.head()

Unnamed: 0,V1;poor,V1;average,V1;good,V2;poor,V2;average,V2;good,V3;poor,V3;average,V3;good,V4;poor,V4;average,V4;good
0,0.023579,0.976421,0.0,0.0,0.95401,0.04599,0.0,0.64293,0.35707,0.0,0.85141,0.14859
1,0.146758,0.853242,0.0,0.0,0.48434,0.51566,0.0,0.759904,0.240096,0.0,0.691427,0.308573
2,0.024912,0.975088,0.0,0.0,0.780023,0.219977,0.099642,0.900358,0.0,0.0,0.984722,0.015278
3,0.0,0.793486,0.206514,0.0,0.888924,0.111076,0.150643,0.849357,0.0,0.020014,0.979986,0.0
4,0.0,0.878073,0.121927,0.0,0.682839,0.317161,0.072386,0.927614,0.0,0.0,0.840418,0.159582


In [None]:
#train-test splitting
X_train,X_test, y_train,y_test=train_test_split(X_fuzzy,y,random_state=42,test_size=0.1)
print("shape of X :",X_fuzzy.shape)
print("shape of X_train:", X_train.shape)
print("shape of X_test:", X_test.shape)

shape of X : (262144, 12)
shape of X_train: (235929, 12)
shape of X_test: (26215, 12)


In [None]:
# Display the training split.
X_train

Unnamed: 0,V1;poor,V1;average,V1;good,V2;poor,V2;average,V2;good,V3;poor,V3;average,V3;good,V4;poor,V4;average,V4;good
169183,0.000000,0.835173,0.164827,0.321015,0.678985,0.000000,0.000000,0.063161,0.936839,0.000000,0.082669,0.917331
212989,0.000000,0.788247,0.211753,0.309118,0.690882,0.000000,0.000000,0.110546,0.889454,0.000000,0.114697,0.885303
200814,0.189029,0.810971,0.000000,0.059737,0.940263,0.000000,0.000000,0.941340,0.058660,0.000000,0.948290,0.051710
159603,0.050430,0.949570,0.000000,0.047063,0.952937,0.000000,0.102052,0.897948,0.000000,0.000000,0.783294,0.216705
13691,0.195273,0.804727,0.000000,0.337432,0.662568,0.000000,0.000000,0.857919,0.142081,0.000000,0.913130,0.086870
...,...,...,...,...,...,...,...,...,...,...,...,...
259178,0.000000,0.806419,0.193581,0.171359,0.828641,0.000000,0.024321,0.975680,0.000000,0.036350,0.963650,0.000000
103694,0.000000,0.975505,0.024495,0.000000,0.755167,0.244833,0.000000,0.970873,0.029127,0.000000,0.724545,0.275455
131932,0.488351,0.511648,0.000000,0.000000,0.957279,0.042721,0.534348,0.465652,0.000000,0.000000,0.695496,0.304504
146867,0.000000,0.986867,0.013133,0.000000,0.798101,0.201899,0.027340,0.972660,0.000000,0.037444,0.962556,0.000000


In [None]:
# 100 trees; sample_size and contamination keep their defaults (256 and 0.1).
model = AlphaCutIForestModel(100)

# The model selects rows with NumPy array indexing, hence the conversion.
forest = model.fit(X_train.to_numpy())

In [None]:
# Anomaly score of every test row.
scores = model.score_samples(X_test.to_numpy())
# predict() scores the rows again internally and derives its threshold from
# the scores of this same test set.
predictions = model.predict(X_test.to_numpy())

In [None]:
# Count how many test rows received each predicted label (0 or 1).
data=pd.DataFrame(predictions)
data.value_counts()

0    23593
1     2622
dtype: int64

In [None]:
# AUC score computed from the raw anomaly scores (not the thresholded
# predictions) against the true test labels.

from sklearn.metrics import roc_auc_score
my_auc=roc_auc_score(y_test,scores)
my_auc

0.9553048381146069

[range(0, 10)]