In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import math
import random
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import f1_score

In [4]:
def process_data(sms_data_str):
    """
    convert `sms_data_str` into a pandas dataframe
    """
    data_arr = []

    data_records = sms_data_str.split('\n')[:-1]
    for data in data_records:
        label = None
        sample = None
        match data[:3]:
            case 'ham':
                label = 'legitimate'
                sample = data[4:] 
            case 'spa':
                label = 'spam'
                sample = data[5:] 
            case _:
                label = 'N/A'
            
        data_arr.append([label, sample])
        
    data_arr = np.array(data_arr)
    data_label = data_arr[:, 0]
    data_records = data_arr[:, 1]
    
    return data_records, data_label

def tfidf_vectorizer(records):
    vectorizer = TfidfVectorizer(
        lowercase=True,
        token_pattern=r'\b[A-Za-z]+\b', 
        norm=None
    )
    
    records_transformed = vectorizer.fit_transform(records)

    return records_transformed.toarray(), vectorizer.get_feature_names_out()

def feature_extraction(X, n_components=5):
    reduction_pca = PCA(
        n_components=n_components,
        whiten=False
    )
    data_reduced = reduction_pca.fit_transform(X)
    return data_reduced

def feature_selection(df_records, labels, n_components=5):
    feature_selection_model = SelectKBest(mutual_info_classif, k=n_components) 
    ## make a selection over the best features
    selected_record_features = feature_selection_model.fit_transform(df_records, labels)
    
    return selected_record_features, feature_selection_model.get_feature_names_out()

In [6]:
sms_data_str = None
with open('SMSSpamCollection') as file:
    sms_data_str = file.read()

In [7]:
records, labels = process_data(sms_data_str)
records_vectorized, feature_names = tfidf_vectorizer(records)

## one hot encoding labels
labels = np.array([0 if y == 'legitimate' else 1 for y in labels] )

## reducing dimension
records_dim_reduced = feature_extraction(records_vectorized)

In [8]:
records_dim_reduced[:5]

array([[-1.85805392,  0.27572111, -1.18545044,  0.81086529,  0.76042213],
       [-2.78411079,  0.50263449, -1.74878456,  0.50571976, -0.74484833],
       [ 0.4873508 , -0.01723656,  2.00985677, -6.53330242,  0.92624187],
       [-1.83688752,  1.10175246, -3.93666596, -0.18701524, -2.0047502 ],
       [ 0.27470192, -0.77496318,  0.09358943,  1.3075239 , -0.81004664]])

In [9]:
records_vectorized = pd.DataFrame(records_vectorized, columns=feature_names)

records_selection, feature_name_selection = feature_selection(records_vectorized,labels=labels)

In [10]:
## for better visualization
df = pd.DataFrame(records_selection, columns=feature_name_selection)
df['labels'] = labels

In [11]:
#Divide our dataset to spam and ham parts for feeding to the model!

spam_dataset = df[df['labels']==1]
ham_dataset = df[df['labels']==0]

In [12]:
s = spam_dataset.iloc[0][-1]

s

1.0

In [13]:
class Chromosome:
    def __init__(self, m=None, s=None, cf=None):
        # rule = [[['x1','h'],['x2','l'],.......,['label','0']],[...],...]
        self.rules = []
        # determine the function of rules tri,sigmoid,...
        self.functions = []
        # determine the m,s values for the rules
        self.s = s
        self.m = m
        # determine the cf for the chromosome
        self.cf = cf
        # to do
        # add don't care as linguistic_terms
        # number of features in rules can vary -> 1 feature to 5 feature

In [14]:
def find_total_range():
    bound = 1000
    return bound

def s_m_init(bound):
    step = bound // 5
    m_list = []
    s_list = []
    for i in range(1, bound, step):
        m_list.append(random.randint(i, i+step))
        s_list.append(random.randint(1, step))

    return m_list, s_list

In [15]:
def function(func, x, m , s):
    if func == 'tri':
       left = (x - m + s) / s
       right = (m + s - x) / s
       return max(min(left, right), 0)
    
    elif func == 'rect-trap':
        return max( min ((x-m+s)/s, 1),0)
    
    elif func == 'gaussian':
        return math.exp((-1/2)*((x-m)/s)**2)
    
    elif func == 'sigmoid':
        return 1 / (1 + math.exp(-((x-m)/s)))

In [16]:
## TODO: build a fuzzy rule-based model for (records, label)
initial_functions = ['tri', 'rect-trap', 'gaussian', 'sigmoid']
linguistic_terms = ['vl', 'l', 'm', 'h', 'vh', 'X']

population_size = 50
number_of_rules = 10

def get_individuals(individuals):
    for _ in range(population_size):
        chromosome = Chromosome()

        # determine s, m of chromosome
        bound = find_total_range()
        s, m = s_m_init(bound)
        chromosome.m = m
        chromosome.s = s

        # determine functions for linguistic_terms
        # number_of_linguistic_terms = random.randint(3,5)
        number_of_linguistic_terms = 5
        for _ in range(number_of_linguistic_terms):
            random_function_index = random.randint(0, len(initial_functions) - 1)
            chromosome.functions.append(initial_functions[random_function_index])
        
        # determine rules of chromosome
        for _ in range(number_of_rules):        
            rule = []
            for i in range(5):
                linguistic_term_selection = linguistic_terms[random.randint(0, len(linguistic_terms) - 1)]
                rule.append([f'x{i+1}', linguistic_term_selection])
            rule.append(['label', random.randint(0, 1)])
            # update our chromosome
            chromosome.rules.append(rule)
            
        # add chromosome to individuals list
        individuals.append(chromosome)
        
    return individuals

In [17]:
def calculate_membership(chromosome, features):
    output = [0, 0]
    for rule in chromosome.rules:
        value = 1
        for i in range(len(features)):
            # x ['x1', 'l']
            if rule[i][1] == 'X':
                value *= 1
                continue
            index = linguistic_terms.index(rule[i][1])
            m = chromosome.m[index]
            s = chromosome.s[index]
            value *= function(chromosome.functions[index], features[i], m, s)
        if random.randint(0, 1) == 0:
            value = 1 - value
        if rule[5][1] == 0:
            output[0] += value
        else:
            output[1] += value
    return output

def calculate_cf(chromosome, gc, true_label):
    # print('calculate cf:',gc,true_label)
    if gc[0] == 0 and gc[1] == 0:
        chromosome.cf = 0
    else:
        if true_label == 0:
            f_c = gc[0]
            f_neg = gc[1]
        else:
            f_c = gc[1]
            f_neg = gc[0]
        cf = (f_c - f_neg) / (f_c + f_neg)
        chromosome.cf = cf
        
def select_random_input(mode):

    if mode == 0:
        index = random.randint(0, len(spam_dataset) - 1)
        features = spam_dataset.iloc[index][:-1]
        label = spam_dataset.iloc[index][-1]
        # print('import label', label)

    else:
        ham_index = random.randint(0, len(ham_dataset) - 1)
        features = ham_dataset.iloc[ham_index][:-1]
        label = ham_dataset.iloc[ham_index][-1]
        # print('import label', label)


    return features,label

In [19]:
import random

def uniform_crossover(parent1, parent2):
    #parent1: first parent
    #parent2: second parent
    #return: child
    #initialize child
    child1 = []
    child2= []
    #iterate through genes
    for i in range(len(parent1)):
        #select parent
        p = random.randint(0, 1)
        #add gene
        child1.append(parent1[i] if p == 0 else parent2[i])
        child2.append(parent1[i]if p == 1 else parent2[i])
    #return child
    return child1, child2

def single_point_crossover(parent1, parent2):
    #parent1: first parent
    #parent2: second parent
    #return: child
    #initialize child
    child = []
    #select crossover point
    point = random.randint(1, len(parent1)-1)
    #add genes
    child.extend(parent1[:point])
    child.extend(parent2[point:])
    # #return child
    # for i in range(len(child)):
    #     child[i] = str(child[i])

    return child

def two_point_crossover(parent1, parent2):
    #parent1: first parent
    #parent2: second parent
    #return: child
    #initialize child
    child1 = []
    child2 = []
    #select crossover points
    point1 = random.randint(1, len(parent1)-2)
    point2 = random.randint(point1+1, len(parent1)-1)
    #add genes
    child1.extend(parent1[:point1])
    child1.extend(parent2[point1:point2])
    child1.extend(parent1[point2:])

    child2.extend(parent2[:point1])
    child2.extend(parent1[point1:point2])
    child2.extend(parent2[point2:])


    #return child
    return child1, child2


In [26]:
def make_crossover(individuals):
    # find a weighted list based on cf of our chromosomes
    cf_list = []
    for chromosome in individuals:
        cf_list.append(chromosome.cf)

    weighted_cf = []
    for i in range(len(cf_list)):
        weighted_cf.append(cf_list[i] / np.sum(cf_list))
    
    # select 2 chromosomes as parents
    parent1 = cf_list.index(random.choices(cf_list, weights=weighted_cf, k=1)[0])
    parent2 = random.randint(0, len(individuals) - 1)
    # parent2 = cf_list.index(random.choices(cf_list, weights=weighted_cf, k=1)[0])
    # while parent1 == parent2:
    #     parent2 = cf_list.index(random.choices(cf_list, weights=weighted_cf, k=1)[0])
    
    # make child elements
    # new rules
    rules1, rules2 = two_point_crossover(individuals[parent1].rules, individuals[parent2].rules)
    rules3, rules4 = uniform_crossover(individuals[parent1].rules, individuals[parent2].rules)
    # new m
    m1, m2 = two_point_crossover(individuals[parent1].m, individuals[parent2].m)
    m3, m4 = uniform_crossover(individuals[parent1].m, individuals[parent2].m)
    # new s
    s1, s2 = two_point_crossover(individuals[parent1].s, individuals[parent2].s)
    s3, s4 =uniform_crossover(individuals[parent1].s, individuals[parent2].s)
    
    function1, function2 = two_point_crossover(individuals[parent1].functions, individuals[parent2].functions)
    function3, function4 = uniform_crossover(individuals[parent1].functions, individuals[parent2].functions)
    #(rule
    child1 = Chromosome(m1, s1)
    child2 = Chromosome(m2, s2)
    child3 = Chromosome(m3, s3)
    child4 = Chromosome(m4, s4)

    child1.rules = rules1
    child2.rules = rules2
    child3.rules = rules3
    child4.rules = rules4

    child1.functions = function1
    child2.functions = function2
    child3.functions = function3
    child4.functions = function4

    return child1, child2, child3, child4

In [33]:
# evolutionary algorithm implementation
generations = 500
P_mut = 0.1
P_cro = 0.9

individuals = []
individuals = get_individuals(individuals)

# find cf of initial individuals
# select random input
features, label = select_random_input(1)
# calculate CF
for chromosome in individuals:
    gc = calculate_membership(chromosome, features)
    calculate_cf(chromosome, gc, label)
    
# print the mean of cf at first
c=0
for ch in individuals:
    c += ch.cf
print(c / population_size)
mode = 1
# evolutionary algorithm
for generation in range(generations):
    # select random input
    features, label = select_random_input(mode)
    mode = not mode 

    # making childs
    for _ in range(population_size):
        if random.random() < P_cro:
            child1, child2, child3, child4 = make_crossover(individuals)

            for chromosome in [child1, child2, child3, child4]:
                gc = calculate_membership(chromosome, features)
                calculate_cf(chromosome, gc, label)
            individuals.append(child1)
            individuals.append(child2)
            individuals.append(child3)
            individuals.append(child4)
        if random.random() < P_mut:
            # todo
            # implement mutation function
            pass

       
    # sort our chromosomes based on their cf
    individuals.sort(key=lambda x: x.cf, reverse=True)

    # pick top 50 chromosome
    # del individuals[population_size:]
    individuals = individuals[:population_size]
            

print('----------after EA-------------')
c=0
for ch in individuals:
    c += ch.cf
print(c / population_size)


# for chromosoms in individuals:
#     print(chromosoms.cf)
# calculate accuracy
our_labels = []
for i in range(len(records_selection)):
    features = records_selection[i]
    label = labels[i]
    acc = [0, 0]
    for chromosome in individuals:
        gc = calculate_membership(chromosome , features)
        acc[0] += gc[0]
        acc[1] += gc[1]
    predicted_label = np.argmax(acc)
    our_labels.append(predicted_label)

print(our_labels)
accuracy = np.mean(np.array(labels) == np.array(our_labels))
print(accuracy)
            

0.03008113780493014
----------after EA-------------
1.0
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0