In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import math
import random
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import crossovers

In [2]:
def process_data(sms_data_str):
    """
    Convert the raw `sms_data_str` text into parallel arrays of messages and labels.

    Every line is classified by its first three characters:
    ``'ham'`` -> label ``'legitimate'``, message is the text after the 4th character;
    ``'spa'`` -> label ``'spam'``, message is the text after the 5th character;
    anything else -> label ``'N/A'`` with ``None`` as message.

    Parameters
    ----------
    sms_data_str : str
        Whole corpus, one record per newline-separated line.

    Returns
    -------
    data_records : numpy.ndarray
        The message of every line, in input order.
    data_label : numpy.ndarray
        The label of every line, in input order.
    """
    lines = sms_data_str.split('\n')
    # Drop the last element only when it is the empty string left behind by a
    # trailing newline; dropping it unconditionally would lose the final record
    # of a text that does not end with a newline.
    if lines and lines[-1] == '':
        lines.pop()

    if not lines:
        # Nothing to parse: hand back empty arrays instead of failing on the
        # two-dimensional slicing below.
        empty = np.array([], dtype=str)
        return empty, empty.copy()

    data_arr = []
    for data in lines:
        prefix = data[:3]
        if prefix == 'ham':
            label, sample = 'legitimate', data[4:]
        elif prefix == 'spa':
            label, sample = 'spam', data[5:]
        else:
            label, sample = 'N/A', None
        data_arr.append([label, sample])

    data_arr = np.array(data_arr)
    data_label = data_arr[:, 0]
    data_records = data_arr[:, 1]

    return data_records, data_label

def tfidf_vectorizer(records):
    """
    Fit a TF-IDF model on `records`.

    Text is lower-cased, tokens are runs of ASCII letters, and rows are not
    normalised. Returns the dense TF-IDF matrix together with the feature
    (token) names reported by the vectorizer.
    """
    tfidf_options = {
        'lowercase': True,
        'token_pattern': r'\b[A-Za-z]+\b',
        'norm': None,
    }
    model = TfidfVectorizer(**tfidf_options)

    dense_matrix = model.fit_transform(records).toarray()
    vocabulary = model.get_feature_names_out()
    return dense_matrix, vocabulary

def feature_extraction(X, n_components=5):
    """Project `X` onto `n_components` principal components (PCA, no whitening)."""
    return PCA(n_components=n_components, whiten=False).fit_transform(X)

def feature_selection(df_records, labels, n_components=5):
    """
    Keep the `n_components` features of `df_records` that score best on
    mutual information with `labels`.

    Returns the reduced feature matrix and the names of the retained features.
    """
    selector = SelectKBest(score_func=mutual_info_classif, k=n_components)
    kept_features = selector.fit_transform(df_records, labels)
    kept_names = selector.get_feature_names_out()
    return kept_features, kept_names

In [3]:
# Read the whole corpus into memory. The encoding is stated explicitly so the
# decoded text does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm for your copy of the data.
with open('SMSSpamCollection', encoding='utf-8') as file:
    sms_data_str = file.read()

In [4]:
## parse the raw text, then turn every message into a TF-IDF row
records, labels = process_data(sms_data_str)
records_vectorized, feature_names = tfidf_vectorizer(records)

## binary label encoding: 0 for 'legitimate', 1 for every other label
labels = np.array([int(y != 'legitimate') for y in labels])

## dimensionality reduction of the TF-IDF matrix
records_dim_reduced = feature_extraction(records_vectorized)

In [5]:
# Peek at the first five rows of the PCA-reduced feature matrix.
records_dim_reduced[:5]

array([[-1.85631326,  0.28367574, -1.18566742,  0.818027  ,  0.75082336],
       [-2.78398433,  0.52095566, -1.74225193,  0.50137006, -0.73299918],
       [ 0.48310156, -0.03831907,  2.0095934 , -6.5393044 ,  1.10045652],
       [-1.83555282,  1.13871187, -3.93131219, -0.18685446, -1.97792601],
       [ 0.27698251, -0.77630354,  0.11410541,  1.31398039, -0.72028866]])

In [6]:
# Wrap the TF-IDF matrix in a DataFrame so each column carries its token name.
records_vectorized = pd.DataFrame(data=records_vectorized, columns=feature_names)

# Keep only the best-scoring features (default count of feature_selection).
records_selection, feature_name_selection = feature_selection(
    records_vectorized,
    labels=labels,
)

In [7]:
## selected features plus their label in one frame, for easier inspection
df = pd.DataFrame(data=records_selection, columns=feature_name_selection)
df['labels'] = labels
df

Unnamed: 0,call,claim,free,i,txt,labels
0,0.0000,0.0000,0.000000,0.000000,0.00000,0
1,0.0000,0.0000,0.000000,0.000000,0.00000,0
2,0.0000,0.0000,4.187968,0.000000,4.51406,1
3,0.0000,0.0000,0.000000,0.000000,0.00000,0
4,0.0000,0.0000,0.000000,1.992194,0.00000,0
...,...,...,...,...,...,...
5569,3.3125,4.9347,0.000000,0.000000,0.00000,1
5570,0.0000,0.0000,0.000000,0.000000,0.00000,0
5571,0.0000,0.0000,0.000000,0.000000,0.00000,0
5572,0.0000,0.0000,4.187968,3.984388,0.00000,0


In [8]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    records_selection, labels, test_size=0.33, random_state=42
)

# Partition the training rows by class (label 1 -> spam, everything else -> ham)
# so select_random_input() can draw a sample of a given class.
# Each *_dataset is a list of feature rows; each *_labels the matching labels.
is_spam = y_train == 1
spam_dataset = list(X_train[is_spam])
spam_labels = list(y_train[is_spam])
ham_dataset = list(X_train[~is_spam])
ham_labels = list(y_train[~is_spam])


In [9]:
class Chromosome:
    """
    Plain container for one candidate solution of the evolutionary search.

    Nothing is validated here; `rules` and `functions` always start empty and
    are filled in by the code that creates the chromosome.
    """

    def __init__(self, m=None, s=None, cf=None):
        # m, s values used by the rules
        self.m, self.s = m, s
        # cf of the chromosome
        self.cf = cf
        # function kind used by the rules: tri, sigmoid, ...
        self.functions = list()
        # rule = [[['x1','h'],['x2','l'],.......,['label','0']],[...],...]
        self.rules = list()
        # to do
        # add don't care as linguistic_terms
        # number of features in rules can vary -> 1 feature to 5 feature

In [10]:
def find_total_range():
    """
    Return the bound used when drawing m / s values.

    NOTE(review): this is a fixed constant, not derived from the data as the
    name might suggest -- confirm that is intended.
    """
    return 10000

def s_m_init(bound):
    """
    Draw random m and s values, one pair per slot of width ``bound // 5``.

    The slots start at 0, ``bound // 5``, ``2 * (bound // 5)``, ... below
    `bound`. For a slot starting at ``start`` the m value is a random integer
    in ``[start, start + width]`` and the s value a random integer in
    ``[2, width]`` (both ends inclusive).

    Returns
    -------
    (m_list, s_list) : tuple of lists
        Note the order: m values first, s values second.
    """
    width = bound // 5
    m_values, s_values = [], []
    for start in range(0, bound, width):
        # m is drawn before s for every slot
        m_values.append(random.randint(start, start + width))
        s_values.append(random.randint(2, width))
    return m_values, s_values

In [11]:
def function(func, x, m, s):
    """
    Evaluate the membership function named `func` at `x`.

    Parameters
    ----------
    func : str
        One of ``'tri'``, ``'rect-trap'``, ``'gaussian'``, ``'sigmoid'``.
    x : float
        Input value.
    m, s : float
        Parameters of the membership function. Every formula divides by `s`,
        so `s` must be non-zero.

    Returns
    -------
    float
        The membership value.

    Raises
    ------
    ValueError
        If `func` is not one of the supported names (previously this case
        silently returned ``None``).
    """
    if func == 'tri':
        left = (x - m + s) / s
        right = (m + s - x) / s
        return max(min(left, right), 0)

    if func == 'rect-trap':
        return max(min((x - m + s) / s, 1), 0)

    if func == 'gaussian':
        z = (x - m) / s
        # z * z instead of z ** 2: a huge z then becomes inf (-> result 0.0)
        # instead of raising OverflowError.
        return math.exp(-0.5 * z * z)

    if func == 'sigmoid':
        z = (x - m) / s
        # Numerically stable logistic: never call exp() on a large positive
        # argument, which raises OverflowError in the math module.
        if z >= 0:
            return 1 / (1 + math.exp(-z))
        e = math.exp(z)
        return e / (1 + e)

    raise ValueError(f'unknown membership function: {func!r}')

In [12]:
## TODO: build a fuzzy rule-based model for (records, label)
# Membership-function kinds understood by function().
initial_functions = ['tri', 'rect-trap', 'gaussian', 'sigmoid']
# Terms a rule may attach to a feature. The position of a term in this list is
# used as index into a chromosome's m / s / functions lists. 'X' is skipped by
# calculate_membership(); the other names presumably mean very-low .. very-high
# -- TODO confirm.
linguistic_terms = ['vl', 'l', 'm', 'h', 'vh', 'X']


def get_individuals(individuals, population_size, number_of_rules):
    """
    Append `population_size` randomly initialised chromosomes to `individuals`.

    Every chromosome receives
    * m / s lists from ``s_m_init(find_total_range())``,
    * five membership-function names drawn from ``initial_functions``,
    * `number_of_rules` rules; each rule holds five ``['x<i>', term]`` pairs
      (term drawn from ``linguistic_terms``) followed by ``['label', 0 or 1]``.

    Parameters
    ----------
    individuals : list
        List that is extended in place.
    population_size : int
        Number of chromosomes to create.
    number_of_rules : int
        Number of rules per chromosome.

    Returns
    -------
    list
        The same `individuals` list that was passed in.
    """
    for _ in range(population_size):
        chromosome = Chromosome()

        # s_m_init returns (m_list, s_list) -- m first, s second. Unpacking it
        # the other way round assigns the m values to s and vice versa.
        bound = find_total_range()
        m, s = s_m_init(bound)
        chromosome.m = m
        chromosome.s = s

        # one membership function per linguistic term
        number_of_linguistic_terms = 5
        for _term in range(number_of_linguistic_terms):
            chromosome.functions.append(random.choice(initial_functions))

        # rules of the chromosome
        for _rule in range(number_of_rules):
            rule = [[f'x{i+1}', random.choice(linguistic_terms)] for i in range(5)]
            rule.append(['label', random.randint(0, 1)])
            chromosome.rules.append(rule)

        individuals.append(chromosome)

    return individuals

In [13]:
# Tell NumPy to ignore divide-by-zero and invalid-value floating-point errors.
# This only affects NumPy arithmetic; Python's math module and plain Python
# floats still raise their usual exceptions.
np.seterr(divide='ignore', invalid='ignore')
def calculate_membership(chromosome, features):
    """
    Accumulate, per class label, the values of all rules of `chromosome`
    evaluated on `features`.

    For every rule the value starts at 1 and is multiplied, feature by feature,
    by either the membership degree or its complement. Features whose term is
    ``'X'`` are skipped; a rule made only of ``'X'`` terms contributes 0.
    The label is read from the last entry of the rule, so rules may be built
    for any number of features.

    Parameters
    ----------
    chromosome : object with ``rules``, ``functions``, ``m`` and ``s``
    features : sequence of numbers, one per rule antecedent

    Returns
    -------
    list
        ``[sum for rules labelled 0, sum for all other rules]``.
    """
    output = [0, 0]
    n_features = len(features)
    for rule in chromosome.rules:
        value = 1
        dont_care = 0
        for i in range(n_features):
            # rule[i] looks like ['x1', 'l']
            term = rule[i][1]
            if term == 'X':
                dont_care += 1
                continue
            index = linguistic_terms.index(term)
            degree = function(
                chromosome.functions[index],
                features[i],
                chromosome.m[index],
                chromosome.s[index],
            )
            # NOTE(review): a coin flip decides whether the degree or its
            # complement is used, so repeated calls with the same inputs can
            # return different results -- confirm this is intended.
            if random.randint(0, 1) == 0:
                value *= (1 - degree)
            else:
                value *= degree
        if dont_care == n_features:
            value = 0
        # the label is always the last entry of the rule
        if rule[-1][1] == 0:
            output[0] += value
        else:
            output[1] += value
    return output

def calculate_cf(chromosome, gc, true_label):
    """
    Store the cf of `chromosome` for one sample.

    `gc` is the pair ``[value for label 0, value for label 1]``. With
    ``f_c`` the entry of the true label and ``f_neg`` the other entry,
    ``cf = (f_c - f_neg) / (f_c + f_neg)``; when both entries are 0 the cf
    is set to 0. The result is written to ``chromosome.cf``; nothing is returned.
    """
    # nothing fired at all: avoid the 0 / 0 division
    if gc[0] == 0 and gc[1] == 0:
        chromosome.cf = 0
        return

    f_c, f_neg = (gc[0], gc[1]) if true_label == 0 else (gc[1], gc[0])
    chromosome.cf = (f_c - f_neg) / (f_c + f_neg)
        
def select_random_input(mode):
    """
    Pick one random training sample.

    ``mode == 1`` draws from the spam lists, any other value from the ham
    lists. Returns the pair ``(features, label)`` of the chosen sample.
    """
    if mode == 1:
        pool, pool_labels = spam_dataset, spam_labels
    else:
        pool, pool_labels = ham_dataset, ham_labels

    pick = random.randint(0, len(pool) - 1)
    return pool[pick], pool_labels[pick]

In [14]:
def make_crossover(individuals):
    """
    Breed four children from the first individual and one random other individual.

    Children 1 and 2 are produced with two-point crossover on every gene group
    (rules, m, s, functions); children 3 and 4 with uniform crossover on the
    same gene groups.

    Parameters
    ----------
    individuals : list of Chromosome
        Must hold at least two chromosomes; the one at index 0 is always the
        first parent.

    Returns
    -------
    tuple
        ``(child1, child2, child3, child4)``; their ``cf`` is left unset.

    Raises
    ------
    ValueError
        If fewer than two individuals are supplied.
    """
    if len(individuals) < 2:
        raise ValueError('make_crossover needs at least two individuals')

    first = individuals[0]
    second = individuals[random.randint(1, len(individuals) - 1)]

    # rules are recombined rule by rule
    rules1, rules2, rules3, rules4 = [], [], [], []
    for i in range(len(first.rules)):
        rule_child1, rule_child2 = crossovers.two_point_crossover(first.rules[i], second.rules[i])
        # uniform crossover for children 3/4, matching m, s and functions below
        rule_child3, rule_child4 = crossovers.uniform_crossover(first.rules[i], second.rules[i])
        rules1.append(rule_child1)
        rules2.append(rule_child2)
        rules3.append(rule_child3)
        rules4.append(rule_child4)

    # new m
    m1, m2 = crossovers.two_point_crossover(first.m, second.m)
    m3, m4 = crossovers.uniform_crossover(first.m, second.m)
    # new s
    s1, s2 = crossovers.two_point_crossover(first.s, second.s)
    s3, s4 = crossovers.uniform_crossover(first.s, second.s)
    # new functions
    function1, function2 = crossovers.two_point_crossover(first.functions, second.functions)
    function3, function4 = crossovers.uniform_crossover(first.functions, second.functions)

    children = []
    for m, s, rules, functions in (
        (m1, s1, rules1, function1),
        (m2, s2, rules2, function2),
        (m3, s3, rules3, function3),
        (m4, s4, rules4, function4),
    ):
        child = Chromosome(m, s)
        child.rules = rules
        child.functions = functions
        children.append(child)

    return tuple(children)

In [15]:
def calculate_accuracy(mode, dic):
    """
    Predict a label for every sample of one data split.

    Despite its name this function returns the predicted labels, not an
    accuracy value; the caller compares them with the true labels.

    Parameters
    ----------
    mode : str
        ``'test'`` evaluates ``X_test``; any other value evaluates ``X_train``.
    dic : dict
        Maps the keys 0 and 1 to lists of chromosomes.

    Returns
    -------
    list
        One predicted label (0 or 1) per sample, in sample order.
    """
    samples = X_test if mode == 'test' else X_train

    our_labels = []
    for features in samples:
        # sum the per-label outputs of every chromosome in both populations
        total_gc = [0, 0]
        for j in (0, 1):
            for chromosome in dic[j]:
                gc = calculate_membership(chromosome, features)
                total_gc[0] += gc[0]
                total_gc[1] += gc[1]

        # the label with the larger total wins (ties go to label 0)
        our_labels.append(np.argmax(total_gc))
    return our_labels

In [16]:
def EA(generations, P_mut, P_cro, population_size, number_of_rules):
    """
    Evolve two chromosome populations and return them.

    A population of `population_size` random chromosomes is created, scored on
    one random sample and split into two halves stored under the keys 0 and 1.
    Half ``i`` is then evolved for `generations` rounds: each round draws one
    sample via ``select_random_input(i)``, makes `population_size` attempts to
    add four crossover children (each attempt succeeds with probability
    `P_cro`), and finally keeps the ``population_size // 2`` chromosomes with
    the highest cf.

    `P_mut` is drawn against on every attempt, but mutation is not implemented.

    Returns
    -------
    dict
        ``{0: [...], 1: [...]}`` with the surviving chromosomes.
    """
    population = get_individuals([], population_size, number_of_rules)

    # cf of the initial population, measured on one randomly chosen sample
    features, label = select_random_input(random.randint(0, 1))
    for member in population:
        calculate_cf(member, calculate_membership(member, features), label)

    half = population_size // 2
    dic = {
        0: list(population[:half]),
        1: list(population[half:]),
    }

    for i in (0, 1):
        for _generation in range(generations):
            # one sample per generation scores all children of that generation
            features, label = select_random_input(i)

            for _attempt in range(population_size):
                if random.random() < P_cro:
                    children = make_crossover(dic[i])
                    for child in children:
                        calculate_cf(child, calculate_membership(child, features), label)
                    dic[i].extend(children)
                if random.random() < P_mut:
                    # todo
                    # implement mutation function
                    pass

            # shuffle before the stable sort so ties in cf are broken at random
            pool = dic[i]
            random.shuffle(pool)
            pool.sort(key=lambda c: c.cf, reverse=True)
            dic[i] = pool[:half]
    return dic

In [None]:
# Number of independent EA runs whose accuracies are averaged.
epochs = 51

total_test = 0.0
total_train = 0.0

for _ in range(epochs):
    dic = EA(generations=15, P_mut=0.1, P_cro=0.9, population_size=50, number_of_rules=1)

    our_labels_train = calculate_accuracy('train', dic)
    our_labels_test = calculate_accuracy('test', dic)

    # fraction of correctly predicted labels in this run
    total_test += np.mean(np.array(y_test) == np.array(our_labels_test))
    total_train += np.mean(np.array(y_train) == np.array(our_labels_train))

# Mean accuracy over all runs, in percent. Each label is paired with the
# total that was accumulated for that split.
print('Accuracy Train: ', (total_train / epochs) * 100)
print('Accuracy Test: ', (total_test / epochs) * 100)