In [1]:
!pip install -U tensorflow




In [2]:
import tensorflow as tf
import numpy as np

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=4000)

word_index = tf.keras.datasets.imdb.get_word_index()
index2word = dict((i + 3, word) for (word, i) in word_index.items())
index2word[0] = '[pad]'
index2word[1] = '[bos]'
index2word[2] = '[oov]'
x_train = np.array([' '.join([index2word[idx] for idx in text]) for text in x_train])
x_test = np.array([' '.join([index2word[idx] for idx in text]) for text in x_test])

In [3]:
vocabulary = list()
for text in x_train:
  tokens = text.split()
  vocabulary.extend(tokens)

vocabulary = set(vocabulary)
print(len(vocabulary))



3998


In [4]:
from tqdm import tqdm

x_train_binary = list()
x_test_binary = list()

for text in tqdm(x_train):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_train_binary.append(binary_vector)

x_train_binary = np.array(x_train_binary)
x_train_binary = np.array(x_train_binary).reshape(len(x_train_binary), -1)

for text in tqdm(x_test):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_test_binary.append(binary_vector)

x_test_binary = np.array(x_test_binary)
x_test_binary = np.array(x_test_binary).reshape(len(x_test_binary), -1)



100%|██████████| 25000/25000 [06:11<00:00, 67.22it/s] 
100%|██████████| 25000/25000 [05:52<00:00, 70.90it/s] 


In [5]:
class Node :
    def __init__(self , checking_feature = None, isLeaf = False, category = None):
        self.checking_feature = checking_feature
        self.left_child = None
        self.right_child = None
        self.isLeaf = isLeaf
        self.category = category


In [6]:
from statistics import mode
import numpy as np
import math
class ID3 :
    def __init__(self , features):
        self.tree = None
        self.features = features

    def fit(self, x, y) :
        most_common = mode(y.flatten())
        self.tree = self.create_tree(x,y,features = np.arange(len(self.features)) , category = most_common)
        return self.tree 
    
    def create_tree(self,x_train,y_train,features,category) :
        if len(x_train) == 0 :
            return Node(checking_feature= None,isLeaf = True, category = category)
        if np.all(y_train.flatten() ==0) :
            return Node(checking_feature= None , isLeaf= True,category = 0)
        elif np.all(y_train.flatten() == 1) :
            return Node(checking_feature= None, isLeaf = True , category = 1)
        
        if len(features) == 0 :
           return Node(checking_feature= None, isLeaf = True, category = mode(y_train.flatten())) 
        
        igs = list()
        for feat_index in features.flatten() :
            igs.append(self.calculate_ig(y_train.flatten() , [example[feat_index] for example in x_train]))

        max_ig_idx = np.argmax(np.array(igs).flatten())
        common_category = mode(y_train.flatten())

        root = Node(checking_feature= max_ig_idx)

        # data subset with category = 0 
        x_train_0 = x_train[x_train[:, max_ig_idx] == 0, :]
        y_train_0 = y_train[x_train[:,max_ig_idx] == 0].flatten()

        # data subset with category = 1
        x_train_1 = x_train[x_train[:, max_ig_idx] == 1, :]
        y_train_1 = y_train[x_train[:,max_ig_idx] == 1].flatten()

        new_features_indices = np.delete(features.flatten(), max_ig_idx)

        root.left_child = self.create_tree(x_train = x_train_1 , y_train = y_train_1, features = new_features_indices , category = common_category)
        root.right_child = self.create_tree(x_train = x_train_0, y_train = y_train_0, features=new_features_indices,category = common_category)

        return root 

    @staticmethod
    def calculate_ig(classes_vector, feature):
        classes = set(classes_vector)

        HC = 0
        for c in classes:
            PC = list(classes_vector).count(c) / len(classes_vector)  # P(C=c)
            HC += - PC * math.log(PC, 2)  # H(C)
            # print('Overall Entropy:', HC)  # entropy for C variable
            
        feature_values = set(feature)  # 0 or 1 in this example
        HC_feature = 0
        for value in feature_values:
            # pf --> P(X=x)
            pf = list(feature).count(value) / len(feature)  # count occurences of value 
            indices = [i for i in range(len(feature)) if feature[i] == value]  # rows (examples) that have X=x

            classes_of_feat = [classes_vector[i] for i in indices]  # category of examples listed in indices above
            for c in classes:
                # pcf --> P(C=c|X=x)
                pcf = classes_of_feat.count(c) / len(classes_of_feat)  # given X=x, count C
                if pcf != 0: 
                    # - P(X=x) * P(C=c|X=x) * log2(P(C=c|X=x))
                    temp_H = - pf * pcf * math.log(pcf, 2)
                    # sum for all values of C (class) and X (values of specific feature)
                    HC_feature += temp_H
        
        ig = HC - HC_feature
        return ig    

    def predict(self, x):
        predicted_classes = list()

        for unlabeled in x:  # for every example 
            tmp = self.tree  # begin at root
            while not tmp.isLeaf:
                if unlabeled.flatten()[tmp.checking_feature] == 1:
                    tmp = tmp.left_child
                else:
                    tmp = tmp.right_child
            
            predicted_classes.append(tmp.category)
        
        return np.array(predicted_classes)

In [7]:
import numpy as np
from statistics import mode

class AdaBoostID3:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.models = []  # List of ID3 trees
        self.alphas = []  # Classifier weights

    def fit(self, X, y):
        n_samples, _ = X.shape
        w = np.ones(n_samples) / n_samples  # Initialize sample weights

        for _ in range(self.n_estimators):
            # Bootstrap sampling based on weights
            indices = np.random.choice(n_samples, size=n_samples, replace=True, p=w)
            X_bootstrap, y_bootstrap = X[indices], y[indices]

            # Train an ID3 tree
            tree = ID3(features=np.arange(X.shape[1]))  # Use all features
            tree.fit(X_bootstrap, y_bootstrap)
            y_pred = tree.predict(X)

            # Compute weighted error
            err = np.sum(w * (y_pred != y)) / np.sum(w)

            # Compute alpha (classifier weight)
            if err == 0:  # Avoid division by zero
                alpha = 1
            else:
                alpha = 0.5 * np.log((1 - err) / (err + 1e-10))

            # Update sample weights
            w *= np.exp(-alpha * y * y_pred)
            w /= np.sum(w)  # Normalize

            # Store model and alpha
            self.models.append(tree)
            self.alphas.append(alpha)

    def predict(self, X):
        final_pred = np.zeros(X.shape[0])

        for alpha, model in zip(self.alphas, self.models):
            final_pred += alpha * model.predict(X)

        return np.sign(final_pred)  # Convert to -1/1 labels

# Train AdaBoost with ID3
adaboost_id3 = AdaBoostID3(n_estimators=50)
adaboost_id3.fit(x_train_binary, y_train)

# Test the classifier
y_pred = adaboost_id3.predict(x_test_binary)

# Evaluate accuracy
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy: {accuracy:.2f}")


: 

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score

# Δημιουργία συνάρτησης για καμπύλες μάθησης
def learning_curves(model, X_train, y_train, X_dev, y_dev, step_size=1000):
    """
    Υπολογισμός καμπυλών μάθησης για διαφορετικά μεγέθη εκπαίδευσης.
    :param model: Το μοντέλο AdaBoost
    :param y_train: Ετικέτες εκπαίδευσης
    :param X_dev: Δεδομένα ανάπτυξης
    :param y_dev: Ετικέτες ανάπτυξης
    :param step_size: Βήμα αύξησης του μεγέθους εκπαίδευσης
    :return: Λίστες με ακρίβεια, ανάκληση, και F1-score
    """
    training_sizes = []
    precisions = []
    recalls = []
    f1_scores = []

    for i in range(step_size, len(X_train) + 1, step_size):
        # Training subset
        #KANE 2D
        X_subset = X_train[:i]
        y_subset = y_train[:i]

        # Verify alignment of subset
        assert len(X_subset) == len(y_subset), "Mismatch between X_subset and y_subset lengths"

        # Εκπαίδευση του μοντέλου
        model.fit(X_subset, y_subset)

        # Πρόβλεψη στα δεδομένα ανάπτυξης
        y_pred = model.predict(X_dev)

        # Υπολογισμός ακρίβειας, ανάκλησης, και F1
        precision = precision_score(y_dev, y_pred, average='binary')
        recall = recall_score(y_dev, y_pred, average='binary')
        f1 = f1_score(y_dev, y_pred, average='binary')

        # Αποθήκευση αποτελεσμάτων
        training_sizes.append(i)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

        return training_sizes, precisions, recalls, f1_scores

# Δημιουργία γραφημάτων
def plot_learning_curves(training_sizes, precisions, recalls, f1_scores):
    plt.figure(figsize=(10, 6))
    plt.plot(training_sizes, precisions, label="Precision")
    plt.plot(training_sizes, recalls, label="Recall")
    plt.plot(training_sizes, f1_scores, label="F1-score")
    plt.xlabel("Training Set Size")
    plt.ylabel("Score")
    plt.title("Learning Curves")
    plt.legend()
    plt.grid()
    plt.show()

# Εφαρμογή του μοντέλου και υπολογισμός καμπυλών
ab_classifier = AdaBoost()

# Δημιουργία δεδομένων ανάπτυξης από τα δεδομένα εκπαίδευσης
dev_size = int(0.1 * len(x_train_binary))  # 10% ως development set
X_dev = x_train_binary[:dev_size]
y_dev = y_train[:dev_size]

X_train = x_train_binary[dev_size:]
y_train = y_train[dev_size:]

# Υπολογισμός καμπυλών μάθησης
training_sizes, precisions, recalls, f1_scores = learning_curves(
ab_classifier, X_train, y_train, X_dev, y_dev
)

# Σχεδίαση καμπυλών
plot_learning_curves(training_sizes, precisions, recalls, f1_scores)

# Υπολογισμός τελικών αποτελεσμάτων στα δεδομένα αξιολόγησης
ab_classifier.fit(X_train, y_train)
y_test_pred = ab_classifier.predict(x_test_binary_final)

# Ακρίβεια, Ανάκληση και F1 για κατηγορίες και μέσα (macro, micro)
precision_pos = precision_score(y_test, y_test_pred, pos_label=1)
recall_pos = recall_score(y_test, y_test_pred, pos_label=1)
f1_pos = f1_score(y_test, y_test_pred, pos_label=1)

precision_neg = precision_score(y_test, y_test_pred, pos_label=0)
recall_neg = recall_score(y_test, y_test_pred, pos_label=0)
f1_neg = f1_score(y_test, y_test_pred, pos_label=0)

precision_macro = precision_score(y_test, y_test_pred, average="macro")
recall_macro = recall_score(y_test, y_test_pred, average="macro")
f1_macro = f1_score(y_test, y_test_pred, average="macro")

precision_micro = precision_score(y_test, y_test_pred, average="micro")
recall_micro = recall_score(y_test, y_test_pred, average="micro")
f1_micro = f1_score(y_test, y_test_pred, average="micro")

# Εκτύπωση αποτελεσμάτων
print("Precision (Positive):", precision_pos)
print("Recall (Positive):", recall_pos)
print("F1 (Positive):", f1_pos)

print("Precision (Negative):", precision_neg)
print("Recall (Negative):", recall_neg)
print("F1 (Negative):", f1_neg)

print("Macro-averaged Precision:", precision_macro)
print("Macro-averaged Recall:", recall_macro)
print("Macro-averaged F1:", f1_macro)

print("Micro-averaged Precision:", precision_micro)
print("Micro-averaged Recall:", recall_micro)
print("Micro-averaged F1:", f1_micro)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

def learning_curves_sklearn(model, X_train, y_train, X_dev, y_dev, step_size=1000):
    training_sizes, precisions, recalls, f1_scores = [], [], [], []
    
    for i in range(step_size, min(len(X_train), len(y_train)) + 1, step_size):
        #KANE 2D
        X_subset, y_subset = X_train[:i], y_train[:i]
        model.fit(X_subset, y_subset)
        y_pred = model.predict(X_dev)
        
        precisions.append(precision_score(y_dev, y_pred, average='binary'))
        recalls.append(recall_score(y_dev, y_pred, average='binary'))
        f1_scores.append(f1_score(y_dev, y_pred, average='binary'))
        training_sizes.append(i)
    
    return training_sizes, precisions, recalls, f1_scores

def plot_learning_curves(training_sizes, precisions, recalls, f1_scores):
    plt.figure(figsize=(10, 6))
    plt.plot(training_sizes, precisions, label="Precision")
    plt.plot(training_sizes, recalls, label="Recall")
    plt.plot(training_sizes, f1_scores, label="F1-score")
    plt.xlabel("Training Set Size")
    plt.ylabel("Score")
    plt.title("Learning Curves - AdaBoost (Scikit-learn)")
    plt.legend()
    plt.grid()
    plt.show()

# Διαίρεση σε training και development set
dev_size = int(0.1 * len(x_train_binary))
X_dev, y_dev = x_train_binary[:dev_size], y_train[:dev_size]
X_train, y_train = x_train_binary[dev_size:], y_train[dev_size:]

# Διασφάλιση συμβατών μεγεθών
dev_size = min(len(X_dev), len(y_dev))
train_size = min(len(X_train), len(y_train))
X_dev, y_dev = X_dev[:dev_size], y_dev[:dev_size]
X_train, y_train = X_train[:train_size], y_train[:train_size]

# Εκπαίδευση και αξιολόγηση του AdaBoost του Scikit-learn
sklearn_ab = AdaBoostClassifier(binarize=None)
training_sizes, precisions, recalls, f1_scores = learning_curves_sklearn(
    sklearn_ab, X_train, y_train, X_dev, y_dev
)
plot_learning_curves(training_sizes, precisions, recalls, f1_scores)

# Τελική αξιολόγηση στο test set
sklearn_ab.fit(X_train, y_train)
y_test_pred = sklearn_ab.predict(x_test_binary[:len(y_test)])

# Υπολογισμός μετρικών
precision_macro = precision_score(y_test[:len(y_test_pred)], y_test_pred, average="macro")
recall_macro = recall_score(y_test[:len(y_test_pred)], y_test_pred, average="macro")
f1_macro = f1_score(y_test[:len(y_test_pred)], y_test_pred, average="macro")

# Εκτύπωση αποτελεσμάτων
print("Scikit-learn AdaBoost Results:")
print("Macro Precision:", precision_macro)
print("Macro Recall:", recall_macro)
print("Macro F1:", f1_macro)
Μήνυμα στους Νεφέλη Δημητρίου


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Δημιουργία του AdaBoost με Decision Stumps (δέντρα βάθους 1)
model = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1),  # Decision stump
    n_estimators=50,  # Αριθμός weak learners
    algorithm="SAMME"  # Για binary classification
)

# Εκπαίδευση του μοντέλου
model.fit(X_train, y_train)

# Πρόβλεψη
y_pred = model.predict(X_test)
