In [7]:
# Essential Libraries
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning and Deep Learning Libraries
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

# Swarm Intelligence
# !pip install pyswarms
from pyswarm import pso
from pyswarms.single.global_best import GlobalBestPSO

# Statistical Test
from scipy.stats import shapiro, f_oneway, kruskal

In [8]:
# Set seed
np.random.seed(42)

In [9]:
# Load dataset
df = pd.read_excel('XCleanCryptocurrencyDataset.xlsx', index_col=0)
display(df)

Unnamed: 0,full_text,processed_text,vader_sentiment
0,'Token: $GROK24 - Grok 2024 Network: Ethereum ...,token grok grok network ethereum contract xccc...,Positive
1,@metaversejoji Let's check @SolanaMono $SOL #W...,let check sol,Negative
2,"Day's DCA: $BTC, $ATOM, $DVPN, $AXL, $JKL, $HU...",day dca btc atom dvpn axl jkl huahua,Negative
3,@BorkSOL @Cerita_Crypto @solana @aeyakovenko Y...,project really amazing thats followed send please,Positive
4,👉 WL FOR .0 SOL MINT 👈 👉40 HOURS TILL SNAPSHOT...,sol mint hour till snapshot requirement join d...,Positive
...,...,...,...
9879,CyberKong VX #11328 was adopted for 0.18 $ETH...,cyberkong adopted eth blur,Negative
9880,BULLISH ON SOLANA BULLISH ON JUP BULLISH ON MA...,bullish solana bullish jup bullish madlads,Negative
9881,@naija_bitcoin 🍿🍿🍿🍿🍿 rd to 3k before valentine...,valentine,Negative
9882,Binance Futures #KLAY/ #USDT Take-Profit targe...,binance future takeprofit target profit period...,Positive


In [10]:
# Asumsikan df adalah DataFrame Anda yang sudah dimuat
X = df['processed_text']  # Kolom teks yang sudah diproses
y = df['vader_sentiment']  # Target/Label

# Encoding target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)


# BARU
# Word Frequency Analysis
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
word_counts = tokenizer.word_counts

# Sorting word counts in descending order
sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

# Determining num_words for covering 95% of total frequency
cumulative_coverage = np.cumsum([x[1] for x in sorted_word_counts]) / sum(word_counts.values())
num_words = np.searchsorted(cumulative_coverage, 0.95) + 1  # e.g., to cover 95% of total frequency
# BARU

# Tokenisasi dan pembuatan sequences
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(sequences, maxlen=100)  # Sesuaikan maxlen sesuai dengan kebutuhan

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

# Batasan feature selection
dim = X_train.shape[1]  # Jumlah token maksimal dalam sequences
lb = [0] * dim
ub = [1] * dim

In [11]:
def objective_function(weights):
    # Memilih fitur berdasarkan bobot feature selection
    # Dalam konteks ini, weights akan menentukan embedding tokens yang akan digunakan
    selected_indices = np.where(weights > 0.5)[0]
    X_train_selected = X_train[:, selected_indices]
    X_test_selected = X_test[:, selected_indices]

    # Definisikan dan latih model LSTM di sini
    model = Sequential([
        Embedding(input_dim=num_words, output_dim=300, input_length=len(selected_indices)),  # Sesuaikan parameter
        LSTM(256, dropout=0.2, recurrent_dropout=0.2),
        Dense(y_train.shape[1], activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train_selected, y_train, epochs=10, batch_size=128, verbose=0)  # Kurangi epoch untuk kecepatan

    # Evaluasi model
    loss, accuracy = model.evaluate(X_test_selected, y_test, verbose=0)
    return -accuracy  # Negatif karena kita ingin memaksimalkan akurasi

In [None]:
# # Standardize features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# # Objective function for PSO
# def objective_function(feature_selection_vector):
#     selected_features = [i for i, bit in enumerate(feature_selection_vector) if bit > 0.5]  # Threshold at 0.5 for binary decision
#     if not selected_features:  # Avoid empty selection
#         return 1  # Worst case scenario
    
#     # Use only the selected features for training and testing
#     X_train_selected = X_train_scaled[:, selected_features]
#     X_test_selected = X_test_scaled[:, selected_features]
    
#     # Train a simple classifier on the selected features
#     clf = RandomForestClassifier(random_state=42)
#     clf.fit(X_train_selected, y_train)
    
#     # Predict on the test set and calculate accuracy
#     y_pred = clf.predict(X_test_selected)
#     accuracy = accuracy_score(y_test, y_pred)
    
#     # Return the negative accuracy as we want to maximize accuracy (minimize negative accuracy)
#     return -accuracy

# def objective_function_lstm(params):
#     num_lstm_units = int(params[0])  # Extract the optimized number of LSTM units
#     model = create_lstm_model(X_train.shape[1], y_train.shape[1], num_lstm_units)
#     model.fit(X_train, y_train, epochs=10, batch_size=128, verbose=0, validation_split=0.2, callbacks=[early_stopping])

#     # Use predict method and post-process to get predicted classes
#     y_prob = model.predict(X_test)
#     y_pred = np.argmax(y_prob, axis=1)

#     accuracy = accuracy_score(np.argmax(y_test, axis=1), y_pred)
#     return -accuracy

# def create_lstm_model(input_length, num_classes, num_lstm_units, dropout_rate=0.2):
#     # Define the LSTM model with specified dropout rate
#     model = Sequential([
#         Embedding(input_dim=num_words, output_dim=300, input_length=input_length),
#         LSTM(num_lstm_units, dropout=dropout_rate, recurrent_dropout=0.2),
#         Dense(num_classes, activation='softmax' if num_classes > 2 else 'sigmoid')
#     ])
#     # Compile the model with the appropriate loss function
#     model.compile(optimizer='adam', loss='categorical_crossentropy' if num_classes > 2 else 'binary_crossentropy', metrics=['accuracy'])
#     return model

# # PSO parameters
# lb = [0] * 20  # Lower bound (0 for each feature, meaning not selected)
# ub = [1] * 20  # Upper bound (1 for each feature, meaning selected)

In [12]:
# Run PSO
best_solution_pso, best_score_pso = pso(objective_function, lb, ub, swarmsize=15, maxiter=50, minfunc=0.01, minstep=0.01, debug=True)

# Identify selected features
selected_features_indices_pso = [i for i, bit in enumerate(best_solution_pso) if bit > 0.5]

No constraints given.
Best after iteration 1: [0.53258943 0.05182354 0.33660428 0.13441468 0.06337497 0.98996023
 0.32235384 0.80987445 0.25464065 0.68150272 0.76022786 0.59563874
 0.47157619 0.41184091 0.34886827 0.92952914 0.83061941 0.96502691
 0.12429722 0.73086748 0.93834046 0.18123307 0.06649627 0.74112065
 0.57447311 0.84182878 0.13977238 0.79526731 0.20162732 0.16365594
 0.1642658  0.81457472 0.66519722 0.52306542 0.35883048 0.87720054
 0.39244511 0.81659944 0.43913491 0.37694443 0.46267979 0.30137787
 0.74760938 0.50272039 0.2322127  0.89957457 0.38389122 0.54355286
 0.90647211 0.624238   0.11689804 0.93983212 0.62770805 0.33490561
 0.13927207 0.79402519 0.62007276 0.53346109 0.89389258 0.78859721
 0.15167488 0.31172207 0.24848914 0.74394629 0.03353243 0.56988968
 0.76245869 0.87676564 0.34208175 0.8212573  0.11063174 0.84645229
 0.12748866 0.39728729 0.79729537 0.14991743 0.2292514  0.72225257
 0.72003654 0.64114763 0.69394844 0.54272444 0.25179906 0.34569599
 0.18159772 0.90

In [21]:
def aco(func, num_ants, num_iterations, lb, ub, decay=0.95, alpha=1.0, beta=1.0):
    best_score = -float('inf')  # Inisialisasi dengan nilai terburuk; gunakan -inf karena kita mencari maksimum
    best_solution = None
    # Inisialisasi level feromon
    pheromone_levels = np.ones(ub - lb) * 0.1  # Asumsikan semua path dimulai dengan level feromon yang sama dan rendah
    
    for iteration in range(num_iterations):
        # Simpan solusi dan skor untuk setiap semut
        solutions = []
        scores = []

        for ant in range(num_ants):
            # Membangun solusi berdasarkan distribusi probabilitas feromon
            p = (pheromone_levels ** alpha) * ((1.0 / (np.arange(ub - lb) + 1)) ** beta)
            p /= p.sum()  # Normalisasi probabilitas
            chosen_index = np.random.choice(np.arange(ub - lb), p=p)
            solution = lb + chosen_index
            
            # Hitung skor menggunakan fungsi objektif
            score = func([solution])  # Asumsi fungsi objektif menerima list

            solutions.append(solution)
            scores.append(score)
            
            # Perbarui solusi terbaik
            if score > best_score:
                best_score = score
                best_solution = solution

        # Perbarui level feromon untuk semua solusi
        for solution, score in zip(solutions, scores):
            pheromone_levels[solution - lb] += 1.0 / (1.0 - score)  # Update feromon berdasarkan kebaikan solusi

        # Terapkan penguapan feromon
        pheromone_levels *= decay

    return best_solution, best_score

In [22]:
best_solution_aco, best_score_aco = aco(func=objective_function, lb=lb[0], ub=ub[0], num_ants=15, num_iterations=50, decay=0.95, alpha=1.0, beta=2.0)
selected_features_indices_aco = [i for i, bit in enumerate(best_solution_aco) if bit > 0.5]

TypeError: '>' not supported between instances of 'list' and 'float'

In [None]:
best_solution_cso, best_score_cso = cso(func=objective_function, lb=lb, ub=ub, num_cats=15, num_iterations=50, mixture_ratio=0.5)
selected_features_indices_cso = [i for i, bit in enumerate(best_solution_cso) if bit > 0.5]

In [None]:
# Data untuk membuat tabel
data = {
    "Algorithm": ["PSO", "ACO", "CSO"],
    "Best Score (Accuracy)": [-best_score_pso, -best_score_aco, -best_score_cso],  # Mengubah skor menjadi positif karena skor negatif digunakan untuk optimisasi
    "Number of Selected Features": [len(selected_features_indices_pso), len(selected_features_indices_aco), len(selected_features_indices_cso)],
    "Selected Features Indices": [selected_features_indices_pso, selected_features_indices_aco, selected_features_indices_cso]
}

# Membuat DataFrame
df = pd.DataFrame(data)

# Menampilkan DataFrame
print(df.to_string(index=False))