In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:

# Load dataset
data = pd.read_csv('C://Users//Jayashrinidhi V//OneDrive//Documents//VScode//AlgoMaster//encrypted_messages_dataset12.csv')
print(data.head())

  algorithm                                          plaintext  \
0       AES  The volunteerism is a key component of communi...   
1       AES  +"f&R*~?GE3%-'}}i5bMx"IB:dqN0?2/R)sHWm;\.:SP1$...   
2       AES  The educational technology is a key component ...   
3       AES                        [GmcvmxtqF4}`}wOs NyXORrP{*   
4       AES  The encryption is a key component of global po...   

                                                 key  \
0  06f50db57c46117664333a82aa4628adf24baf21367d26...   
1  5bbff7147513d5e3624560a71214653e3ebe1b330c41e1...   
2  e229117d280cafcec62d4cbf6555bdee64da661d5226c1...   
3  74c9b7ebe8cc4cf50c99c34ede237bf48cf3813c3f9569...   
4  2226799dab3df3b994d8ee4f86398ba251c99c25ac4b6a...   

                                 iv  \
0  8855be909cbf4c1cf3f646fcfe0bb375   
1  bfa2fe6369565e48493aecfe3c625f78   
2  d95b59ff2e2911759b468342e537eb26   
3  237b47f8047ba7480f365e62b16487f6   
4  b7c4f3f628754f520f27655145ed4d7c   

                               

In [3]:
# Encode the target variable
le = LabelEncoder()
data['algorithm_encoded'] = le.fit_transform(data['algorithm'])

# Define features and target
X = data['encrypted_message']
y = data['algorithm_encoded']
# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_features = vectorizer.fit_transform(X)

# Define base models
base_models = [
    ('logreg', LogisticRegression(max_iter=1000)),
    ('svc', SVC(probability=True)),
    ('decision_tree', DecisionTreeClassifier()),
    ('random_forest', RandomForestClassifier()),
    ('gbm', GradientBoostingClassifier()),
    ('xgboost', xgb.XGBClassifier(eval_metric='mlogloss')),
    ('catboost', CatBoostClassifier(silent=True)),
    ('knn', KNeighborsClassifier()),
    ('naive_bayes', MultinomialNB())
]

# Define meta-model for stacking
meta_model = LogisticRegression(max_iter=1000)

In [5]:
# Create Voting Classifier
voting_clf = VotingClassifier(estimators=base_models, voting='soft')
voting_clf.fit(X_features, y)


In [6]:
# Create Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)
stacking_clf.fit(X_features, y)

In [8]:
"""from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# Resample and vectorize the dataset
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_features, y)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Use unigrams and bigrams
X_features = vectorizer.fit_transform(X)

# Split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

# Define Classifiers
classifiers = {
    'random_forest': RandomForestClassifier(n_estimators=50, random_state=42),
    'knn': KNeighborsClassifier(),
    'xgboost': xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=42)
}

# Train and evaluate each Classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"{name} Classifier Report:\n", classification_report(y_test, y_pred, target_names=le.classes_, zero_division=1))
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# Resample and vectorize the dataset
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_features, y)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Use unigrams and bigrams
X_features = vectorizer.fit_transform(X)

# Split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

# Define and tune RandomForestClassifier
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Evaluate the best model
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print("RandomForestClassifier Report:\n", classification_report(y_test, y_pred, target_names=le.classes_, zero_division=1))


RandomForestClassifier Report:
               precision    recall  f1-score   support

        3DES       0.16      1.00      0.28        49
         AES       1.00      0.00      0.00        56
    Blowfish       1.00      0.00      0.00        66
    ChaCha20       1.00      0.00      0.00        64
         DES       1.00      0.00      0.00        65

    accuracy                           0.16       300
   macro avg       0.83      0.20      0.06       300
weighted avg       0.86      0.16      0.05       300



In [9]:
# Split dataset for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

# Evaluate Voting Classifier
y_pred_voting = voting_clf.predict(X_test)
print("Voting Classifier Report:\n", classification_report(y_test, y_pred_voting, target_names=le.classes_))

# Evaluate Stacking Classifier
y_pred_stacking = stacking_clf.predict(X_test)
print("Stacking Classifier Report:\n", classification_report(y_test, y_pred_stacking, target_names=le.classes_))

# Evaluate Bagging Classifier
y_pred_bagging = bagging_clf.predict(X_test)
print("Bagging Classifier Report:\n", classification_report(y_test, y_pred_bagging, target_names=le.classes_))


Voting Classifier Report:
               precision    recall  f1-score   support

        3DES       1.00      1.00      1.00        49
         AES       1.00      1.00      1.00        56
    Blowfish       1.00      1.00      1.00        66
    ChaCha20       1.00      1.00      1.00        64
         DES       1.00      1.00      1.00        65

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

Stacking Classifier Report:
               precision    recall  f1-score   support

        3DES       0.16      1.00      0.28        49
         AES       0.00      0.00      0.00        56
    Blowfish       0.00      0.00      0.00        66
    ChaCha20       0.00      0.00      0.00        64
         DES       0.00      0.00      0.00        65

    accuracy                           0.16       300
   macro avg       0.03      0.20      0.06       300
weighted avg       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NameError: name 'bagging_clf' is not defined