In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
# Load dataset
data = pd.read_csv('C://Users//Jayashrinidhi V//OneDrive//Documents//VScode//AlgoMaster//encrypted_messages_dataset12.csv')
print(data.head())

  algorithm                                          plaintext  \
0       AES  The volunteerism is a key component of communi...   
1       AES  +"f&R*~?GE3%-'}}i5bMx"IB:dqN0?2/R)sHWm;\.:SP1$...   
2       AES  The educational technology is a key component ...   
3       AES                        [GmcvmxtqF4}`}wOs NyXORrP{*   
4       AES  The encryption is a key component of global po...   

                                                 key  \
0  06f50db57c46117664333a82aa4628adf24baf21367d26...   
1  5bbff7147513d5e3624560a71214653e3ebe1b330c41e1...   
2  e229117d280cafcec62d4cbf6555bdee64da661d5226c1...   
3  74c9b7ebe8cc4cf50c99c34ede237bf48cf3813c3f9569...   
4  2226799dab3df3b994d8ee4f86398ba251c99c25ac4b6a...   

                                 iv  \
0  8855be909cbf4c1cf3f646fcfe0bb375   
1  bfa2fe6369565e48493aecfe3c625f78   
2  d95b59ff2e2911759b468342e537eb26   
3  237b47f8047ba7480f365e62b16487f6   
4  b7c4f3f628754f520f27655145ed4d7c   

                               

In [6]:
# Split dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)


In [4]:
# Encode the target variable
le = LabelEncoder()
data['algorithm_encoded'] = le.fit_transform(data['algorithm'])

# Define features and target
X = data['encrypted_message']
y = data['algorithm_encoded']
# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_features = vectorizer.fit_transform(X)

# Define base models
base_models = [
    ('logreg', LogisticRegression(max_iter=1000)),
    ('svc', SVC(probability=True)),
    ('decision_tree', DecisionTreeClassifier()),
    ('random_forest', RandomForestClassifier()),
    ('gbm', GradientBoostingClassifier()),
    ('xgboost', xgb.XGBClassifier(eval_metric='mlogloss')),
    ('catboost', CatBoostClassifier(silent=True)),
    ('knn', KNeighborsClassifier()),
    ('naive_bayes', MultinomialNB())
]

# Define meta-model for stacking
meta_model = LogisticRegression(max_iter=1000)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Initialize the classifier
rf_clf = RandomForestClassifier(random_state=42)

# Perform Grid Search with Cross-Validation
rf_grid_search = GridSearchCV(estimator=rf_clf, param_grid=rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Best parameters and model
print("Best parameters for Random Forest:", rf_grid_search.best_params_)
best_rf_model = rf_grid_search.best_estimator_


Best parameters for Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid for KNN
knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Initialize the classifier
knn_clf = KNeighborsClassifier()

# Perform Grid Search with Cross-Validation
knn_grid_search = GridSearchCV(estimator=knn_clf, param_grid=knn_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
knn_grid_search.fit(X_train, y_train)

# Best parameters and model
print("Best parameters for KNN:", knn_grid_search.best_params_)
best_knn_model = knn_grid_search.best_estimator_


Best parameters for KNN: {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'uniform'}


In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize the classifier
xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Perform Grid Search with Cross-Validation
xgb_grid_search = GridSearchCV(estimator=xgb_clf, param_grid=xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)

# Best parameters and model
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)
best_xgb_model = xgb_grid_search.best_estimator_


Best parameters for XGBoost: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.6}


  _data = np.array(data, dtype=dtype, copy=copy,
Parameters: { "use_label_encoder" } are not used.



In [10]:
from sklearn.model_selection import cross_val_score

# Evaluate using cross-validation
rf_cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring='accuracy')
knn_cv_scores = cross_val_score(best_knn_model, X_train, y_train, cv=5, scoring='accuracy')
xgb_cv_scores = cross_val_score(best_xgb_model, X_train, y_train, cv=5, scoring='accuracy')

print("Random Forest CV Accuracy:", rf_cv_scores.mean())
print("KNN CV Accuracy:", knn_cv_scores.mean())
print("XGBoost CV Accuracy:", xgb_cv_scores.mean())


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Random Forest CV Accuracy: 0.20916666666666667
KNN CV Accuracy: 0.195
XGBoost CV Accuracy: 0.20916666666666667


Parameters: { "use_label_encoder" } are not used.



In [11]:
# Example for RandomForest with class weights
rf_clf_with_weights = RandomForestClassifier(random_state=42, class_weight='balanced')

# Fit the model
rf_clf_with_weights.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf_weights = rf_clf_with_weights.predict(X_test)
print("RandomForest with class weights Report:\n", classification_report(y_test, y_pred_rf_weights))


RandomForest with class weights Report:
               precision    recall  f1-score   support

           0       0.16      1.00      0.28        49
           1       0.00      0.00      0.00        56
           2       0.00      0.00      0.00        66
           3       0.00      0.00      0.00        64
           4       0.00      0.00      0.00        65

    accuracy                           0.16       300
   macro avg       0.03      0.20      0.06       300
weighted avg       0.03      0.16      0.05       300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
