<a href="https://colab.research.google.com/github/KAVYANSHTYAGI/Ransomware-Analysis-using-Machine-Learning-and-Deep-Learning/blob/main/ML_models_ransomware.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to this Colab runtime; the dataset CSV read later in the
# notebook lives under /content/drive/MyDrive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
!pip install -U imbalanced-learn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline



In [None]:
# Read the feature table from the mounted Drive folder.
data_path = '/content/drive/MyDrive/ransomware_analysis_files/gan_for_synthetic/balanced_oversampled_very_noisy_extended_5k.csv'
dataset = pd.read_csv(data_path)

# Rows whose label 'Tag_y' is missing are reported and dropped before X and y are built.
target_is_missing = dataset['Tag_y'].isna()
if target_is_missing.any():
    print("NaN values found in target variable 'Tag_y', removing rows...")
    dataset = dataset.dropna(subset=['Tag_y'])

# Columns kept out of the feature matrix: the target itself, the second tag column,
# the file name, and six further feature columns.
excluded_columns = [
    'Tag_y',
    'Tag_x',
    'filename',
    'cryptographic_usage_encryption_algorithms',
    'complexity_metrics_function_count',
    'data_flow_collections_usage',
    'hardcoded_urls',
    'obfuscation_techniques_variable_name_length',
    'unique_suspicious_strings',
]
y = dataset['Tag_y']
X = dataset.drop(columns=excluded_columns)

# 80/20 train/test split with a fixed seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:


# Tune five classical classifiers with a 5-fold grid search on the training split,
# then score each refit best pipeline on the held-out test split.

# Candidate estimators. The tree-based models draw random numbers while fitting, so
# they are seeded with the same value (42) already used for the train/test split and
# SMOTE; without a seed every run of this cell reports slightly different scores.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Hyper-parameter grids, keyed by the same names as `models`. The 'classifier__'
# prefix routes each parameter to the pipeline step named 'classifier'.
params = {
    'Logistic Regression': {'classifier__C': [0.1, 1, 10]},
    'SVM': {'classifier__C': [0.1, 1, 10], 'classifier__kernel': ['linear', 'rbf']},
    'Decision Tree': {'classifier__max_depth': [10, 20, 30]},
    'Random Forest': {'classifier__n_estimators': [100, 200, 300]},
    'Gradient Boosting': {'classifier__n_estimators': [100, 200, 300], 'classifier__learning_rate': [0.01, 0.1, 0.2]}
}

# model name -> best parameters, test accuracy and classification report
results = {}

# Running GridSearchCV for each model
for name, model in models.items():
    # A fresh pipeline per model: impute -> scale -> SMOTE -> classifier.
    # The imblearn pipeline applies the sampler only while fitting, so validation
    # folds and the test split are scored on data that was not resampled.
    pipeline = IMBPipeline([
        ('imputer', SimpleImputer(strategy='median')),  # Impute missing values
        ('scaler', StandardScaler()),  # Normalize features
        ('smote', SMOTE(random_state=42)),  # Handle class imbalance
        ('classifier', model)
    ])
    grid_search = GridSearchCV(pipeline, param_grid=params[name], cv=5, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the winning configuration refit on the whole training split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    results[name] = {
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred)
    }

# Print results for all models
for model_name, result in results.items():
    print(f"Results for {model_name}:")
    print("Best Parameters:", result['Best Parameters'])
    print("Accuracy:", result['Accuracy'])
    print("Classification Report:\n", result['Classification Report'])


Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ..................................classifier__C=0.1; total time=   0.3s
[CV] END ..................................classifier__C=0.1; total time=   0.2s
[CV] END ..................................classifier__C=0.1; total time=   0.2s
[CV] END ..................................classifier__C=0.1; total time=   0.2s
[CV] END ..................................classifier__C=0.1; total time=   0.1s
[CV] END ....................................classifier__C=1; total time=   0.3s
[CV] END ....................................classifier__C=1; total time=   0.3s
[CV] END ....................................classifier__C=1; total time=   0.3s
[CV] END ....................................classifier__C=1; total time=   0.3s
[CV] END ....................................classifier__C=1; total time=   0.2s
[CV] END ...................................classifier__C=10; total time=   0.3s
[CV] END ...................................class

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.pipeline import Pipeline as IMBPipeline
from imblearn.over_sampling import SMOTE


# Stacked ensemble (LR + SVM + MLP base learners, LR meta-learner) tuned end to end
# with a 5-fold grid search, then evaluated on the held-out test split.

# Setting up the Stacking Classifier.
# SVC(probability=True) and MLPClassifier both use random numbers while fitting, so
# they are seeded (42, the value the split and SMOTE already use) to make the
# reported scores repeatable.
stacking_classifier = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000)),
        ('svm', SVC(probability=True, random_state=42)),
        ('mlp', MLPClassifier(max_iter=1000, random_state=42))
    ],
    final_estimator=LogisticRegression()
)

# Full pipeline built in one expression (impute -> scale -> SMOTE -> stacked
# classifier) instead of creating it without a final estimator and appending to
# `pipeline.steps` afterwards.
pipeline = IMBPipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', stacking_classifier)
])

# Setting parameters for grid search.
# 'classifier__<name>__<param>' reaches into the named base learner of the stack;
# 'classifier__final_estimator__C' tunes the meta-learner.
# 3 * 3 * 2 * 3 * 3 = 162 candidates, i.e. 810 fits with cv=5.
param_grid = {
    'classifier__lr__C': [0.1, 1, 10],
    'classifier__svm__C': [0.1, 1, 10],
    'classifier__svm__kernel': ['rbf', 'linear'],
    'classifier__mlp__alpha': [0.0001, 0.001, 0.01],
    'classifier__final_estimator__C': [0.1, 1, 10]
}

# Configuring GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

# Best model, predict and evaluate (predict() delegates to the refit best estimator)
print("Best Parameters:", grid_search.best_params_)
y_pred = grid_search.predict(X_test)
print("Grid Search Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END classifier__final_estimator__C=0.1, classifier__lr__C=0.1, classifier__mlp__alpha=0.0001, classifier__svm__C=0.1, classifier__svm__kernel=rbf; total time=  30.8s
[CV] END classifier__final_estimator__C=0.1, classifier__lr__C=0.1, classifier__mlp__alpha=0.0001, classifier__svm__C=0.1, classifier__svm__kernel=rbf; total time=  28.8s
[CV] END classifier__final_estimator__C=0.1, classifier__lr__C=0.1, classifier__mlp__alpha=0.0001, classifier__svm__C=0.1, classifier__svm__kernel=rbf; total time=  31.6s
[CV] END classifier__final_estimator__C=0.1, classifier__lr__C=0.1, classifier__mlp__alpha=0.0001, classifier__svm__C=0.1, classifier__svm__kernel=rbf; total time=  29.2s
[CV] END classifier__final_estimator__C=0.1, classifier__lr__C=0.1, classifier__mlp__alpha=0.0001, classifier__svm__C=0.1, classifier__svm__kernel=rbf; total time=  28.6s
[CV] END classifier__final_estimator__C=0.1, classifier__lr__C=0.1, classifier__ml

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import Pipeline as IMBPipeline
from imblearn.over_sampling import SMOTE


# Same tuning procedure as the five-model comparison, extended to nine classifiers:
# 5-fold grid search on the training split, then test-set evaluation of the best
# pipeline for each model.

# Candidate estimators. Every estimator that uses randomness while fitting is seeded
# with 42 (the value already used for the train/test split and SMOTE) so that a
# fresh run of the notebook reproduces the same scores.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'MLP': MLPClassifier(max_iter=1000, random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Bagging': BaggingClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42)
}

# Hyper-parameter grids, keyed by the same names as `models`. The 'classifier__'
# prefix routes each parameter to the pipeline step named 'classifier'.
params = {
    'Logistic Regression': {'classifier__C': [0.1, 1, 10]},
    'SVM': {'classifier__C': [0.1, 1, 10], 'classifier__kernel': ['linear', 'rbf']},
    'Decision Tree': {'classifier__max_depth': [10, 20, 30]},
    'Random Forest': {'classifier__n_estimators': [100, 200, 300]},
    'Gradient Boosting': {'classifier__n_estimators': [100, 200, 300], 'classifier__learning_rate': [0.01, 0.1, 0.2]},
    'MLP': {
        'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'classifier__activation': ['tanh', 'relu'],
        'classifier__alpha': [0.0001, 0.001, 0.01]
    },
    'AdaBoost': {'classifier__n_estimators': [50, 100, 150], 'classifier__learning_rate': [0.01, 0.1, 1]},
    'Bagging': {'classifier__n_estimators': [10, 50, 100]},
    'Extra Trees': {'classifier__n_estimators': [100, 200, 300]}
}

# model name -> best parameters, test accuracy and classification report
results = {}

# Running GridSearchCV for each model
for name, model in models.items():
    # A fresh pipeline per model: impute -> scale -> SMOTE -> classifier.
    pipeline = IMBPipeline([
        ('imputer', SimpleImputer(strategy='median')),  # Impute missing values
        ('scaler', StandardScaler()),  # Normalize features
        ('smote', SMOTE(random_state=42)),  # Handle class imbalance
        ('classifier', model)
    ])
    grid_search = GridSearchCV(pipeline, param_grid=params[name], cv=5, scoring='accuracy', verbose=2)
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the winning configuration refit on the whole training split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    results[name] = {
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred)
    }

# Print results for all models
for model_name, result in results.items():
    print(f"Results for {model_name}:")
    print("Best Parameters:", result['Best Parameters'])
    print("Accuracy:", result['Accuracy'])
    print("Classification Report:\n", result['Classification Report'])




Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ..................................classifier__C=0.1; total time=   0.2s
[CV] END ..................................classifier__C=0.1; total time=   0.2s
[CV] END ..................................classifier__C=0.1; total time=   0.2s
[CV] END ..................................classifier__C=0.1; total time=   0.2s
[CV] END ..................................classifier__C=0.1; total time=   0.1s
[CV] END ....................................classifier__C=1; total time=   0.3s
[CV] END ....................................classifier__C=1; total time=   0.3s
[CV] END ....................................classifier__C=1; total time=   0.2s
[CV] END ....................................classifier__C=1; total time=   0.2s
[CV] END ....................................classifier__C=1; total time=   0.2s
[CV] END ...................................classifier__C=10; total time=   0.3s
[CV] END ...................................class



[CV] END classifier__learning_rate=0.01, classifier__n_estimators=50; total time=   0.8s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=150; total time=   2.1s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=0.01, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=50; total time=   0.8s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=50; total time=   0.8s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=50; total time=   0.8s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=50; total time=   0.8s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=100; total time=   1.3s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=150; total time=   2.1s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=150; total time=   2.1s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=0.1, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=1, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=1, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=1, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=1, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=1, classifier__n_estimators=50; total time=   0.7s




[CV] END classifier__learning_rate=1, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=1, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=1, classifier__n_estimators=100; total time=   1.5s




[CV] END classifier__learning_rate=1, classifier__n_estimators=100; total time=   1.4s




[CV] END classifier__learning_rate=1, classifier__n_estimators=100; total time=   1.3s




[CV] END classifier__learning_rate=1, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=1, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=1, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=1, classifier__n_estimators=150; total time=   2.0s




[CV] END classifier__learning_rate=1, classifier__n_estimators=150; total time=   2.0s




Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ........................classifier__n_estimators=10; total time=   0.6s
[CV] END ........................classifier__n_estimators=10; total time=   0.7s
[CV] END ........................classifier__n_estimators=10; total time=   0.7s
[CV] END ........................classifier__n_estimators=10; total time=   0.6s
[CV] END ........................classifier__n_estimators=10; total time=   0.6s
[CV] END ........................classifier__n_estimators=50; total time=   2.8s
[CV] END ........................classifier__n_estimators=50; total time=   3.0s
[CV] END ........................classifier__n_estimators=50; total time=   2.8s
[CV] END ........................classifier__n_estimators=50; total time=   2.7s
[CV] END ........................classifier__n_estimators=50; total time=   2.7s
[CV] END .......................classifier__n_estimators=100; total time=   5.5s
[CV] END .......................classifier__n_est

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import Pipeline as IMBPipeline
from imblearn.over_sampling import SMOTE


# Define a pipeline for preprocessing and model fitting
def get_pipeline(model):
    """Wrap ``model`` in the preprocessing and resampling pipeline used in this notebook.

    Args:
        model: The estimator to place in the final pipeline step.

    Returns:
        An imblearn pipeline whose steps are, in order: 'imputer' (median
        imputation), 'scaler' (standardisation), 'smote' (SMOTE seeded with 42)
        and 'classifier' (``model``).
    """
    steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model),
    ]
    return IMBPipeline(steps)

# Compare stacked ensembles that share the same base learners (RF, GB, MLP) but use
# a different final estimator on top.

# Define base models for stacking. All three use randomness while fitting, so they
# are seeded with 42 (the value the split and SMOTE already use) to make the
# reported accuracies repeatable.
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('mlp', MLPClassifier(max_iter=1000, random_state=42))
]

# Define stacking configurations with various final estimators (seeded likewise
# where the estimator is stochastic).
stacking_configurations = {
    'Stacking with Logistic Regression': LogisticRegression(),
    'Stacking with RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Stacking with Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Stacking with MLP': MLPClassifier(max_iter=1000, random_state=42)
}

# configuration name -> test accuracy and classification report
results = {}
for name, final_estimator in stacking_configurations.items():
    stacking_clf = StackingClassifier(estimators=base_models, final_estimator=final_estimator, cv=5)
    # get_pipeline adds imputation, scaling and SMOTE in front of the stacked classifier.
    pipeline = get_pipeline(stacking_clf)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred)
    }

# Print results for all models
for model_name, result in results.items():
    print(f"Results for {model_name}:")
    print("Accuracy:", result['Accuracy'])
    print("Classification Report:\n", result['Classification Report'])


Results for Stacking with Logistic Regression:
Accuracy: 0.9958385351643778
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      1202
           1       0.99      1.00      1.00      1201

    accuracy                           1.00      2403
   macro avg       1.00      1.00      1.00      2403
weighted avg       1.00      1.00      1.00      2403

Results for Stacking with RandomForest:
Accuracy: 0.9970869746150645
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1202
           1       1.00      1.00      1.00      1201

    accuracy                           1.00      2403
   macro avg       1.00      1.00      1.00      2403
weighted avg       1.00      1.00      1.00      2403

Results for Stacking with Gradient Boosting:
Accuracy: 0.9962546816479401
Classification Report:
               precision    recall  f1-score   support



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import Pipeline as IMBPipeline
from imblearn.over_sampling import SMOTE


# Define a pipeline for preprocessing and model fitting
def get_pipeline(model):
    """Return an imblearn pipeline that preprocesses, resamples and then fits ``model``.

    Identical to the ``get_pipeline`` defined in an earlier cell of this notebook.

    Args:
        model: The estimator used as the last step, named 'classifier'.

    Returns:
        IMBPipeline with the steps 'imputer' (median), 'scaler', 'smote'
        (random_state=42) and 'classifier', in that order.
    """
    imputer = SimpleImputer(strategy='median')   # fill NaN values with the column median
    scaler = StandardScaler()                    # standardise the features
    oversampler = SMOTE(random_state=42)         # seeded oversampling for class imbalance
    return IMBPipeline([
        ('imputer', imputer),
        ('scaler', scaler),
        ('smote', oversampler),
        ('classifier', model),
    ])

# Define stacking configurations with various combinations of base models.
# Each entry pairs two base learners with a final estimator. Every estimator that
# uses randomness while fitting is seeded with 42 (the value the split and SMOTE
# already use) so that the reported accuracies are repeatable.
stacking_models = {
    'Stacking RF and GB': {
        'estimators': [
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
        ],
        'final_estimator': LogisticRegression()
    },
    'Stacking SVM and MLP': {
        'estimators': [
            ('svm', SVC(probability=True, random_state=42)),
            ('mlp', MLPClassifier(max_iter=1000, random_state=42))
        ],
        'final_estimator': RandomForestClassifier(n_estimators=100, random_state=42)
    },
    'Stacking DT and RF': {
        'estimators': [
            ('dt', DecisionTreeClassifier(max_depth=10, random_state=42)),
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
        ],
        'final_estimator': GradientBoostingClassifier(n_estimators=100, random_state=42)
    },
    'Stacking LR and SVM': {
        'estimators': [
            ('lr', LogisticRegression(max_iter=1000)),
            ('svm', SVC(probability=True, random_state=42))
        ],
        'final_estimator': MLPClassifier(max_iter=1000, random_state=42)
    }
}

# configuration name -> test accuracy and classification report
results = {}
for model_name, config in stacking_models.items():
    stacking_clf = StackingClassifier(estimators=config['estimators'], final_estimator=config['final_estimator'], cv=5)
    # get_pipeline adds imputation, scaling and SMOTE in front of the stacked classifier.
    pipeline = get_pipeline(stacking_clf)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred)
    }

# Print results for all models
for model_name, result in results.items():
    print(f"Results for {model_name}:")
    print("Accuracy:", result['Accuracy'])
    print("Classification Report:\n", result['Classification Report'])


Results for Stacking RF and GB:
Accuracy: 0.9954223886808157
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      1202
           1       0.99      1.00      1.00      1201

    accuracy                           1.00      2403
   macro avg       1.00      1.00      1.00      2403
weighted avg       1.00      1.00      1.00      2403

Results for Stacking SVM and MLP:
Accuracy: 0.9937578027465668
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      1202
           1       0.99      1.00      0.99      1201

    accuracy                           0.99      2403
   macro avg       0.99      0.99      0.99      2403
weighted avg       0.99      0.99      0.99      2403

Results for Stacking DT and RF:
Accuracy: 0.9958385351643778
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99 

In [None]:
# Define a new stacking model with Gradient Boosting and MLP as base learners and a
# random forest as final estimator. All three use randomness while fitting, so they
# are seeded with 42 (the value the split and SMOTE already use) to make the
# reported score repeatable.
new_stacking_clf = StackingClassifier(
    estimators=[
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('mlp', MLPClassifier(max_iter=1000, random_state=42))
    ],
    final_estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    cv=5
)

# Prepare the pipeline (relies on get_pipeline from an earlier cell), fit it on the
# training split and predict the held-out test split.
pipeline = get_pipeline(new_stacking_clf)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Evaluate the new model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print("Accuracy for Stacking GB and MLP:", accuracy)
print("Classification Report:\n", classification_rep)


Accuracy for Stacking GB and MLP: 0.9937578027465668
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      1202
           1       0.99      1.00      0.99      1201

    accuracy                           0.99      2403
   macro avg       0.99      0.99      0.99      2403
weighted avg       0.99      0.99      0.99      2403



In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as IMBPipeline



# Define the base models for various stacking configurations.
# Each entry combines three base learners with a final estimator. Every estimator
# that uses randomness while fitting is seeded with 42 (the value the split and
# SMOTE already use) so that the reported accuracies are repeatable.
# NOTE(review): 'Stacking RF, GB, and MLP' and 'Stacking MLP, RF, and GB' list the
# same base learners and the same final estimator, only in a different order.
stacking_models = {
    'Stacking RF, GB, and MLP': {
        'estimators': [
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
            ('mlp', MLPClassifier(max_iter=1000, random_state=42))
        ],
        'final_estimator': LogisticRegression()
    },
    'Stacking SVM, DT, and RF': {
        'estimators': [
            ('svm', SVC(probability=True, random_state=42)),
            ('dt', DecisionTreeClassifier(max_depth=10, random_state=42)),
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
        ],
        'final_estimator': LogisticRegression()
    },
    'Stacking LR, SVM, and MLP': {
        'estimators': [
            ('lr', LogisticRegression(max_iter=1000)),
            ('svm', SVC(probability=True, random_state=42)),
            ('mlp', MLPClassifier(max_iter=1000, random_state=42))
        ],
        'final_estimator': RandomForestClassifier(n_estimators=100, random_state=42)
    },
    'Stacking GB, DT, and SVM': {
        'estimators': [
            ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
            ('dt', DecisionTreeClassifier(max_depth=10, random_state=42)),
            ('svm', SVC(probability=True, random_state=42))
        ],
        'final_estimator': MLPClassifier(max_iter=1000, random_state=42)
    },
    'Stacking MLP, RF, and GB': {
        'estimators': [
            ('mlp', MLPClassifier(max_iter=1000, random_state=42)),
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
        ],
        'final_estimator': LogisticRegression()
    }
}

# Training and evaluating each stacking configuration
# configuration name -> test accuracy and classification report
results = {}
for name, config in stacking_models.items():
    # impute -> scale -> SMOTE -> stacked classifier
    pipeline = IMBPipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', StackingClassifier(estimators=config['estimators'], final_estimator=config['final_estimator'], cv=5))
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred)
    }

# Print results for all models
for model_name, result in results.items():
    print(f"Results for {model_name}:")
    print("Accuracy:", result['Accuracy'])
    print("Classification Report:\n", result['Classification Report'])


Results for Stacking RF, GB, and MLP:
Accuracy: 0.9958385351643778
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      1202
           1       0.99      1.00      1.00      1201

    accuracy                           1.00      2403
   macro avg       1.00      1.00      1.00      2403
weighted avg       1.00      1.00      1.00      2403

Results for Stacking SVM, DT, and RF:
Accuracy: 0.9962546816479401
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      1202
           1       0.99      1.00      1.00      1201

    accuracy                           1.00      2403
   macro avg       1.00      1.00      1.00      2403
weighted avg       1.00      1.00      1.00      2403

Results for Stacking LR, SVM, and MLP:
Accuracy: 0.9933416562630045
Classification Report:
               precision    recall  f1-score   support

           0     

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import Pipeline as IMBPipeline
from imblearn.over_sampling import SMOTE



# Neural Network as base estimator: two MLPs that differ in hidden-layer width.
# MLP fitting uses randomness, so both are seeded with 42 (the value the split and
# SMOTE already use) to make the reported accuracies repeatable.
neural_networks = [
    ('mlp1', MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)),
    ('mlp2', MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42))
]

# Traditional ML models as final estimator (the two ensemble models are seeded likewise)
final_estimators = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    GradientBoostingClassifier(n_estimators=100, random_state=42),
    LogisticRegression(max_iter=1000)
]

# Create stacked models using neural networks as base and traditional ML models as final.
# Keys are derived from the final estimator's class name.
stacked_models = {}
for final_estimator in final_estimators:
    model_name = f"Stacked NNs with {final_estimator.__class__.__name__}"
    stacked_models[model_name] = StackingClassifier(
        estimators=neural_networks,
        final_estimator=final_estimator,
        cv=5
    )

# Train and evaluate each model
# model name -> test accuracy and classification report
results = {}
for name, model in stacked_models.items():
    # impute -> scale -> SMOTE -> stacked classifier
    pipeline = IMBPipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred)
    }

# Print results for all models
for model_name, result in results.items():
    print(f"Results for {model_name}:")
    print("Accuracy:", result['Accuracy'])
    print("Classification Report:\n", result['Classification Report'])


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.pipeline import Pipeline as IMBPipeline
from imblearn.over_sampling import SMOTE


# Base model - MLP. Seeded with 42 (the value the split and SMOTE already use)
# because MLP fitting uses randomness; without a seed the reported accuracies
# change from run to run.
mlp_base = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

# Stacking configurations: the same single MLP base learner under two different
# final estimators. SVC(probability=True) is seeded for the same reason as the MLP.
stacking_configurations = {
    'Stacking MLP with Logistic Regression': {
        'estimators': [('mlp', mlp_base)],
        'final_estimator': LogisticRegression()
    },
    'Stacking MLP with SVM': {
        'estimators': [('mlp', mlp_base)],
        'final_estimator': SVC(probability=True, random_state=42)
    }
}

# Train and evaluate each model
# configuration name -> test accuracy and classification report
results = {}
for name, config in stacking_configurations.items():
    # impute -> scale -> SMOTE -> stacked classifier
    pipeline = IMBPipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', StackingClassifier(estimators=config['estimators'], final_estimator=config['final_estimator'], cv=5))
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred)
    }

# Print results for all models
for model_name, result in results.items():
    print(f"Results for {model_name}:")
    print("Accuracy:", result['Accuracy'])
    print("Classification Report:\n", result['Classification Report'])
