In [4]:
import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn import svm, tree, neighbors, ensemble, linear_model
from sklearn.neighbors import KNeighborsClassifier
import itertools
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os
import importlib
from model import *
from joblib import Parallel, delayed
from tqdm import tqdm
from tqdm_joblib import tqdm_joblib

os.chdir(r'C:\Users\hp\Desktop\projects\ensemble_learning_project')

seed = 55

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# dataset 0
# TRAIN SET
train_data_dataset_0 = np.load(r'dataset/train/dataset_0/kaggle_source_cate_0_train.npy')
train_labels_dataset_0 = np.load(r'dataset/train/dataset_0/kaggle_source_cate_0_train_label.npy')
# TEST SET
test_data_dataset_0 = np.load(r'dataset/test/dataset_0/kaggle_source_cate_0_test.npy')
test_labels_dataset_0 = np.load(r'dataset/test/dataset_0/kaggle_source_cate_0_test_label.npy')

# dataset 1
# TRAIN SET
train_data_dataset_1 = np.load(r'dataset/train/dataset_1/kaggle_source_cate_1_train.npy')
train_labels_dataset_1 = np.load(r'dataset/train/dataset_1/kaggle_source_cate_1_train_label.npy')
# TEST SET
test_data_dataset_1 = np.load(r'dataset/test/dataset_1/kaggle_source_cate_1_test.npy')
test_labels_dataset_1 = np.load(r'dataset/test/dataset_1/kaggle_source_cate_1_test_label.npy')

# dataset 2
# TRAIN SET
train_data_dataset_2 = np.load(r'dataset/train/dataset_2/kaggle_source_cate_2_train.npy')
train_labels_dataset_2 = np.load(r'dataset/train/dataset_2/kaggle_source_cate_2_train_label.npy')
# TEST SET
test_data_dataset_2 = np.load(r'dataset/test/dataset_2/kaggle_source_cate_2_test.npy')
test_labels_dataset_2 = np.load(r'dataset/test/dataset_2/kaggle_source_cate_2_test_label.npy')

# dataset 3
# TRAIN SET
train_data_dataset_3 = np.load(r'dataset/train/dataset_3/kaggle_source_cate_3_train.npy')
train_labels_dataset_3 = np.load(r'dataset/train/dataset_3/kaggle_source_cate_3_train_label.npy')
# TEST SET
test_data_dataset_3 = np.load(r'dataset/test/dataset_3/kaggle_source_cate_3_test.npy')
test_labels_dataset_3 = np.load(r'dataset/test/dataset_3/kaggle_source_cate_3_test_label.npy')


In [6]:
print(train_data_dataset_0.shape, train_labels_dataset_0.shape)
print(test_data_dataset_0.shape, test_labels_dataset_0.shape)

print(train_data_dataset_1.shape, train_labels_dataset_1.shape)
print(test_data_dataset_1.shape, test_labels_dataset_1.shape)

print(train_data_dataset_2.shape, train_labels_dataset_2.shape)
print(test_data_dataset_2.shape, test_labels_dataset_2.shape)

print(train_data_dataset_3.shape, train_labels_dataset_3.shape)
print(test_data_dataset_3.shape, test_labels_dataset_3.shape)


(41058, 51) (41058, 2)
(13686, 51) (13686, 2)
(41058, 51) (41058, 2)
(13686, 51) (13686, 2)
(41058, 51) (41058, 2)
(13686, 51) (13686, 2)
(41058, 51) (41058, 2)
(13686, 51) (13686, 2)


In [7]:
def calculate_proportions(dataset, dataset_name):
    labels = dataset[:, 1]
    unique, counts = np.unique(labels, return_counts=True)
    proportions = dict(zip(unique, counts / len(labels)))
    print(f"Proportions for {dataset_name}: {proportions}")

calculate_proportions(train_labels_dataset_0, "train_labels_dataset_0")
calculate_proportions(train_labels_dataset_1, "train_labels_dataset_1")
calculate_proportions(train_labels_dataset_2, "train_labels_dataset_2")
calculate_proportions(train_labels_dataset_3, "train_labels_dataset_3")


Proportions for train_labels_dataset_0: {np.int64(0): np.float64(0.8995810804228165), np.int64(1): np.float64(0.10041891957718349)}
Proportions for train_labels_dataset_1: {np.int64(0): np.float64(0.9004335330508062), np.int64(1): np.float64(0.09956646694919383)}
Proportions for train_labels_dataset_2: {np.int64(0): np.float64(0.901675678308734), np.int64(1): np.float64(0.09832432169126601)}
Proportions for train_labels_dataset_3: {np.int64(0): np.float64(0.9018461688343319), np.int64(1): np.float64(0.09815383116566807)}


In [8]:

# Define the datasets with both train and test data
datasets = [
    {
        'X_train': train_data_dataset_0, 
        'y_train': train_labels_dataset_0[:,1], 
        'X_test': test_data_dataset_0, 
        'y_test': test_labels_dataset_0[:,1], 
        'sampling': 'none'  # No sampling
    },
    {
        'X_train': train_data_dataset_1, 
        'y_train': train_labels_dataset_1[:,1], 
        'X_test': test_data_dataset_1, 
        'y_test': test_labels_dataset_1[:,1], 
        'sampling': 'undersampling'  # Undersampling
    },
    {
        'X_train': train_data_dataset_2, 
        'y_train': train_labels_dataset_2[:,1], 
        'X_test': test_data_dataset_2, 
        'y_test': test_labels_dataset_2[:,1], 
        'sampling': 'oversampling'  # Oversampling
    },
    {
        'X_train': train_data_dataset_3, 
        'y_train': train_labels_dataset_3[:,1], 
        'X_test': test_data_dataset_3, 
        'y_test': test_labels_dataset_3[:,1], 
        'sampling': 'cost_sensitive'  # Cost-sensitive learning
    }
]

# Define the models
models = ['Random Forest', 'Bagging', 'Boosting', 'Penalized Logistic Regression', 'Simple Decision Tree', 'XGBoost', 'Stacking']
models = ['Simple Decision Tree', 'XGBoost']

# Function to apply model and capture results
def run_model_on_dataset(model, dataset, i):
    print(f"Processing dataset {i+1} with sampling: {dataset['sampling']}, model: {model}")
    
    # Extract train and test sets for the current dataset
    X_train = dataset['X_train']
    y_train = dataset['y_train']
    X_test = dataset['X_test']
    y_test = dataset['y_test']
    sampling_method = dataset['sampling']
    
    # Call the apply_algo function and capture the results
    best_params, test_scores, elapsed_time = apply_algo(
        model, X_train, y_train, X_test, y_test, sampling=sampling_method)
    
    # Return the results as a dictionary
    return {
        'dataset': i+1,
        'sampling': sampling_method,
        'model': model,
        'best_params': best_params,
        'F1 Score': test_scores['F1 Score'],
        'Precision': test_scores['Precision'],
        'Recall': test_scores['Recall'],
        'Balanced accuracy': test_scores['Balanced accuracy'],
        'elapsed_time': elapsed_time
    }

# Parallel execution
# Use joblib's Parallel and delayed to parallelize model runs
results_list = Parallel(n_jobs=-1)(delayed(run_model_on_dataset)(model, dataset, i) 
                                   for i, dataset in enumerate(datasets) 
                                   for model in models)

#print("Results List:", results_list)

results_df = pd.DataFrame(results_list)
#results_df.to_csv(r'results/results.csv', index=False)


In [None]:
# results_df.to_csv(r'results/results.csv', index=False)


In [11]:
def calculate_proportions(dataset, dataset_name):
    labels = dataset[:, 1]
    unique, counts = np.unique(labels, return_counts=True)
    proportions = dict(zip(unique, counts / len(labels)))
    print(f"Proportions for {dataset_name}: {proportions}")

calculate_proportions(train_labels_dataset_0, "train_labels_dataset_0")
calculate_proportions(train_labels_dataset_1, "train_labels_dataset_1")
calculate_proportions(train_labels_dataset_2, "train_labels_dataset_2")
calculate_proportions(train_labels_dataset_3, "train_labels_dataset_3")


Proportions for train_labels_dataset_0: {np.int64(0): np.float64(0.8995810804228165), np.int64(1): np.float64(0.10041891957718349)}
Proportions for train_labels_dataset_1: {np.int64(0): np.float64(0.9004335330508062), np.int64(1): np.float64(0.09956646694919383)}
Proportions for train_labels_dataset_2: {np.int64(0): np.float64(0.901675678308734), np.int64(1): np.float64(0.09832432169126601)}
Proportions for train_labels_dataset_3: {np.int64(0): np.float64(0.9018461688343319), np.int64(1): np.float64(0.09815383116566807)}


In [13]:
X = train_data_dataset_0
y = train_labels_dataset_0[:,1]

models = ['Simple Decision Tree', 'Penalized Logistic Regression', 'Bagging', 'Random Forest', 'Boosting', 'XGBoost', 'Stacking']

models = ['Random Forest','Bagging', 'Penalized Logistic Regression']

best_params_list = []
test_score_list = []
elapsed_time_list = []

for model in models:
    print(f"Running model: {model}")
    
    # Call the apply_algo function and capture the results
    best_params, test_score, elapsed_time = apply_algo(model, X, y)
    
    # Append the results to the respective lists
    best_params_list.append(best_params)
    test_score_list.append(test_score)
    elapsed_time_list.append(elapsed_time)

# After the loop, you'll have three lists populated with results for each model
print("Best Parameters List:", best_params_list)
print("Test Score List:", test_score_list)
print("Elapsed Time List:", elapsed_time_list)


Running model: Random Forest


TypeError: apply_algo() missing 2 required positional arguments: 'X_test' and 'y_test'