In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, 
    AdaBoostClassifier, StackingClassifier
)
from sklearn.model_selection import (
    cross_val_score, StratifiedKFold, train_test_split, cross_val_predict, 
    GridSearchCV, RandomizedSearchCV
)
from sklearn.metrics import (
    accuracy_score, f1_score, make_scorer, roc_auc_score, 
    confusion_matrix, classification_report
)
from sklearn.preprocessing import (
    StandardScaler, LabelEncoder, OneHotEncoder, RobustScaler
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import RFECV, SelectFromModel

from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import make_pipeline as make_imblearn_pipeline
from imblearn.pipeline import Pipeline as imblearn_Pipeline

from feature_engine.encoding import CountFrequencyEncoder
from xgboost import XGBClassifier

from scipy.stats import randint, uniform

import random

from Functions_Classes import *

from sklearn.base import BaseEstimator, TransformerMixin

pd.set_option('display.max_columns', None)
np.random.seed(0)


In [2]:
df = pd.read_excel("C:/Users/Cagan Deliktas/Desktop/ProjectDataMining2/DM2_DataCraft/data/training_data.xls")
X_test = pd.read_excel("C:/Users/Cagan Deliktas/Desktop/ProjectDataMining2/DM2_DataCraft/data/test_data_no_target.xls")

df = df.loc[:, df.columns != 'Perform']
#df = df.loc[:, df.columns != 'Group']


X_train = df.loc[:, df.columns != 'Class']
y_train = df['Class']

In [3]:
numeric_columns = X_train.loc[:, ~X_train.columns.isin(['Group'])].columns.to_list()
X_train.loc[:, numeric_columns] = X_train.loc[:, numeric_columns].replace(
    {
        'NA': np.nan, 
        '': np.nan,
        ' ': np.nan
    }
).astype(float)

numeric_columns = X_test.loc[:, ~X_test.columns.isin(['Group'])].columns.to_list()
X_test[numeric_columns] = X_test.loc[:, numeric_columns].replace(
    {
        'NA': np.nan, 
        '': np.nan,
        ' ': np.nan
    }
).astype(float)

In [4]:
X_train.head()

Unnamed: 0,Group,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13,I14,I15,I16,I17,I18,I19,I20,I21,I22,I23,I24,I25,I26,I27,I28,I29,I30,I31,I32,I33,I34,I35,I36,I37,I38,I39,I40,I41,I42,I43,I44,I45,I46,I47,I48,I49,I50,I51,I52,I53,I54,I55,I56,I57,I58,dI1,dI2,dI3,dI4,dI5,dI6,dI7,dI8,dI9,dI10,dI11,dI12,dI13,dI14,dI15,dI16,dI17,dI18,dI19,dI20,dI21,dI22,dI23,dI24,dI25,dI26,dI27,dI28,dI29,dI30,dI31,dI32,dI33,dI34,dI35,dI36,dI37,dI38,dI39,dI40,dI41,dI42,dI43,dI44,dI45,dI46,dI47,dI48,dI49,dI50,dI51,dI52,dI53,dI54,dI55,dI56,dI57,dI58
0,G9,0.136495,-0.028429,-0.037772,-0.232459,-0.016222,-0.187506,-0.322545,-0.043743,0.125389,-0.014757,-0.033105,0.303035,-0.093811,-0.598917,-0.271292,-0.256749,-0.100146,-0.045525,-0.078422,-0.060129,-0.069528,-0.052432,-0.114432,-0.104989,0.342845,-0.159417,0.006772,-0.303193,-0.163287,-0.080599,-0.82888,-1.064215,-0.547067,-0.540497,-0.676045,-0.305007,-0.507724,-0.191437,-0.087362,-0.856151,0.802525,0.73308,0.006512,0.53329,0.195197,0.058094,-0.228889,-0.150821,-0.104986,-0.026743,0.188312,-0.250701,-0.10119,-0.357521,-0.527956,0.611385,-0.092714,-0.055733,-0.065709,-0.002144,-0.004367,-0.079805,0.17828,0.078155,0.072802,0.00209,0.21177,-0.003073,-0.188447,0.117769,0.001613,-0.024223,0.103204,0.032484,0.002688,0.000765,-0.004447,0.148967,-0.018521,-0.01411,-0.001996,-0.002369,-0.120036,0.013172,-0.215571,-0.021999,0.001728,-5e-05,-0.01212,-0.040172,-0.060103,-0.059464,-0.044899,0.015735,0.022919,-0.003106,0.001233,-0.002339,0.040628,0.411684,0.07309,0.526222,0.07106,-0.019531,0.359889,-0.020476,0.057151,0.07711,0.102563,0.188481,-0.016027,-0.135451,-0.189667,0.250967,0.022171,-0.004265
1,G5,-0.714522,-0.042137,-0.052968,-0.796862,-0.018394,0.070102,-0.076321,-0.063864,-1.045521,-0.037353,-0.792515,-1.082483,0.025798,-0.833652,-0.625088,-0.333608,0.072579,-0.046963,0.223022,-0.605902,-0.131099,-0.235929,-0.07392,-0.063247,-0.798768,-0.899983,1.388771,-0.248677,-0.058083,-0.01447,0.092095,0.561368,0.224819,0.22319,0.098852,-0.128227,-0.215876,-0.007164,-0.03526,-0.123911,-0.089751,-0.094963,0.362818,0.011107,-1.506356,-0.573679,-0.955222,-0.81888,-1.063295,-1.022679,-1.336188,-0.612039,-0.061357,-0.482805,-0.017077,1.192135,-0.114981,-0.028074,-0.004451,-0.000536,-0.002288,-0.045597,-0.080639,-0.081924,-0.033862,-0.005111,-0.261836,0.000122,-0.045046,0.999854,-0.008835,-0.122379,-0.199892,0.013615,0.014404,-0.000405,0.021573,-0.02416,-0.03742,-0.01261,0.003007,0.003617,-0.106893,-0.394834,-0.132496,-0.027354,-0.129804,-0.066157,-0.494334,0.123781,0.284328,0.281308,0.212767,0.192042,0.146926,-0.118826,-0.039203,-0.256107,0.176622,0.16884,0.487752,0.029464,0.014232,0.039633,0.025667,0.006626,0.00518,0.006128,-0.016375,0.020727,-0.006525,-0.01879,-0.098543,0.317744,-0.180502,-0.009215
2,G10,0.104791,-0.038188,-0.053191,0.620233,0.148587,0.489875,0.319274,-0.060246,0.053174,-0.025008,-0.45684,1.28445,-0.13347,3.207672,2.37323,1.304427,,,-0.361293,2.995661,,-0.188988,-0.044158,-0.02455,-0.586562,-0.176292,-1.013037,0.066912,0.219649,0.15449,2.370951,1.384675,0.489152,0.484715,0.367301,0.749572,0.66941,0.423228,0.226897,3.227283,-0.329997,-0.327579,-1.033898,0.014531,0.211889,-1.197156,2.860444,,3.584223,,1.272375,7.427558,-0.182816,-2.713205,-1.877595,-0.568691,0.224945,0.052749,0.37764,0.002656,0.001226,0.22606,0.207653,0.270327,0.283061,0.002934,0.454366,0.004264,0.188623,-0.265918,0.0,2.063796,1.076458,0.240011,,,-0.028327,1.764826,,0.005847,-0.011166,-0.012626,-0.010822,0.056514,-0.100007,-0.216081,-0.127274,-0.056206,0.175751,-0.01177,0.493157,0.487919,0.438576,0.574623,0.564379,-0.165933,-0.051256,0.410379,0.056624,0.047592,0.0,-0.020586,0.237539,0.017314,0.516667,,0.404158,,0.272937,0.774169,-0.007144,0.123954,0.0,-0.110103,0.186669,-0.03072
3,G2,-0.532847,-0.006582,-0.023377,1.306702,-0.068909,0.048024,-0.119481,-0.021057,-1.012916,-0.011783,1.206727,0.311773,-0.005928,3.869459,-1.064793,0.107702,-0.126984,-0.04436,-0.181023,-0.691971,,0.195138,-0.104877,-0.093976,-0.757725,0.004432,-1.471299,0.643575,-0.067005,-0.006874,-0.087499,0.110638,0.04688,0.047141,-0.274713,0.169046,-0.179742,0.047391,0.015197,0.105158,-0.045135,-0.051329,0.202098,0.034693,2.904519,4.514844,-0.241111,,-0.521576,,-0.308812,-0.542532,-0.165028,1.490354,-1.550745,-0.918676,0.013484,-0.013198,0.050586,0.010356,0.007522,0.194792,0.010436,0.10788,0.122549,0.017641,0.136566,0.010365,0.086853,-0.286395,-0.014883,0.347297,0.017765,0.068701,0.01554,0.000208,0.016119,0.003992,,0.043909,-0.000107,9.9e-05,-0.003895,0.00249,-0.003034,-0.015845,0.002377,0.001974,0.05634,0.010802,0.063094,0.062424,0.057012,0.118399,0.116161,-0.017039,0.000839,0.054025,0.030561,0.006389,-0.073937,0.764136,-0.076195,-0.114682,0.119667,,0.001799,,0.004938,0.018494,-0.00335,-0.029214,0.045747,-0.076884,-0.037859,-0.012046
4,G3,-0.200815,-0.016334,-0.036754,-0.886675,0.484495,-1.148744,0.152517,-0.04358,-0.935537,-0.023262,-0.908986,-0.525121,0.015492,-0.347325,0.29636,-0.242201,0.120049,-0.048293,0.290658,-0.345816,0.249586,-0.241812,-0.082055,-0.077706,-0.845163,-0.257777,0.919065,-0.522102,0.146076,0.043851,1.281726,0.039106,0.135331,0.134652,0.654099,1.437536,1.995784,-0.145004,-0.029483,0.252151,0.308723,0.293393,-0.527888,-0.00368,-1.553644,-1.233945,-0.947111,-0.926073,-0.772468,-0.63644,-0.833875,-0.527935,-0.01417,-0.142943,1.070523,-0.284682,-0.15511,-0.026941,0.480767,0.021831,-0.003234,-0.041412,0.112513,-0.157224,-0.14618,-0.014677,-0.45195,0.034598,-0.114443,-0.307095,-0.346711,0.104144,-0.50892,-0.096666,0.044162,0.000159,0.085082,0.254664,-0.000408,-0.01539,-0.006226,-0.012542,-0.101059,0.091145,0.28211,-0.005348,0.112377,0.036976,0.73157,0.050165,0.038419,0.038011,0.265998,1.61412,1.806955,-0.122743,-0.001985,0.126103,0.630259,0.618027,-1.599633,0.032793,-0.126733,-0.163593,-0.225889,-0.02646,-0.080892,-0.095963,-0.014812,-0.324584,-0.019002,-0.379323,-0.046024,0.282145,0.011008,0.010496


## Shape

In [5]:
X_train.shape

(8000, 117)

# Detect Categorical and Numerical Columns

In [7]:
categorical_features = ['Group']
numerical_features = [col for col in X_train.columns if col not in categorical_features + ['Class']]

## Preprocessing Pipeline

In [8]:
numerical_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)), 
    ('scaler', RobustScaler()),
])

categorical_pipeline = Pipeline([
    ('one_hot', OneHotEncoder())
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

#rfecv_classifier = DecisionTreeClassifier(random_state=42)
#selector = RFECV(estimator=rfecv_classifier, step=1, cv=StratifiedKFold(3), scoring=matrix_error_function)

# Classification Models

In [14]:
final_classifiers = {
    'random_forest': RandomForestClassifier(random_state=42),
    'xgboost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'gradient_boosting': GradientBoostingClassifier(random_state=42)
}

### 1st Pipeline

In [10]:
y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

cv_scores = {}

for name, classifier in final_classifiers.items():
    pipeline = make_imblearn_pipeline(
        preprocessor,
        SMOTE(random_state=42),
        #selector,
        classifier
    )
   
    scores = cross_val_score(pipeline, X_train, y_train_mapped, cv=StratifiedKFold(5), scoring=matrix_error_function, n_jobs=-1)
   
    cv_scores[name] = scores
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Cross-validation scores for random_forest: [-0.910625 -0.921875 -0.93375  -0.918125 -0.94375 ]
Average cost error for random_forest: -0.9256249999999999
Cross-validation scores for xgboost: [-0.92     -0.88     -0.895    -0.853125 -0.8875  ]
Average cost error for xgboost: -0.8871249999999999
Cross-validation scores for gradient_boosting: [-0.903125 -0.8775   -0.88     -0.880625 -0.89375 ]
Average cost error for gradient_boosting: -0.8869999999999999


### 2nd pipeline

In [12]:
y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

cv_scores = {}

for name, classifier in final_classifiers.items():
    pipeline = make_imblearn_pipeline(
        preprocessor,
        RandomUnderSampler(random_state=42),
        #selector,
        classifier
    )
   
    scores = cross_val_score(pipeline, X_train, y_train_mapped, cv=StratifiedKFold(5), scoring=matrix_error_function, n_jobs=-1)
   
    cv_scores[name] = scores
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Cross-validation scores for random_forest: [-0.906875 -0.894375 -0.916875 -0.895    -0.891875]
Average cost error for random_forest: -0.901
Cross-validation scores for xgboost: [-0.92125  -0.92     -0.876875 -0.881875 -0.87875 ]
Average cost error for xgboost: -0.8957499999999999
Cross-validation scores for gradient_boosting: [-0.91375  -0.873125 -0.885625 -0.879375 -0.91    ]
Average cost error for gradient_boosting: -0.892375


### 3rd Pipeline

In [14]:
y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

cv_scores = {}

for name, classifier in final_classifiers.items():
    pipeline = make_imblearn_pipeline(
        preprocessor,
        RandomOverSampler(random_state=42),
        #selector,
        classifier
    )
   
    scores = cross_val_score(pipeline, X_train, y_train_mapped, cv=StratifiedKFold(5), scoring=matrix_error_function, n_jobs=-1)
   
    cv_scores[name] = scores
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Cross-validation scores for random_forest: [-0.91     -0.903125 -0.949375 -0.933125 -0.91375 ]
Average cost error for random_forest: -0.921875
Cross-validation scores for xgboost: [-0.916875 -0.893125 -0.87875  -0.88125  -0.9075  ]
Average cost error for xgboost: -0.8955
Cross-validation scores for gradient_boosting: [-0.936875 -0.8675   -0.889375 -0.871875 -0.885   ]
Average cost error for gradient_boosting: -0.8901249999999999


### 4th Pipeline

In [15]:
y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

cv_scores = {}

for name, classifier in final_classifiers.items():
    pipeline = make_imblearn_pipeline(
        preprocessor,
        #RandomOverSampler(random_state=42),
        #selector,
        classifier
    )
   
    scores = cross_val_score(pipeline, X_train, y_train_mapped, cv=StratifiedKFold(5), scoring=matrix_error_function, n_jobs=-1)
   
    cv_scores[name] = scores
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Cross-validation scores for random_forest: [-0.90375  -0.8975   -0.9325   -0.874375 -0.9175  ]
Average cost error for random_forest: -0.905125
Cross-validation scores for xgboost: [-0.909375 -0.915625 -0.89375  -0.889375 -0.9075  ]
Average cost error for xgboost: -0.903125
Cross-validation scores for gradient_boosting: [-0.899375 -0.87875  -0.88125  -0.87375  -0.88875 ]
Average cost error for gradient_boosting: -0.884375


### 5th Pipeline

In [10]:
n_neighbors = 20
contamination = 0.1
cv_scores_five = dict()
cv_scores_five['random_forest'] = []
cv_scores_five['xgboost'] = []
cv_scores_five['gradient_boosting'] = []

y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train_mapped.iloc[train_index], y_train_mapped.iloc[test_index]
   
    X_train_fold_preprocessed = preprocessor.fit_transform(X_train_fold)
    X_test_fold_preprocessed = preprocessor.transform(X_test_fold)
   
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    outliers = lof.fit_predict(X_train_fold_preprocessed)
    mask = outliers != -1
    X_train_fold_filtered = X_train_fold_preprocessed[mask]
    y_train_fold_filtered = y_train_fold[mask]
   
    sampler = RandomOverSampler(random_state=42)
    X_train_fold_resampled, y_train_fold_resampled = sampler.fit_resample(X_train_fold_filtered, y_train_fold_filtered)

    print('Feature selection started!')
    feature_selector = SelectFromModel(DecisionTreeClassifier(random_state=42))
    feature_selector.fit(X_train_fold_resampled, y_train_fold_resampled)
    X_train_fold_selected = feature_selector.transform(X_train_fold_resampled)
    X_test_fold_selected = feature_selector.transform(X_test_fold_preprocessed)

    print('Model Fitting and Prediction started!')
    for name, classifier in final_classifiers.items():
        classifier.fit(X_train_fold_selected, y_train_fold_resampled)
        y_pred_fold = classifier.predict(X_test_fold_selected)
        score = custom_error_cost_score(y_test_fold, y_pred_fold)
        cv_scores_five[name].append(score)

Feature selection started!
Model Fitting and Prediction started!
Feature selection started!
Model Fitting and Prediction started!
Feature selection started!
Model Fitting and Prediction started!
Feature selection started!
Model Fitting and Prediction started!
Feature selection started!
Model Fitting and Prediction started!


In [11]:
for name, scores in cv_scores_five.items():
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Cross-validation scores for random_forest: [0.910625, 0.889375, 0.9075, 0.904375, 0.915625]
Average cost error for random_forest: 0.9055
Cross-validation scores for xgboost: [0.923125, 0.878125, 0.8725, 0.894375, 0.925625]
Average cost error for xgboost: 0.89875
Cross-validation scores for gradient_boosting: [0.918125, 0.893125, 0.875625, 0.8725, 0.896875]
Average cost error for gradient_boosting: 0.89125


### 6th Pipeline

In [12]:
n_neighbors = 20
contamination = 0.1
cv_scores = dict()
cv_scores['random_forest'] = []
cv_scores['xgboost'] = []
cv_scores['gradient_boosting'] = []

y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train_mapped.iloc[train_index], y_train_mapped.iloc[test_index]

    print('Initial shape: ', pd.DataFrame(X_train_fold).shape)
   
    X_train_fold = preprocessor.fit_transform(X_train_fold)
    X_test_fold = preprocessor.transform(X_test_fold)
   
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    outliers = lof.fit_predict(X_train_fold)
    mask = outliers != -1
    X_train_fold = X_train_fold[mask]
    y_train_fold = y_train_fold[mask]
   
    sampler = RandomUnderSampler(random_state=42)
    X_train_fold, y_train_fold = sampler.fit_resample(X_train_fold, y_train_fold)

    print('Feature selection started!')
    feature_selector = SelectFromModel(DecisionTreeClassifier(random_state=42))
    feature_selector.fit(X_train_fold, y_train_fold)
    X_train_fold = feature_selector.transform(X_train_fold)
    X_test_fold = feature_selector.transform(X_test_fold)

    print('Last shape: ', pd.DataFrame(X_train_fold).shape)

    print('Model Fitting and Prediction started!')
    for name, classifier in final_classifiers.items():
        classifier.fit(X_train_fold, y_train_fold)
        y_pred_fold = classifier.predict(X_test_fold)
        score = custom_error_cost_score(y_test_fold, y_pred_fold)
        cv_scores[name].append(score)
    print('Model Fitting and Prediction finished!')
    print('**********************************************************')
    
for name, scores in cv_scores.items():
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (2439, 58)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (2415, 59)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (2439, 61)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (2436, 58)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (2436, 63)
Model Fitting and Prediction started!
Model Fitting and Predi

### 7th Pipeline

In [13]:
n_neighbors = 20
contamination = 0.1
cv_scores = dict()
cv_scores['random_forest'] = []
cv_scores['xgboost'] = []
cv_scores['gradient_boosting'] = []

y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train_mapped.iloc[train_index], y_train_mapped.iloc[test_index]

    print('Initial shape: ', pd.DataFrame(X_train_fold).shape)
   
    X_train_fold = preprocessor.fit_transform(X_train_fold)
    X_test_fold = preprocessor.transform(X_test_fold)
   
    # lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    # outliers = lof.fit_predict(X_train_fold)
    # mask = outliers != -1
    # X_train_fold = X_train_fold[mask]
    # y_train_fold = y_train_fold[mask]
   
    sampler = RandomOverSampler(random_state=42)
    X_train_fold, y_train_fold = sampler.fit_resample(X_train_fold, y_train_fold)

    print('Feature selection started!')
    feature_selector = SelectFromModel(DecisionTreeClassifier(random_state=42))
    feature_selector.fit(X_train_fold, y_train_fold)
    X_train_fold = feature_selector.transform(X_train_fold)
    X_test_fold = feature_selector.transform(X_test_fold)

    print('Last shape: ', pd.DataFrame(X_train_fold).shape)

    print('Model Fitting and Prediction started!')
    for name, classifier in final_classifiers.items():
        classifier.fit(X_train_fold, y_train_fold)
        y_pred_fold = classifier.predict(X_test_fold)
        score = custom_error_cost_score(y_test_fold, y_pred_fold)
        cv_scores[name].append(score)
    print('Model Fitting and Prediction finished!')
    print('**********************************************************')
    
for name, scores in cv_scores.items():
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (9045, 61)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (9042, 58)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (9042, 63)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (9042, 66)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (9045, 61)
Model Fitting and Prediction started!
Model Fitting and Predi

### 8th Pipeline

In [14]:
n_neighbors = 20
contamination = 0.1
cv_scores = dict()
cv_scores['random_forest'] = []
cv_scores['xgboost'] = []
cv_scores['gradient_boosting'] = []

y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train_mapped.iloc[train_index], y_train_mapped.iloc[test_index]

    print('Initial shape: ', pd.DataFrame(X_train_fold).shape)
   
    X_train_fold = preprocessor.fit_transform(X_train_fold)
    X_test_fold = preprocessor.transform(X_test_fold)
   
    # lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    # outliers = lof.fit_predict(X_train_fold)
    # mask = outliers != -1
    # X_train_fold = X_train_fold[mask]
    # y_train_fold = y_train_fold[mask]
   
    # sampler = RandomOverSampler(random_state=42)
    # X_train_fold, y_train_fold = sampler.fit_resample(X_train_fold, y_train_fold)

    print('Feature selection started!')
    feature_selector = SelectFromModel(DecisionTreeClassifier(random_state=42))
    feature_selector.fit(X_train_fold, y_train_fold)
    X_train_fold = feature_selector.transform(X_train_fold)
    X_test_fold = feature_selector.transform(X_test_fold)

    print('Last shape: ', pd.DataFrame(X_train_fold).shape)

    print('Model Fitting and Prediction started!')
    for name, classifier in final_classifiers.items():
        classifier.fit(X_train_fold, y_train_fold)
        y_pred_fold = classifier.predict(X_test_fold)
        score = custom_error_cost_score(y_test_fold, y_pred_fold)
        cv_scores[name].append(score)
    print('Model Fitting and Prediction finished!')
    print('**********************************************************')
    
for name, scores in cv_scores.items():
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (6400, 62)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (6400, 64)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (6400, 61)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (6400, 61)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (6400, 58)
Model Fitting and Prediction started!
Model Fitting and Predi

## 9th Pipeline

In [15]:
n_neighbors = 20
contamination = 0.1
cv_scores = dict()
cv_scores['random_forest'] = []
cv_scores['xgboost'] = []
cv_scores['gradient_boosting'] = []

y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train_mapped.iloc[train_index], y_train_mapped.iloc[test_index]

    print('Initial shape: ', pd.DataFrame(X_train_fold).shape)
   
    X_train_fold = preprocessor.fit_transform(X_train_fold)
    X_test_fold = preprocessor.transform(X_test_fold)
   
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    outliers = lof.fit_predict(X_train_fold)
    mask = outliers != -1
    X_train_fold = X_train_fold[mask]
    y_train_fold = y_train_fold[mask]
   
    # sampler = RandomOverSampler(random_state=42)
    # X_train_fold, y_train_fold = sampler.fit_resample(X_train_fold, y_train_fold)

    print('Feature selection started!')
    feature_selector = SelectFromModel(DecisionTreeClassifier(random_state=42))
    feature_selector.fit(X_train_fold, y_train_fold)
    X_train_fold = feature_selector.transform(X_train_fold)
    X_test_fold = feature_selector.transform(X_test_fold)

    print('Last shape: ', pd.DataFrame(X_train_fold).shape)

    print('Model Fitting and Prediction started!')
    for name, classifier in final_classifiers.items():
        classifier.fit(X_train_fold, y_train_fold)
        y_pred_fold = classifier.predict(X_test_fold)
        score = custom_error_cost_score(y_test_fold, y_pred_fold)
        cv_scores[name].append(score)
    print('Model Fitting and Prediction finished!')
    print('**********************************************************')
    
for name, scores in cv_scores.items():
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (5760, 65)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (5760, 67)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (5760, 62)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (5760, 63)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Last shape:  (5760, 65)
Model Fitting and Prediction started!
Model Fitting and Predi

## 10th Pipeline

In [16]:
n_neighbors = 20
contamination = 0.1
cv_scores = dict()
cv_scores['random_forest'] = []
cv_scores['xgboost'] = []
cv_scores['gradient_boosting'] = []

y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train_mapped.iloc[train_index], y_train_mapped.iloc[test_index]

    print('Initial shape: ', pd.DataFrame(X_train_fold).shape)
   
    X_train_fold = preprocessor.fit_transform(X_train_fold)
    X_test_fold = preprocessor.transform(X_test_fold)
   
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    outliers = lof.fit_predict(X_train_fold)
    mask = outliers != -1
    X_train_fold = X_train_fold_preprocessed[mask]
    y_train_fold = y_train_fold[mask]
   
    sampler = RandomOverSampler(random_state=42)
    X_train_fold, y_train_fold = sampler.fit_resample(X_train_fold, y_train_fold)

    # print('Feature selection started!')
    # feature_selector = SelectFromModel(DecisionTreeClassifier(random_state=42))
    # feature_selector.fit(X_train_fold, y_train_fold)
    # X_train_fold = feature_selector.transform(X_train_fold)
    # X_test_fold = feature_selector.transform(X_test_fold)

    # print('Last shape: ', pd.DataFrame(X_train_fold).shape)

    print('Model Fitting and Prediction started!')
    for name, classifier in final_classifiers.items():
        classifier.fit(X_train_fold, y_train_fold)
        y_pred_fold = classifier.predict(X_test_fold)
        score = custom_error_cost_score(y_test_fold, y_pred_fold)
        cv_scores[name].append(score)
    print('Model Fitting and Prediction finished!')
    print('**********************************************************')
    
for name, scores in cv_scores.items():
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Initial shape:  (6400, 117)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Cross-validation scores for random_forest: [0.935, 0.9625, 0.99375, 0.998125, 0.903125]
Average cost error for random_forest: 0.9584999999999999
Cross-validation scores for xgboost

## 11th Pipeline

In [18]:
n_neighbors = 20
contamination = 0.1
cv_scores = dict()
cv_scores['random_forest'] = []
cv_scores['xgboost'] = []
cv_scores['gradient_boosting'] = []

y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train_mapped.iloc[train_index], y_train_mapped.iloc[test_index]

    print('Initial shape: ', pd.DataFrame(X_train_fold).shape)
   
    X_train_fold = preprocessor.fit_transform(X_train_fold)
    X_test_fold = preprocessor.transform(X_test_fold)
   
    # lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
    # outliers = lof.fit_predict(X_train_fold)
    # mask = outliers != -1
    # X_train_fold = X_train_fold_preprocessed[mask]
    # y_train_fold = y_train_fold[mask]
   
    sampler = RandomOverSampler(random_state=42)
    X_train_fold, y_train_fold = sampler.fit_resample(X_train_fold, y_train_fold)

    print('Feature selection started!')
    feature_selector = SelectFromModel(DecisionTreeClassifier(random_state=42))
    feature_selector.fit(X_train_fold, y_train_fold)
    X_train_fold = feature_selector.transform(X_train_fold)
    X_test_fold = feature_selector.transform(X_test_fold)

    print('Last shape: ', pd.DataFrame(X_train_fold).shape)

    print('Model Fitting and Prediction started!')
    for name, classifier in final_classifiers.items():
        classifier.fit(X_train_fold, y_train_fold)
        y_pred_fold = classifier.predict(X_test_fold)
        score = custom_error_cost_score(y_test_fold, y_pred_fold)
        cv_scores[name].append(score)
    print('Model Fitting and Prediction finished!')
    print('**********************************************************')
    
for name, scores in cv_scores.items():
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Initial shape:  (6400, 117)
Feature selection started!
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 117)
Feature selection started!
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Cross-validation scores for random_forest: [0

# Remove the Group Column

In [3]:
df = pd.read_excel("C:/Users/Cagan Deliktas/Desktop/ProjectDataMining2/DM2_DataCraft/data/training_data.xls")
X_test = pd.read_excel("C:/Users/Cagan Deliktas/Desktop/ProjectDataMining2/DM2_DataCraft/data/test_data_no_target.xls")

X_train = df.loc[:, ~df.columns.isin(['Group', 'Class', 'Perform'])]
y_train = df['Class']

numeric_columns = X_train.loc[:, ~X_train.columns.isin(['Group'])].columns.to_list()
X_train.loc[:, numeric_columns] = X_train.loc[:, numeric_columns].replace(
    {
        'NA': np.nan, 
        '': np.nan,
        ' ': np.nan
    }
).astype(float)

numeric_columns = X_test.loc[:, ~X_test.columns.isin(['Group'])].columns.to_list()
X_test[numeric_columns] = X_test.loc[:, numeric_columns].replace(
    {
        'NA': np.nan, 
        '': np.nan,
        ' ': np.nan
    }
).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:, numeric_columns] = X_train.loc[:, numeric_columns].replace(


In [4]:
preprocessor = Pipeline([
    ('imputer', KNNImputer(n_neighbors=10)), 
    ('scaler', RobustScaler()),
])

In [22]:
cv_scores = dict()
cv_scores['random_forest'] = []
cv_scores['xgboost'] = []
cv_scores['gradient_boosting'] = []

y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train_mapped.iloc[train_index], y_train_mapped.iloc[test_index]

    print('Initial shape: ', pd.DataFrame(X_train_fold).shape)
   
    X_train_fold = preprocessor.fit_transform(X_train_fold)
    X_test_fold = preprocessor.transform(X_test_fold)
   
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    outliers = lof.fit_predict(X_train_fold)
    mask = outliers != -1
    X_train_fold = X_train_fold[mask]
    y_train_fold = y_train_fold[mask]
   
    sampler = SMOTE(random_state=42)
    X_train_fold, y_train_fold = sampler.fit_resample(X_train_fold, y_train_fold)

    print('Last shape: ', pd.DataFrame(X_train_fold).shape)

    print('Model Fitting and Prediction started!')
    for name, classifier in final_classifiers.items():
        classifier.fit(X_train_fold, y_train_fold)
        y_pred_fold = classifier.predict(X_test_fold)
        score = custom_error_cost_score(y_test_fold, y_pred_fold)
        cv_scores[name].append(score)
    print('Model Fitting and Prediction finished!')
    print('**********************************************************')
    
for name, scores in cv_scores.items():
    print(f'Cross-validation scores for {name}: {scores}')
    print(f'Average cost error for {name}: {np.mean(scores)}')

Initial shape:  (6400, 116)
Last shape:  (8133, 116)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 116)
Last shape:  (8127, 116)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 116)
Last shape:  (8115, 116)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 116)
Last shape:  (8166, 116)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Initial shape:  (6400, 116)
Last shape:  (8142, 116)
Model Fitting and Prediction started!
Model Fitting and Prediction finished!
**********************************************************
Cross-validation scores for random_forest: [0.92875, 0.

### HyperParameter Tuning

In [29]:
y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

gb_pipeline = imblearn_Pipeline([
    ('imputer', KNNImputer(n_neighbors=10)),
    ('scaler', RobustScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

param_dist = {
    'classifier__n_estimators': randint(100, 350),
    'classifier__max_depth': randint(3, 10),
    'classifier__learning_rate': uniform(0.01, 0.3),
    'classifier__subsample': uniform(0.1, 0.9),
    'classifier__min_samples_split': randint(2, 20),
    'classifier__min_samples_leaf': randint(1, 20),
    'classifier__max_features': ['sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(
    gb_pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=StratifiedKFold(3),
    scoring=matrix_error_function,
    verbose=1,
    random_state=42,
    n_jobs=-1,
    error_score='raise'
)

random_search.fit(X_train, y_train_mapped)

print(f'Best parameters: {random_search.best_params_}')
print(f'Best cross-validation score: {random_search.best_score_}')

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters: {'classifier__learning_rate': 0.016175348288740735, 'classifier__max_depth': 4, 'classifier__max_features': 'log2', 'classifier__min_samples_leaf': 6, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 291, 'classifier__subsample': 0.9929904033620958}
Best cross-validation score: -0.888876043532818


In [10]:
y_train_mapped = y_train.copy()
y_train_mapped[y_train == -1] = 0
y_train_mapped[y_train == 0] = 1
y_train_mapped[y_train == 1] = 2

best_parameters = {'learning_rate': 0.016175348288740735, 
                   'max_depth': 4, 
                   'max_features': 'log2', 
                   'min_samples_leaf': 6, 
                   'min_samples_split': 3, 
                   'n_estimators': 291, 
                   'subsample': 0.9929904033620958,
                  'random_state': 42}

gb_pipeline_last = imblearn_Pipeline([
    ('imputer', KNNImputer(n_neighbors=10)),
    ('scaler', RobustScaler()),
    ('smote', SMOTE(random_state=42)),
    ('feature_selector', SelectFromModel(DecisionTreeClassifier(random_state=42))),
    ('classifier', GradientBoostingClassifier(**best_parameters))
])

In [11]:
gb_pipeline_last.fit(X_train, y_train_mapped)

In [14]:
X_test = X_test.drop('Group', axis=1)

In [16]:
preds = gb_pipeline_last.predict(X_test)

In [17]:
preds[preds == 0] = -1
preds[preds == 1] = 0
preds[preds == 2] = 1

In [18]:
preds

array([ 0,  1, -1, ...,  0, -1, -1], dtype=int64)

## Predictions Voting

In [19]:
file_path = "comb11.txt"
pd.DataFrame(preds).to_csv(file_path, index=False, header=False)