In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
import random
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import RFECV

from sklearn.impute import SimpleImputer, KNNImputer

import tensorflow as tf
from tensorflow.keras import layers, models

from Functions_Classes import *

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler

from feature_engine.encoding import CountFrequencyEncoder

from xgboost import XGBClassifier

import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Seed both the numpy and the stdlib random number generators so that a
# Restart-and-Run-All produces the same results; `random` is imported above
# but was previously left unseeded.
np.random.seed(0)
random.seed(0)

In [2]:
# Locations of the training sheet and of the test sheet without target.
# NOTE(review): both are absolute, machine-specific paths — adjust before running elsewhere.
TRAIN_XLS = "C:/Users/Cagan Deliktas/Desktop/ProjectDataMining2/DM2_DataCraft/data/training_data.xls"
TEST_XLS = "C:/Users/Cagan Deliktas/Desktop/ProjectDataMining2/DM2_DataCraft/data/test_data_no_target.xls"

df = pd.read_excel(TRAIN_XLS)
X_test_compete = pd.read_excel(TEST_XLS)

# 'Perform' is removed from the training frame (silently skipped when absent).
df = df.drop(columns='Perform', errors='ignore')
#df = df.drop(columns='Group', errors='ignore')

# 'Class' is the target; every remaining column is a feature.
df_y = df['Class']
df_x = df.drop(columns='Class')

# The whole training sheet is used for fitting (the hold-out split below is disabled).
X_train, y_train = df_x.copy(), df_y.copy()

#X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2,shuffle=True, stratify=df_y, random_state=0)

### Remove the one-year difference columns from the dataset

In [3]:
def _drop_d_columns(frame):
    """Return `frame` without the columns whose name contains the letter 'd'."""
    has_d = frame.columns.str.contains('d')
    return frame.loc[:, ~has_d]


# The same column filter is applied to the training and the competition features.
# NOTE(review): the pattern matches a lowercase 'd' anywhere in a column name —
# confirm that only the difference columns are affected.
X_train = _drop_d_columns(X_train)
X_test_compete = _drop_d_columns(X_test_compete)

In [4]:
# Placeholder strings that are treated as a missing value.
MISSING_TOKENS = {'NA': np.nan, '': np.nan, ' ': np.nan}


def _to_float_features(frame, columns):
    """Return a copy of `frame` whose `columns` have placeholders replaced by NaN and are cast to float.

    Working on a copy keeps the cell idempotent and avoids writing into a
    slice of an earlier frame.
    """
    cleaned = frame.copy()
    cleaned[columns] = cleaned[columns].replace(MISSING_TOKENS).astype(float)
    return cleaned


# Every column except the categorical 'Group' is handled as numeric.
numeric_columns = X_train.columns[~X_train.columns.isin(['Group'])].to_list()

# Apply the identical cleanup to the training and the competition features so
# both reach the imputer/scaler with the same dtypes.
# NOTE(review): assumes X_test_compete carries the same feature columns as X_train.
X_train = _to_float_features(X_train, numeric_columns)
X_test_compete = _to_float_features(X_test_compete, numeric_columns)

In [5]:
# Preview the first rows of the training features after the numeric conversion.
X_train.head()

Unnamed: 0,Group,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13,I14,I15,I16,I17,I18,I19,I20,I21,I22,I23,I24,I25,I26,I27,I28,I29,I30,I31,I32,I33,I34,I35,I36,I37,I38,I39,I40,I41,I42,I43,I44,I45,I46,I47,I48,I49,I50,I51,I52,I53,I54,I55,I56,I57,I58
0,G9,0.136495,-0.028429,-0.037772,-0.232459,-0.016222,-0.187506,-0.322545,-0.043743,0.125389,-0.014757,-0.033105,0.303035,-0.093811,-0.598917,-0.271292,-0.256749,-0.100146,-0.045525,-0.078422,-0.060129,-0.069528,-0.052432,-0.114432,-0.104989,0.342845,-0.159417,0.006772,-0.303193,-0.163287,-0.080599,-0.82888,-1.064215,-0.547067,-0.540497,-0.676045,-0.305007,-0.507724,-0.191437,-0.087362,-0.856151,0.802525,0.73308,0.006512,0.53329,0.195197,0.058094,-0.228889,-0.150821,-0.104986,-0.026743,0.188312,-0.250701,-0.10119,-0.357521,-0.527956,0.611385,-0.092714,-0.055733
1,G5,-0.714522,-0.042137,-0.052968,-0.796862,-0.018394,0.070102,-0.076321,-0.063864,-1.045521,-0.037353,-0.792515,-1.082483,0.025798,-0.833652,-0.625088,-0.333608,0.072579,-0.046963,0.223022,-0.605902,-0.131099,-0.235929,-0.07392,-0.063247,-0.798768,-0.899983,1.388771,-0.248677,-0.058083,-0.01447,0.092095,0.561368,0.224819,0.22319,0.098852,-0.128227,-0.215876,-0.007164,-0.03526,-0.123911,-0.089751,-0.094963,0.362818,0.011107,-1.506356,-0.573679,-0.955222,-0.81888,-1.063295,-1.022679,-1.336188,-0.612039,-0.061357,-0.482805,-0.017077,1.192135,-0.114981,-0.028074
2,G10,0.104791,-0.038188,-0.053191,0.620233,0.148587,0.489875,0.319274,-0.060246,0.053174,-0.025008,-0.45684,1.28445,-0.13347,3.207672,2.37323,1.304427,,,-0.361293,2.995661,,-0.188988,-0.044158,-0.02455,-0.586562,-0.176292,-1.013037,0.066912,0.219649,0.15449,2.370951,1.384675,0.489152,0.484715,0.367301,0.749572,0.66941,0.423228,0.226897,3.227283,-0.329997,-0.327579,-1.033898,0.014531,0.211889,-1.197156,2.860444,,3.584223,,1.272375,7.427558,-0.182816,-2.713205,-1.877595,-0.568691,0.224945,0.052749
3,G2,-0.532847,-0.006582,-0.023377,1.306702,-0.068909,0.048024,-0.119481,-0.021057,-1.012916,-0.011783,1.206727,0.311773,-0.005928,3.869459,-1.064793,0.107702,-0.126984,-0.04436,-0.181023,-0.691971,,0.195138,-0.104877,-0.093976,-0.757725,0.004432,-1.471299,0.643575,-0.067005,-0.006874,-0.087499,0.110638,0.04688,0.047141,-0.274713,0.169046,-0.179742,0.047391,0.015197,0.105158,-0.045135,-0.051329,0.202098,0.034693,2.904519,4.514844,-0.241111,,-0.521576,,-0.308812,-0.542532,-0.165028,1.490354,-1.550745,-0.918676,0.013484,-0.013198
4,G3,-0.200815,-0.016334,-0.036754,-0.886675,0.484495,-1.148744,0.152517,-0.04358,-0.935537,-0.023262,-0.908986,-0.525121,0.015492,-0.347325,0.29636,-0.242201,0.120049,-0.048293,0.290658,-0.345816,0.249586,-0.241812,-0.082055,-0.077706,-0.845163,-0.257777,0.919065,-0.522102,0.146076,0.043851,1.281726,0.039106,0.135331,0.134652,0.654099,1.437536,1.995784,-0.145004,-0.029483,0.252151,0.308723,0.293393,-0.527888,-0.00368,-1.553644,-1.233945,-0.947111,-0.926073,-0.772468,-0.63644,-0.833875,-0.527935,-0.01417,-0.142943,1.070523,-0.284682,-0.15511,-0.026941


## Shape

In [6]:
# (rows, columns) of the training features at this point.
X_train.shape

(8000, 59)

# Classification Models

In [7]:
def _tree_base_learners():
    """Return fresh (name, estimator) pairs for the tree-based base learners.

    Each call builds new, unfitted estimator objects, so the lists created
    below do not share these instances with one another.
    """
    return [
        ('RandomForest', RandomForestClassifier(random_state=0)),
        ('GradientBoost', GradientBoostingClassifier(random_state=0)),
        # `random_state` is the scikit-learn style seed argument, consistent
        # with the other estimators (previously passed as `seed`).
        ('XGBoost', XGBClassifier(random_state=0)),
    ]


# Candidates currently left out of every list below: DecisionTree, SVM,
# NaiveBayes, KNN, LogisticRegression and AdaBoost.

# Multi-layer perceptron with three hidden layers of 256, 128 and 64 units.
nn = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),
    activation='relu',
    solver='adam',
    max_iter=100000,
    random_state=0
)

# --- Voting ensemble: majority vote over the predicted class labels.
# NOTE(review): with four voters a hard vote can tie — confirm the
# tie-breaking behaviour of VotingClassifier is acceptable.
voting_estimators = _tree_base_learners() + [('NeuralNetwork', nn)]

vote_model = VotingClassifier(
    estimators=voting_estimators,
    voting='hard'
)

# --- Stacking ensemble: same base learners, combined by a logistic regression.
stacking_estimators = _tree_base_learners() + [('NeuralNetwork', nn)]
meta_stack_classifier = LogisticRegression(random_state=0, solver="saga", max_iter=1000)

stacking_model = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=meta_stack_classifier,
    cv=5
)

# --- Stand-alone models (the Voting, Stacking and NeuralNetwork entries are
# not part of this list).
estimators = _tree_base_learners()

# Create Pipeline with different combinations of preprocessing steps

## Combination 7
#### one-hot encoding, simple imputation, robust scaler, LOF outlier removal, random oversampling

In [8]:
# Fitted preprocessing objects, kept so the same transforms can be replayed on the test set.
objs = {}

# --- One-hot encode the categorical 'Group' column
X_trainP, ohe_encoder = apply_one_hot_encoding(X_train, 'Group')
objs['ohe'] = ohe_encoder

# --- Impute missing values
X_trainP, imp = handle_missing_vals_simple(X_trainP)
objs['miss'] = imp
print('Shape of xtrain: ', X_trainP.shape)

# --- Robust scaling of every column whose name does not contain 'Group'
is_group_col = X_trainP.columns.str.contains('Group')
std_scale_cols = X_trainP.columns[~is_group_col]

X_trainP, std_scaler = apply_robust_scaler(X_trainP, std_scale_cols)
objs['robust_scaler'] = std_scaler

# --- LOF: features and target are glued together so that the rows removed
# by the outlier step disappear from both at once.
X_trainP_df = pd.concat(
    [
        X_trainP.reset_index(drop=True),
        pd.Series(y_train, name='Class').reset_index(drop=True),
    ],
    axis=1,
)

# Only the first element of the helper's return value is used.
lof_result = detect_outliers_with_lof(data=X_trainP_df)
X_trainP_df = lof_result[0]

# Split the filtered frame back into target and features.
y_trainP = X_trainP_df['Class']
X_trainP = X_trainP_df.drop(columns='Class')
print('Shape of xtrain: ', X_trainP.shape)

# --- Random oversampling (the helper called here is apply_random_oversampling, not SMOTE)
X_trainP, y_trainP = apply_random_oversampling(X_trainP, y_trainP)

Shape of xtrain:  (8000, 69)
Shape of xtrain:  (7392, 69)


# Prepare the test set for real predictions:

In [9]:
# Set the categorical 'Group' column aside and keep only the remaining
# columns in X_test_compete; both get a fresh 0..n-1 index.
# NOTE: this cell overwrites X_test_compete, so running it a second time
# fails because 'Group' is no longer present.
X_test_compete_group = X_test_compete['Group'].copy().reset_index(drop=True)
X_test_compete = X_test_compete.drop(columns='Group').reset_index(drop=True)

In [10]:
# Replay the imputer and the robust scaler that were fitted on the training data.
# NOTE(review): on the training side the imputer ran after one-hot encoding,
# here it runs before — confirm the fitted imputer expects these columns.
feature_names = X_test_compete.columns

imputed_values = objs['miss'].transform(X_test_compete)
X_test_competeP = pd.DataFrame(imputed_values, columns=feature_names)

scaled_values = objs['robust_scaler'].transform(X_test_competeP)
X_test_competeP = pd.DataFrame(scaled_values, columns=feature_names)

In [11]:
# Put the 'Group' column that was set aside earlier back next to the
# imputed and scaled features (column-wise concat, aligned on the row index).
X_test_competeP = pd.concat([X_test_competeP, X_test_compete_group], axis=1)

In [12]:
# One-hot encode 'Group' with the encoder that was fitted on the training data.
group_dummies = objs['ohe'].transform(X_test_competeP[['Group']]).toarray()
X_test_competeP_ohe_cols = pd.DataFrame(
    group_dummies,
    columns=objs['ohe'].get_feature_names_out(['Group'])
).reset_index(drop=True)

# Swap the raw 'Group' column for its one-hot columns.
X_test_competeP = X_test_competeP.drop('Group', axis=1).reset_index(drop=True)
X_test_competeP = pd.concat([X_test_competeP, X_test_competeP_ohe_cols], axis=1)

# The training matrix gets its column order from the preprocessing helpers,
# the test matrix from the concat above. When both hold exactly the same
# (unique) column names, reorder the test columns to the training order so
# the model sees every feature in the position it was fitted on. In any
# other situation the frame is left untouched.
train_cols = getattr(X_trainP, 'columns', None)
if (
    train_cols is not None
    and train_cols.is_unique
    and X_test_competeP.columns.is_unique
    and set(train_cols) == set(X_test_competeP.columns)
):
    X_test_competeP = X_test_competeP.loc[:, list(train_cols)]

## Predictions Voting

In [13]:
# Fit the voting ensemble on the preprocessed training data and predict the
# class of every row of the preprocessed test set (fit returns the fitted
# estimator, so the two calls can be chained).
predictions = vote_model.fit(X_trainP, y_trainP).predict(X_test_competeP)

In [15]:
# Write one prediction per line, without header or index column.
# NOTE(review): the file name says "comb8" while the section heading of the
# preprocessing cell says "Combination 7" — confirm which label is right.
file_path = "preds_comb8_wo_diff_voting.txt"
predictions_frame = pd.DataFrame(predictions)
predictions_frame.to_csv(file_path, header=False, index=False)