In [86]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import xgboost
import numpy as np
import scipy.stats as stats
import math
from sklearn.compose import ColumnTransformer

### Functions

In [78]:
def clean_up(df):
    df = df.drop(columns=['Id'])
    df.columns = [x.strip() for x in df.columns]
    nulls = pd.DataFrame(df.isnull().sum(), columns=['nulls']).reset_index().rename(columns={'index': 'column'})
    nulls = nulls[nulls['nulls'] > 0]
    null_columns = nulls['column'].tolist()

    for x in null_columns:
        df.loc[df[x].isnull(), x] = df[x].mean()

    return df


def split_train(df):
    x = df['Class']
    y = df.drop(columns=['Class'])
    return x, y


def data_transform(trimmed_df):
    log_cols = [
        'DI',
        'EE'
    ]

    sqrt_cols = [
        'AF'
    ]

    boxcox_cols = [
        'AB',
        'BQ',
        'DE',
        'EB',
        'FE',
        'GB'
    ]

    yeo_cols = [
        'AM',
        'GF',
        'CF'
    ]

    trimmed_df[log_cols] = np.log1p(trimmed_df[log_cols])
    trimmed_df[sqrt_cols] = np.sqrt(trimmed_df[sqrt_cols])

    for col in boxcox_cols:
        if trimmed_df[col].value_counts().iloc[0] != len(trimmed_df[col]):
            trimmed_df[col], lmbda = stats.boxcox(trimmed_df[col])

    for col in yeo_cols:
        if trimmed_df[col].value_counts().iloc[0] != len(trimmed_df[col]):
            trimmed_df[col], lmbda = stats.yeojohnson(trimmed_df[col])

    return trimmed_df


def get_used_cols(clean_df):
    if 'Class' in clean_df.columns:
        used_cols = [
            'AF',
            'AB',
            'BQ',
            'DI',
            'FL',
            'AM',
            'CR',
            'FE',
            'DH',
            'DA',
            'BN',
            'CD',
            'BP',
            'DL',
            'EE',
            'GF',
            'DE',
            'BD',
            'CF',
            'AX',
            'FI',
            'EB',
            'GB',
            'CU',
            'EJ',
            'Class']
    else:
        used_cols = [
            'AF',
            'AB',
            'BQ',
            'DI',
            'FL',
            'AM',
            'CR',
            'FE',
            'DH',
            'DA',
            'BN',
            'CD',
            'BP',
            'DL',
            'EE',
            'GF',
            'DE',
            'BD',
            'CF',
            'AX',
            'FI',
            'EB',
            'GB',
            'CU',
            'EJ']

    trimmed_df = clean_df[used_cols]

    return trimmed_df


def prep_train(train_df):
    train_df = clean_up(train_df)
    train_df = get_used_cols(train_df)

    for col in train_df.columns:
        if col not in ['Id', 'EJ', 'Class']:
            z_scores = stats.zscore(train_df[col])
            threshold = 3  # Adjust the threshold as per your requirement
            outliers = np.abs(z_scores) > threshold
            train_df = train_df[~outliers]

    train_df = data_transform(train_df)

    return train_df


def prep_test(test_df):
    test_df = clean_up(test_df)
    test_df = get_used_cols(test_df)
    test_df = data_transform(test_df)

    return test_df

### Ingestion

In [79]:
train = pd.read_csv('../Data/train.csv')
test = pd.read_csv('../Data/test.csv')

In [80]:
train = prep_train(train)
test = prep_test(test)

### Transformation

In [77]:
X = train[['Class']].to_array()
y = train.drop(columns=['Class']).to_array()
thing = OneHotEncoder(drop='first', handle_unknown='ignore').fit(ej)

NameError: name 'ej' is not defined

In [71]:
train[['EJ']]

Unnamed: 0,EJ
0,B
1,A
2,B
3,B
4,B
...,...
610,A
611,A
612,A
613,B


In [None]:
numeric_pipeline = Pipeline(
    steps=[('num_impute', SimpleImputer(strategy='mean')),
           ('scaler', StandardScaler())]
)

In [5]:
test

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,02fa521e1838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,040e15f562a2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,046e85c7cc7f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5
