In [None]:
import pandas as pd
import numpy as np
import duckdb as duck
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics impor f1_score

In [2]:
SEED:int = 10
SHUFFLE = True

In [3]:
def dtype_down_allocating(df: pd.DataFrame) -> pd.DataFrame:
    ini_mb_usage = df.memory_usage().sum() / 1024**2

    f16min, f16max = np.finfo(np.float16).min, np.finfo(np.float16).max
    f32min, f32max = np.finfo(np.float32).min, np.finfo(np.float32).max

    i16min, i16max = np.iinfo(np.int16).min, np.iinfo(np.int16).max
    i32min, i32max = np.iinfo(np.int32).min, np.iinfo(np.int32).max

    for col in df.columns:
        col_dtype = df[col].dtype

        if pd.api.types.is_numeric_dtype(col_dtype):
            col_max = df[col].max()
            col_min = df[col].min()

            if pd.api.types.is_float_dtype(col_dtype):
                if col_min >= f16min and col_max <= f16max:
                    df[col] = df[col].astype(np.float16)
                elif col_min >= f32min and col_max <= f32max:
                    df[col] = df[col].astype(np.float32)

            elif pd.api.types.is_integer_dtype(col_dtype):
                if col_min >= i16min and col_max <= i16max:
                    df[col] = df[col].astype(np.int16)
                elif col_min >= i32min and col_max <= i32max:
                    df[col] = df[col].astype(np.int32)
                
    end_mb_usage = (df.memory_usage().sum() / 1024**2) *100
    
    print(f'{np.round((end_mb_usage-ini_mb_usage)/ini_mb_usage, 2)}%')
        
    return df

In [4]:
train_df = dtype_down_allocating(pd.read_csv(r"../../data/CMI_Sensor_Data/train.csv"))
test_df = dtype_down_allocating(pd.read_csv(r"../../data/CMI_Sensor_Data/test.csv"))
train_demo = pd.read_csv(r"../../data/CMI_Sensor_Data/train_demographics.csv")
test_demo = pd.read_csv(r"../../data/CMI_Sensor_Data/test_demographics.csv")

25.76%
24.7%


In [5]:
excluded_cols = ['gesture', 'sequence_type', 'behavior', 'orientation','phase'  # Only in Train
    ,'sequence_id', 'sequence_counter',  # Ids
    'row_id', 'subject', 'phase']  # metadata]

X = train_df.merge(right=train_demo,how='left',on=['subject']).drop(columns=excluded_cols)
y = train_df['gesture']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, shuffle=SHUFFLE)

le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_test= le.transform(y_test)

In [7]:
train_df.select_dtypes(include='object').columns

Index(['row_id', 'sequence_type', 'sequence_id', 'subject', 'orientation',
       'behavior', 'phase', 'gesture'],
      dtype='object')

In [10]:
model = xgb.XGBClassifier(
        max_depth= 7,
        learning_rate= 0.1,
        n_estimators= 70,
        subsample= 0.37467576245415724,
        colsample_bytree= 0.35716664430148
        ,eval_metric='mlogloss'
        ,verbosity=1
        ,device="cuda"
        ,early_stopping_rounds=3)

model.fit(X_train ,y_train ,eval_set=[(X_test,y_test)] ,verbose=True)

[0]	validation_0-mlogloss:2.72065
[1]	validation_0-mlogloss:2.59175
[2]	validation_0-mlogloss:2.48828
[3]	validation_0-mlogloss:2.40250
[4]	validation_0-mlogloss:2.32589
[5]	validation_0-mlogloss:2.25594
[6]	validation_0-mlogloss:2.19446
[7]	validation_0-mlogloss:2.13943
[8]	validation_0-mlogloss:2.08876
[9]	validation_0-mlogloss:2.04263
[10]	validation_0-mlogloss:2.00035
[11]	validation_0-mlogloss:1.96111
[12]	validation_0-mlogloss:1.92302
[13]	validation_0-mlogloss:1.88851
[14]	validation_0-mlogloss:1.85633
[15]	validation_0-mlogloss:1.82587
[16]	validation_0-mlogloss:1.79800
[17]	validation_0-mlogloss:1.77188
[18]	validation_0-mlogloss:1.74759
[19]	validation_0-mlogloss:1.72413
[20]	validation_0-mlogloss:1.70236
[21]	validation_0-mlogloss:1.68141
[22]	validation_0-mlogloss:1.66077
[23]	validation_0-mlogloss:1.64097
[24]	validation_0-mlogloss:1.62330
[25]	validation_0-mlogloss:1.60584
[26]	validation_0-mlogloss:1.58866
[27]	validation_0-mlogloss:1.57267
[28]	validation_0-mlogloss:1.5

In [None]:
train_df.drop(columns=excluded_cols).select_dtypes(include='object').columns

Index([], dtype='object')