In [1]:
# Install necessary packages
!uv pip install -q --system scikit-learn==1.5.2  # Quiet install of scikit-learn v1.5.2
!pip install autogluon                           # Install AutoGluon for AutoML
!pip install -U ipywidgets                       # Upgrade ipywidgets for notebook interactivity

Collecting autogluon
  Downloading autogluon-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.3.1 (from autogluon.core[all]==1.3.1->autogluon)
  Downloading autogluon.core-1.3.1-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.3.1 (from autogluon)
  Downloading autogluon.features-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.3.1 (from autogluon.tabular[all]==1.3.1->autogluon)
  Downloading autogluon.tabular-1.3.1-py3-none-any.whl.metadata (14 kB)
Collecting autogluon.multimodal==1.3.1 (from autogluon)
  Downloading autogluon.multimodal-1.3.1-py3-none-any.whl.metadata (13 kB)
Collecting autogluon.timeseries==1.3.1 (from autogluon.timeseries[all]==1.3.1->autogluon)
  Downloading autogluon.timeseries-1.3.1-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.common==1.3.1 (from autogluon.core==1.3.1->autogluon.core[all]==1.3.1->autogluon)
  Downloading autogluon.common-1.3.1-py3-none-any.whl.metadata (11 kB)
C

In [2]:
import pandas as pd

# Load train and test datasets
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')

# Quick data overview
train.head()
test.head()

# Data structure and types
train.info()

# Unique values per column
train.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB


id                           18524
Time_spent_Alone                12
Stage_fear                       2
Social_event_attendance         11
Going_outside                    8
Drained_after_socializing        2
Friends_circle_size             16
Post_frequency                  11
Personality                      2
dtype: int64

In [3]:
# Drop the 'id' column as it's not useful for training
train = train.drop(['id'], axis=1)
test = test.drop(['id'], axis=1)

# Descriptive statistics
train.describe()

# Null value checks
train.isnull().sum()
round(train.isnull().sum() * 100 / len(train), 2)

# Preview rows with missing values
train[train.isna().any(axis=1)].head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
4,1.0,No,4.0,4.0,No,13.0,,Extrovert
5,2.0,No,8.0,5.0,No,,3.0,Extrovert
6,1.0,No,8.0,,No,,4.0,Extrovert
8,4.0,Yes,2.0,1.0,,0.0,2.0,Introvert


In [4]:
# Check for mismatched categories between train and test datasets
counter = 0
for i in test.select_dtypes(include=['object']).columns.tolist():
    if (len(list(set(train[i].unique().tolist()) ^ set(test[i].unique().tolist()))) != 0):
        print(i, 'need to be worked on')
        counter += 1
if counter == 0:
    print('No work needed')

No work needed


In [5]:
# Convert 'Yes'/'No' to 1/0 for 'Stage_fear'
train['Stage_fear'] = train['Stage_fear'].replace({'Yes': 1, 'No': 0})
test['Stage_fear'] = test['Stage_fear'].replace({'Yes': 1, 'No': 0})

# Convert 'Yes'/'No' to 1/0 for 'Drained_after_socializing'
train['Drained_after_socializing'] = train['Drained_after_socializing'].replace({'Yes': 1, 'No': 0})
test['Drained_after_socializing'] = test['Drained_after_socializing'].replace({'Yes': 1, 'No': 0})

  train['Stage_fear'] = train['Stage_fear'].replace({'Yes': 1, 'No': 0})
  test['Stage_fear'] = test['Stage_fear'].replace({'Yes': 1, 'No': 0})
  train['Drained_after_socializing'] = train['Drained_after_socializing'].replace({'Yes': 1, 'No': 0})
  test['Drained_after_socializing'] = test['Drained_after_socializing'].replace({'Yes': 1, 'No': 0})


In [6]:
# Check for duplicates
train.duplicated().value_counts()

# Examine target label distribution
round(train['Personality'].value_counts() * 100 / len(train), 2)
train['Personality'].value_counts()

Personality
Extrovert    13699
Introvert     4825
Name: count, dtype: int64

In [7]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Define label
label = 'Personality'

# Fit AutoGluon predictor
predictor = TabularPredictor(label=label,
                             eval_metric='accuracy',
                             problem_type="binary"
                            ).fit(train,
                                  presets='medium_quality',
                                  time_limit=3600*9,  # Max 9 hours
                                  verbosity=3,
                                  ag_args_fit={'num_gpus': 1}
                                 )

# Summarize training results
results = predictor.fit_summary()

No path specified. Models will be saved in: "AutogluonModels/ag-20250703_085030"
Verbosity: 3 (Detailed Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Nov 10 10:07:59 UTC 2024
CPU Count:          4
GPU Count:          2
Memory Avail:       30.11 GB / 31.35 GB (96.0%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Presets specified: ['medium_quality']
User Specified kwargs:
{'ag_args_fit': {'num_gpus': 1}, 'auto_stack': False, 'verbosity': 3}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': {'num_gpus': 1},
 'auto_stack': False,
 'calibrate': 'auto',
 'delay_bag_sets': False,
 'ds_args': {'clean_up_fits': True,
             'detection_time_frac': 0.25,
             'enable_callbacks': False,
             'enable_ray_logging': True,
             'holdout_data': None,
             'holdou

[50]	valid_set's binary_error: 0.0302213
[100]	valid_set's binary_error: 0.0307609
[150]	valid_set's binary_error: 0.0307609
[200]	valid_set's binary_error: 0.0307609


Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/LightGBMXT/model.pkl
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/LightGBMXT/y_pred_proba_val.pkl
	0.9703	 = Validation score   (accuracy)
	9.05s	 = Training   runtime
	0.01s	 = Validation runtime
	319771.5	 = Inference  throughput (rows/s | 1853 batch size)
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/trainer.pkl
Fitting model: LightGBM ... Training model for up to 32390.00s of the 32389.99s of remaining time.
	Fitting LightGBM with 'num_gpus': 1, 'num_cpus': 2
	Training LightGBM with GPU, note that this may negatively impact model quality compared to CPU training.
	Fitting 10000 rounds... Hyperparameters: {'learning_rate': 0.05, 'device': 'gpu'}


[50]	valid_set's binary_error: 0.0318403
[100]	valid_set's binary_error: 0.0307609
[150]	valid_set's binary_error: 0.0307609
[200]	valid_set's binary_error: 0.0307609


Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/LightGBM/model.pkl
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/LightGBM/y_pred_proba_val.pkl
	0.9692	 = Validation score   (accuracy)
	1.19s	 = Training   runtime
	0.01s	 = Validation runtime
	237017.6	 = Inference  throughput (rows/s | 1853 batch size)
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/trainer.pkl
Fitting model: RandomForestGini ... Training model for up to 32388.79s of the 32388.78s of remaining time.
	Fitting RandomForestGini with 'num_gpus': 1, 'num_cpus': 4


[250]	valid_set's binary_error: 0.0307609


Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/RandomForestGini/model.pkl
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/RandomForestGini/y_pred_proba_val.pkl
	0.9606	 = Validation score   (accuracy)
	1.39s	 = Training   runtime
	0.09s	 = Validation runtime
	21270.8	 = Inference  throughput (rows/s | 1853 batch size)
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/trainer.pkl
Fitting model: RandomForestEntr ... Training model for up to 32387.21s of the 32387.20s of remaining time.
	Fitting RandomForestEntr with 'num_gpus': 1, 'num_cpus': 4
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/RandomForestEntr/model.pkl
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/RandomForestEntr/y_pred_proba_val.pkl
	0.9601	 = Validation score   (accuracy)
	1.4s	 = Training   runtime
	0.09s	 = Validation runtime
	21592.7	 = Inference  throughput (rows/s | 1853 batch size)
Saving /kaggle/working/AutogluonModels/

0:	learn: 0.9689881	test: 0.9692391	best: 0.9692391 (0)	total: 180ms	remaining: 5.21s
20:	learn: 0.9688681	test: 0.9686994	best: 0.9692391 (0)	total: 307ms	remaining: 132ms
29:	learn: 0.9689881	test: 0.9686994	best: 0.9692391 (0)	total: 364ms	remaining: 0us
bestTest = 0.9692390718
bestIteration = 0
Shrink model to first 1 iterations.
0:	learn: 0.9687481	test: 0.9670804	best: 0.9670804 (0)	total: 34.3ms	remaining: 5m 43s
20:	learn: 0.9688081	test: 0.9686994	best: 0.9686994 (1)	total: 567ms	remaining: 4m 29s
40:	learn: 0.9691080	test: 0.9697787	best: 0.9697787 (31)	total: 1.07s	remaining: 4m 20s
60:	learn: 0.9694080	test: 0.9697787	best: 0.9697787 (31)	total: 1.57s	remaining: 4m 16s


Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/CatBoost/model.pkl
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/CatBoost/y_pred_proba_val.pkl
	0.9698	 = Validation score   (accuracy)
	4.21s	 = Training   runtime
	0.0s	 = Validation runtime
	545751.4	 = Inference  throughput (rows/s | 1853 batch size)
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/trainer.pkl
Fitting model: ExtraTreesGini ... Training model for up to 32381.41s of the 32381.40s of remaining time.
	Fitting ExtraTreesGini with 'num_gpus': 1, 'num_cpus': 4


80:	learn: 0.9695279	test: 0.9692391	best: 0.9697787 (31)	total: 2.07s	remaining: 4m 13s
bestTest = 0.9697787372
bestIteration = 31
Shrink model to first 32 iterations.


Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/ExtraTreesGini/model.pkl
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/ExtraTreesGini/y_pred_proba_val.pkl
	0.9611	 = Validation score   (accuracy)
	1.56s	 = Training   runtime
	0.09s	 = Validation runtime
	21732.1	 = Inference  throughput (rows/s | 1853 batch size)
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/trainer.pkl
Fitting model: ExtraTreesEntr ... Training model for up to 32379.63s of the 32379.61s of remaining time.
	Fitting ExtraTreesEntr with 'num_gpus': 1, 'num_cpus': 4
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/ExtraTreesEntr/model.pkl
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/ExtraTreesEntr/y_pred_proba_val.pkl
	0.9611	 = Validation score   (accuracy)
	1.18s	 = Training   runtime
	0.09s	 = Validation runtime
	21649.9	 = Inference  throughput (rows/s | 1853 batch size)
Saving /kaggle/working/AutogluonModels/ag-20250703

[0]	validation_0-error:0.26066
[50]	validation_0-error:0.03346
[100]	validation_0-error:0.03346
[150]	validation_0-error:0.03292
[187]	validation_0-error:0.03292


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/XGBoost/model.pkl
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/XGBoost/y_pred_proba_val.pkl
	0.9687	 = Validation score   (accuracy)
	0.76s	 = Training   runtime
	0.01s	 = Validation runtime
	344551.4	 = Inference  throughput (rows/s | 1853 batch size)
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/trainer.pkl
Fitting model: NeuralNetTorch ... Training model for up to 32363.25s of the 32363.24s of remaining time.
	Fitting NeuralNetTorch with 'num_gpus': 1, 'num_cpus': 2
Tabular Neural Network treats features as the following types:
{
    "continuous": [
        "Social_event_attendance",
        "Going_outside",
        "Friends_circle_size",
        "Post_frequency"
    ],
    "skewed": [
        "Time_spent_Alone",
        "St

[50]	valid_set's binary_error: 0.0323799
[100]	valid_set's binary_error: 0.0323799
[150]	valid_set's binary_error: 0.0318403
[200]	valid_set's binary_error: 0.0318403
[250]	valid_set's binary_error: 0.0323799
[300]	valid_set's binary_error: 0.0323799
[350]	valid_set's binary_error: 0.0339989


Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/LightGBMLarge/model.pkl
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/LightGBMLarge/y_pred_proba_val.pkl
	0.9687	 = Validation score   (accuracy)
	4.15s	 = Training   runtime
	0.02s	 = Validation runtime
	114967.7	 = Inference  throughput (rows/s | 1853 batch size)
Saving /kaggle/working/AutogluonModels/ag-20250703_085030/models/trainer.pkl
Loading: /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/RandomForestEntr/y_pred_proba_val.pkl
Loading: /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/ExtraTreesGini/y_pred_proba_val.pkl
Loading: /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/NeuralNetTorch/y_pred_proba_val.pkl
Loading: /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/NeuralNetFastAI/y_pred_proba_val.pkl
Loading: /kaggle/working/AutogluonModels/ag-20250703_085030/utils/attr/LightGBMLarge/y_pred_proba_val.pkl
Loading: /kaggle/working/Autoglu

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val eval_metric  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0            LightGBMXT   0.970318    accuracy       0.005795   9.050874                0.005795           9.050874            1       True          3
1   WeightedEnsemble_L2   0.970318    accuracy       0.007387   9.214869                0.001593           0.163995            2       True         14
2              CatBoost   0.969779    accuracy       0.003395   4.209372                0.003395           4.209372            1       True          7
3       NeuralNetFastAI   0.969779    accuracy       0.044628  14.119918                0.044628          14.119918            1       True         10
4              LightGBM   0.969239    accuracy       0.007818   1.189918                0.007818           1.189918            1       True          4
5        NeuralNetTorch   0.9692

In [8]:
# Show leaderboard of model performances
predictor.leaderboard()

# Predict on test dataset
df = predictor.predict(test).to_frame(name=label)
df.head()

Loading: /kaggle/working/AutogluonModels/ag-20250703_085030/models/LightGBMXT/model.pkl
Loading: /kaggle/working/AutogluonModels/ag-20250703_085030/models/WeightedEnsemble_L2/model.pkl


Unnamed: 0,Personality
0,Extrovert
1,Introvert
2,Extrovert
3,Extrovert
4,Introvert


In [9]:
# Load sample submission file
sol = pd.read_csv('/kaggle/input/playground-series-s5e7/sample_submission.csv')

# Assign predictions to submission DataFrame
sol[label] = df[label]

# Save to CSV for submission
sol.to_csv('./Autogluon_medium_quality_gpu.csv', index=False)