In [1]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from catboost import CatBoostClassifier, CatBoostRegressor

base_dir = "./inputs"
files = os.listdir(base_dir)
files

['sample_submission.csv', 'test.csv', 'train.csv']

In [21]:
train_df = pd.read_csv(os.path.join(base_dir, files[2])).set_index("id")
target = train_df["Personality"]
train_df.drop("Personality", inplace=True, axis=1)
train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,No,6.0,4.0,No,15.0,5.0
1,1.0,No,7.0,3.0,No,10.0,8.0
2,6.0,Yes,1.0,0.0,,3.0,0.0
3,3.0,No,7.0,3.0,No,11.0,5.0
4,1.0,No,4.0,4.0,No,13.0,
...,...,...,...,...,...,...,...
18519,3.0,No,7.0,3.0,No,9.0,7.0
18520,1.0,,6.0,7.0,No,6.0,5.0
18521,7.0,Yes,1.0,1.0,Yes,1.0,
18522,,Yes,1.0,0.0,Yes,5.0,2.0


In [3]:
test_df = pd.read_csv(os.path.join(base_dir, files[1])).set_index("id")
test_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
18524,3.0,No,7.0,4.0,No,6.0,
18525,,Yes,0.0,0.0,Yes,5.0,1.0
18526,3.0,No,5.0,6.0,No,15.0,9.0
18527,3.0,No,4.0,4.0,No,5.0,6.0
18528,9.0,Yes,1.0,2.0,Yes,1.0,1.0
...,...,...,...,...,...,...,...
24694,3.0,No,5.0,5.0,No,9.0,6.0
24695,8.0,Yes,2.0,1.0,Yes,0.0,0.0
24696,2.0,No,4.0,3.0,No,9.0,7.0
24697,3.0,No,4.0,4.0,No,11.0,9.0


In [24]:
# Encode categorical columns

categories = train_df.select_dtypes(include=["object"]).columns

for column in categories:
    train_df[column] = LabelEncoder().fit_transform(train_df[column])
    test_df[column] = LabelEncoder().fit_transform(test_df[column])


In [25]:
# Scaling non_categorical columns

non_categories = train_df.select_dtypes(include=["number"]).columns
scaler = StandardScaler()

train_df[non_categories] = scaler.fit_transform(train_df[non_categories])
test_df[non_categories] = scaler.fit_transform(test_df[non_categories])

In [26]:
X_train = train_df
y_train = target
X_test = test_df

In [28]:
model = CatBoostClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
predictions

Learning rate set to 0.027759
0:	learn: 0.6480454	total: 5.69ms	remaining: 5.68s
1:	learn: 0.6077566	total: 10.5ms	remaining: 5.25s
2:	learn: 0.5706637	total: 16.2ms	remaining: 5.38s
3:	learn: 0.5365369	total: 22.4ms	remaining: 5.59s
4:	learn: 0.5049338	total: 27.9ms	remaining: 5.56s
5:	learn: 0.4760129	total: 33.4ms	remaining: 5.53s
6:	learn: 0.4487297	total: 39.1ms	remaining: 5.54s
7:	learn: 0.4243022	total: 44.7ms	remaining: 5.54s
8:	learn: 0.4024303	total: 50.7ms	remaining: 5.58s
9:	learn: 0.3823277	total: 56.2ms	remaining: 5.57s
10:	learn: 0.3640281	total: 61.9ms	remaining: 5.57s
11:	learn: 0.3465507	total: 67.4ms	remaining: 5.55s
12:	learn: 0.3301288	total: 73.1ms	remaining: 5.55s
13:	learn: 0.3160629	total: 79.6ms	remaining: 5.6s
14:	learn: 0.3027394	total: 86.8ms	remaining: 5.7s
15:	learn: 0.2900656	total: 93.3ms	remaining: 5.74s
16:	learn: 0.2793013	total: 97.8ms	remaining: 5.65s
17:	learn: 0.2694050	total: 104ms	remaining: 5.67s
18:	learn: 0.2601905	total: 109ms	remaining: 5.

array(['Extrovert', 'Extrovert', 'Extrovert', ..., 'Extrovert',
       'Extrovert', 'Extrovert'], dtype=object)

In [29]:
submission_test = pd.read_csv("./inputs/test.csv")
output = pd.DataFrame({'id': submission_test.id, 'Personality': predictions})
output.to_csv('./outputs/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [19]:
def predictive_column_imputer(df):
    df = df.copy()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    label_encoders = {col: LabelEncoder() for col in cat_cols}

    # Encode categorical columns
    for col in cat_cols:
        non_null_mask = df[col].notnull()
        df.loc[non_null_mask, col] = label_encoders[col].fit_transform(df.loc[non_null_mask, col])
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.apply(pd.to_numeric, errors='coerce')

    for column in df.columns:
        if df[column].isnull().sum() > 0:
            print(f"Imputing column: {column}")

            # Split data into training and test
            not_null_mask = df[column].notnull()
            null_mask = df[column].isnull()

            y_train = df.loc[not_null_mask, column]
            X_train = df.loc[not_null_mask].drop(columns=[column])
            X_test = df.loc[null_mask].drop(columns=[column])

            # Impute only if there's enough data
            if len(X_train) < 10 or len(X_test) == 0:
                print(f"Skipping {column} due to insufficient data")
                continue

            # Scale numerical features
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index)
            X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index)

            # Decide model type
            if y_train.dtype.kind in 'iO':  # int or object -> classification
                model = CatBoostClassifier(verbose=0)
            else:
                model = CatBoostRegressor(verbose=0)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Fill missing values
            df.loc[null_mask, column] = y_pred

    # Optionally, reverse label encoding
    for col in cat_cols:
        if df[col].isnull().sum() == 0:
            df[col] = df[col].astype(int)
            df[col] = label_encoders[col].inverse_transform(df[col])

    return df

In [13]:
train_df = predictive_column_imputer(train_df)
train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.000000,0,6.0,4.0,0,15.0,5.000000
1,1.000000,0,7.0,3.0,0,10.0,8.000000
2,6.000000,1,1.0,0.0,1,3.0,0.000000
3,3.000000,0,7.0,3.0,0,11.0,5.000000
4,1.000000,0,4.0,4.0,0,13.0,6.257277
...,...,...,...,...,...,...,...
18519,3.000000,0,7.0,3.0,0,9.0,7.000000
18520,1.000000,0,6.0,7.0,0,6.0,5.000000
18521,7.000000,1,1.0,1.0,1,1.0,1.450613
18522,6.996638,1,1.0,0.0,1,5.0,2.000000


In [14]:
test_df = predictive_column_imputer(test_df)
test_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
18524,3.000000,0,7.0,4.0,0,6.0,6.560516
18525,7.275027,1,0.0,0.0,1,5.0,1.000000
18526,3.000000,0,5.0,6.0,0,15.0,9.000000
18527,3.000000,0,4.0,4.0,0,5.0,6.000000
18528,9.000000,1,1.0,2.0,1,1.0,1.000000
...,...,...,...,...,...,...,...
24694,3.000000,0,5.0,5.0,0,9.0,6.000000
24695,8.000000,1,2.0,1.0,1,0.0,0.000000
24696,2.000000,0,4.0,3.0,0,9.0,7.000000
24697,3.000000,0,4.0,4.0,0,11.0,9.000000


In [22]:
X_train = train_df
X_test = test_df
y_train = target

In [23]:
categories = X_train.select_dtypes(include=object).columns
for column in categories:
    X_train[column] = LabelEncoder().fit_transform(X_train[column])
    X_test[column] = LabelEncoder().fit_transform(X_test[column])

In [24]:
model = CatBoostClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
predictions

Learning rate set to 0.035831
0:	learn: 0.6342461	total: 9.49ms	remaining: 9.48s
1:	learn: 0.5813399	total: 17.8ms	remaining: 8.87s
2:	learn: 0.5340324	total: 26.7ms	remaining: 8.88s
3:	learn: 0.4916178	total: 34ms	remaining: 8.46s
4:	learn: 0.4544903	total: 41.7ms	remaining: 8.29s
5:	learn: 0.4216802	total: 47.9ms	remaining: 7.94s
6:	learn: 0.3918230	total: 55.1ms	remaining: 7.82s
7:	learn: 0.3657292	total: 62.3ms	remaining: 7.72s
8:	learn: 0.3423349	total: 69.5ms	remaining: 7.65s
9:	learn: 0.3212174	total: 77.4ms	remaining: 7.66s
10:	learn: 0.3023335	total: 85.5ms	remaining: 7.69s
11:	learn: 0.2856470	total: 93.2ms	remaining: 7.68s
12:	learn: 0.2704541	total: 101ms	remaining: 7.65s
13:	learn: 0.2569669	total: 109ms	remaining: 7.65s
14:	learn: 0.2450044	total: 116ms	remaining: 7.59s
15:	learn: 0.2338818	total: 123ms	remaining: 7.59s
16:	learn: 0.2238519	total: 131ms	remaining: 7.55s
17:	learn: 0.2152470	total: 137ms	remaining: 7.5s
18:	learn: 0.2071921	total: 144ms	remaining: 7.45s
19

array(['Extrovert', 'Introvert', 'Extrovert', ..., 'Extrovert',
       'Extrovert', 'Introvert'], dtype=object)

In [25]:
submission_test = pd.read_csv("./inputs/test.csv")
output = pd.DataFrame({'id': submission_test.id, 'Personality': predictions})
output.to_csv('./outputs/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
