In [11]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier, CatBoostRegressor

base_dir = "./inputs"
files = os.listdir(base_dir)
files

['sample_submission.csv', 'test.csv', 'train.csv']

In [2]:
train_df = pd.read_csv(os.path.join(base_dir, files[2])).set_index("id")
train_df = train_df.dropna()
target = train_df["Personality"]
train_df.drop("Personality", inplace=True, axis=1)
train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,No,6.0,4.0,No,15.0,5.0
1,1.0,No,7.0,3.0,No,10.0,8.0
3,3.0,No,7.0,3.0,No,11.0,5.0
7,2.0,No,8.0,3.0,No,4.0,5.0
9,1.0,No,8.0,6.0,No,14.0,9.0
...,...,...,...,...,...,...,...
18509,1.0,No,3.0,4.0,No,15.0,4.0
18511,0.0,No,4.0,5.0,No,11.0,4.0
18514,6.0,No,5.0,3.0,No,10.0,4.0
18519,3.0,No,7.0,3.0,No,9.0,7.0


In [3]:
# Encoding categorical columns
categories = train_df.select_dtypes(include=object).columns

for column in categories:
    train_df[column] = LabelEncoder().fit_transform(train_df[column])

target = LabelEncoder().fit_transform(target)
train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,0,6.0,4.0,0,15.0,5.0
1,1.0,0,7.0,3.0,0,10.0,8.0
3,3.0,0,7.0,3.0,0,11.0,5.0
7,2.0,0,8.0,3.0,0,4.0,5.0
9,1.0,0,8.0,6.0,0,14.0,9.0
...,...,...,...,...,...,...,...
18509,1.0,0,3.0,4.0,0,15.0,4.0
18511,0.0,0,4.0,5.0,0,11.0,4.0
18514,6.0,0,5.0,3.0,0,10.0,4.0
18519,3.0,0,7.0,3.0,0,9.0,7.0


In [4]:
# Scale the non-categorical columns
non_categories = train_df.select_dtypes(include=["number"]).columns

train_df[non_categories] = StandardScaler().fit_transform(train_df[non_categories])

train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-1.001004,-0.447661,0.122633,-0.236003,-0.447819,1.595076,-0.184829
1,-0.620847,-0.447661,0.507691,-0.766592,-0.447819,0.367511,0.922299
3,0.139467,-0.447661,0.507691,-0.766592,-0.447819,0.613024,-0.184829
7,-0.240690,-0.447661,0.892749,-0.766592,-0.447819,-1.105568,-0.184829
9,-0.620847,-0.447661,0.892749,0.825176,-0.447819,1.349563,1.291341
...,...,...,...,...,...,...,...
18509,-0.620847,-0.447661,-1.032539,-0.236003,-0.447819,1.595076,-0.553872
18511,-1.001004,-0.447661,-0.647482,0.294587,-0.447819,0.613024,-0.553872
18514,1.279938,-0.447661,-0.262424,-0.766592,-0.447819,0.367511,-0.553872
18519,0.139467,-0.447661,0.507691,-0.766592,-0.447819,0.121998,0.553256


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10189 entries, 0 to 18523
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           10189 non-null  float64
 1   Stage_fear                 10189 non-null  float64
 2   Social_event_attendance    10189 non-null  float64
 3   Going_outside              10189 non-null  float64
 4   Drained_after_socializing  10189 non-null  float64
 5   Friends_circle_size        10189 non-null  float64
 6   Post_frequency             10189 non-null  float64
dtypes: float64(7)
memory usage: 636.8 KB


In [6]:
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)

In [7]:
model = CatBoostClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(accuracy)
print(report)
print(matrix)

Learning rate set to 0.025236
0:	learn: 0.6513116	total: 144ms	remaining: 2m 23s
1:	learn: 0.6138189	total: 148ms	remaining: 1m 13s
2:	learn: 0.5791170	total: 153ms	remaining: 50.9s
3:	learn: 0.5470065	total: 165ms	remaining: 41s
4:	learn: 0.5168885	total: 172ms	remaining: 34.3s
5:	learn: 0.4892155	total: 209ms	remaining: 34.6s
6:	learn: 0.4628688	total: 212ms	remaining: 30.1s
7:	learn: 0.4390017	total: 218ms	remaining: 27s
8:	learn: 0.4175089	total: 221ms	remaining: 24.4s
9:	learn: 0.3976507	total: 225ms	remaining: 22.3s
10:	learn: 0.3793038	total: 231ms	remaining: 20.8s
11:	learn: 0.3617003	total: 288ms	remaining: 23.7s
12:	learn: 0.3449522	total: 293ms	remaining: 22.2s
13:	learn: 0.3305912	total: 296ms	remaining: 20.8s
14:	learn: 0.3172970	total: 299ms	remaining: 19.6s
15:	learn: 0.3050287	total: 302ms	remaining: 18.6s
16:	learn: 0.2935834	total: 305ms	remaining: 17.6s
17:	learn: 0.2832216	total: 308ms	remaining: 16.8s
18:	learn: 0.2731868	total: 311ms	remaining: 16.1s
19:	learn: 0.

In [8]:
train_df = pd.read_csv(os.path.join(base_dir, files[2])).set_index("id")

In [9]:
def predictive_column_imputer(df):
    df = df.copy()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    label_encoders = {col: LabelEncoder() for col in cat_cols}

    # Encode categorical columns
    for col in cat_cols:
        non_null_mask = df[col].notnull()
        df.loc[non_null_mask, col] = label_encoders[col].fit_transform(df.loc[non_null_mask, col])
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.apply(pd.to_numeric, errors='coerce')

    for column in df.columns:
        if df[column].isnull().sum() > 0:
            print(f"Imputing column: {column}")

            # Split data into training and test
            not_null_mask = df[column].notnull()
            null_mask = df[column].isnull()

            y_train = df.loc[not_null_mask, column]
            X_train = df.loc[not_null_mask].drop(columns=[column])
            X_test = df.loc[null_mask].drop(columns=[column])

            # Impute only if there's enough data
            if len(X_train) < 10 or len(X_test) == 0:
                print(f"Skipping {column} due to insufficient data")
                continue

            # Scale numerical features
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index)
            X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index)

            # Decide model type
            if y_train.dtype.kind in 'iO':  # int or object -> classification
                model = CatBoostClassifier(verbose=0)
            else:
                model = CatBoostRegressor(verbose=0)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Fill missing values
            df.loc[null_mask, column] = y_pred

    # Optionally, reverse label encoding
    for col in cat_cols:
        if df[col].isnull().sum() == 0:
            df[col] = df[col].astype(int)
            df[col] = label_encoders[col].inverse_transform(df[col])

    return df

In [12]:
train_df = predictive_column_imputer(train_df)
train_df

Imputing column: Time_spent_Alone
Imputing column: Stage_fear
Imputing column: Social_event_attendance
Imputing column: Going_outside
Imputing column: Drained_after_socializing
Imputing column: Friends_circle_size
Imputing column: Post_frequency


Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.000000,No,6.0,4.0,No,15.0,5.000000,Extrovert
1,1.000000,No,7.0,3.0,No,10.0,8.000000,Extrovert
2,6.000000,Yes,1.0,0.0,Yes,3.0,0.000000,Introvert
3,3.000000,No,7.0,3.0,No,11.0,5.000000,Extrovert
4,1.000000,No,4.0,4.0,No,13.0,6.219460,Extrovert
...,...,...,...,...,...,...,...,...
18519,3.000000,No,7.0,3.0,No,9.0,7.000000,Extrovert
18520,1.000000,No,6.0,7.0,No,6.0,5.000000,Extrovert
18521,7.000000,Yes,1.0,1.0,Yes,1.0,1.418785,Introvert
18522,7.092889,Yes,1.0,0.0,Yes,5.0,2.000000,Introvert


In [13]:
# Encode categorical columns
categories = train_df.select_dtypes(include=object).columns
for column in categories:
    train_df[column] = LabelEncoder().fit_transform(train_df[column])

train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.000000,0,6.0,4.0,0,15.0,5.000000,0
1,1.000000,0,7.0,3.0,0,10.0,8.000000,0
2,6.000000,1,1.0,0.0,1,3.0,0.000000,1
3,3.000000,0,7.0,3.0,0,11.0,5.000000,0
4,1.000000,0,4.0,4.0,0,13.0,6.219460,0
...,...,...,...,...,...,...,...,...
18519,3.000000,0,7.0,3.0,0,9.0,7.000000,0
18520,1.000000,0,6.0,7.0,0,6.0,5.000000,0
18521,7.000000,1,1.0,1.0,1,1.0,1.418785,1
18522,7.092889,1,1.0,0.0,1,5.0,2.000000,1


In [14]:
# Defining the target column
target = train_df["Personality"]
train_df.drop("Personality", inplace=True, axis=1)

In [15]:
# Scale non-catgorical columns
train_df = StandardScaler().fit_transform(train_df)
train_df

array([[-1.0531435 , -0.5451595 ,  0.28301671, ..., -0.54965705,
         1.68504177,  0.02133347],
       [-0.71597092, -0.5451595 ,  0.64889818, ..., -0.54965705,
         0.48929094,  1.07312402],
       [ 0.96989197,  1.83432555, -1.54639061, ...,  1.81931624,
        -1.18476022, -1.73165078],
       ...,
       [ 1.30706455,  1.83432555, -1.54639061, ...,  1.81931624,
        -1.66306055, -1.23422931],
       [ 1.33838433,  1.83432555, -1.54639061, ...,  1.81931624,
        -0.70645989, -1.03045708],
       [-0.71597092, -0.5451595 ,  1.01477964, ..., -0.54965705,
        -0.94561006,  0.72252717]])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)

In [17]:
model = CatBoostClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(accuracy)
print(report)
print(matrix)

Learning rate set to 0.032574
0:	learn: 0.6405475	total: 11.1ms	remaining: 11.1s
1:	learn: 0.5924091	total: 23.1ms	remaining: 11.5s
2:	learn: 0.5493348	total: 34.1ms	remaining: 11.3s
3:	learn: 0.5094116	total: 45ms	remaining: 11.2s
4:	learn: 0.4730445	total: 56.9ms	remaining: 11.3s
5:	learn: 0.4406055	total: 69.1ms	remaining: 11.4s
6:	learn: 0.4117308	total: 80.4ms	remaining: 11.4s
7:	learn: 0.3851508	total: 91.4ms	remaining: 11.3s
8:	learn: 0.3622319	total: 100ms	remaining: 11s
9:	learn: 0.3416348	total: 109ms	remaining: 10.8s
10:	learn: 0.3215621	total: 120ms	remaining: 10.8s
11:	learn: 0.3040295	total: 131ms	remaining: 10.8s
12:	learn: 0.2886067	total: 150ms	remaining: 11.4s
13:	learn: 0.2746031	total: 179ms	remaining: 12.6s
14:	learn: 0.2617850	total: 205ms	remaining: 13.5s
15:	learn: 0.2495456	total: 225ms	remaining: 13.8s
16:	learn: 0.2394339	total: 247ms	remaining: 14.3s
17:	learn: 0.2297112	total: 275ms	remaining: 15s
18:	learn: 0.2209889	total: 303ms	remaining: 15.7s
19:	learn