In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from catboost import CatBoostClassifier, CatBoostRegressor
import os

base_dir = "./inputs"
files = os.listdir(base_dir)
files

['sample_submission.csv', 'test.csv', 'train.csv']

In [33]:
train_df = pd.read_csv(os.path.join(base_dir, files[2])).set_index('id')
train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,1.0,No,4.0,4.0,No,13.0,,Extrovert
...,...,...,...,...,...,...,...,...
18519,3.0,No,7.0,3.0,No,9.0,7.0,Extrovert
18520,1.0,,6.0,7.0,No,6.0,5.0,Extrovert
18521,7.0,Yes,1.0,1.0,Yes,1.0,,Introvert
18522,,Yes,1.0,0.0,Yes,5.0,2.0,Introvert


In [34]:
def predictive_column_imputer(df):
    df = df.copy()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    label_encoders = {col: LabelEncoder() for col in cat_cols}

    # Encode categorical columns
    for col in cat_cols:
        non_null_mask = df[col].notnull()
        df.loc[non_null_mask, col] = label_encoders[col].fit_transform(df.loc[non_null_mask, col])
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.apply(pd.to_numeric, errors='coerce')

    for column in df.columns:
        if df[column].isnull().sum() > 0:
            print(f"Imputing column: {column}")

            # Split data into training and test
            not_null_mask = df[column].notnull()
            null_mask = df[column].isnull()

            y_train = df.loc[not_null_mask, column]
            X_train = df.loc[not_null_mask].drop(columns=[column])
            X_test = df.loc[null_mask].drop(columns=[column])

            # Impute only if there's enough data
            if len(X_train) < 10 or len(X_test) == 0:
                print(f"Skipping {column} due to insufficient data")
                continue

            # Scale numerical features
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index)
            X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index)

            # Decide model type
            if y_train.dtype.kind in 'iO':  # int or object -> classification
                model = CatBoostClassifier(verbose=0)
            else:
                model = CatBoostRegressor(verbose=0)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Fill missing values
            df.loc[null_mask, column] = y_pred

    # Optionally, reverse label encoding
    for col in cat_cols:
        if df[col].isnull().sum() == 0:
            df[col] = df[col].astype(int)
            df[col] = label_encoders[col].inverse_transform(df[col])

    return df

In [35]:
train_df = predictive_column_imputer(train_df)

Imputing column: Time_spent_Alone
Imputing column: Stage_fear
Imputing column: Social_event_attendance
Imputing column: Going_outside
Imputing column: Drained_after_socializing
Imputing column: Friends_circle_size
Imputing column: Post_frequency


In [36]:
# Encode categorical columns
categories = train_df.select_dtypes(include=object).columns
for column in categories:
    train_df[column] = LabelEncoder().fit_transform(train_df[column])

train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.000000,0,6.0,4.0,0,15.0,5.000000,0
1,1.000000,0,7.0,3.0,0,10.0,8.000000,0
2,6.000000,1,1.0,0.0,1,3.0,0.000000,1
3,3.000000,0,7.0,3.0,0,11.0,5.000000,0
4,1.000000,0,4.0,4.0,0,13.0,6.219460,0
...,...,...,...,...,...,...,...,...
18519,3.000000,0,7.0,3.0,0,9.0,7.000000,0
18520,1.000000,0,6.0,7.0,0,6.0,5.000000,0
18521,7.000000,1,1.0,1.0,1,1.0,1.418785,1
18522,7.092889,1,1.0,0.0,1,5.0,2.000000,1


In [37]:
# Defining the target column
target = train_df["Personality"]
train_df.drop("Personality", inplace=True, axis=1)

In [38]:
# Scale non-catgorical columns
train_df = StandardScaler().fit_transform(train_df)
train_df

array([[-1.0531435 , -0.5451595 ,  0.28301671, ..., -0.54965705,
         1.68504177,  0.02133347],
       [-0.71597092, -0.5451595 ,  0.64889818, ..., -0.54965705,
         0.48929094,  1.07312402],
       [ 0.96989197,  1.83432555, -1.54639061, ...,  1.81931624,
        -1.18476022, -1.73165078],
       ...,
       [ 1.30706455,  1.83432555, -1.54639061, ...,  1.81931624,
        -1.66306055, -1.23422931],
       [ 1.33838433,  1.83432555, -1.54639061, ...,  1.81931624,
        -0.70645989, -1.03045708],
       [-0.71597092, -0.5451595 ,  1.01477964, ..., -0.54965705,
        -0.94561006,  0.72252717]])

In [39]:
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input

# Define the model
model = Sequential([
    Input(shape=(7,)),  # Input layer with 7 features
    Dense(10, activation='relu'),  # 10 hidden layers
    Dense(1, activation='sigmoid')  # Output layer with sigmoid activation for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model and save the history
history = model.fit(X_train, y_train, epochs=10, batch_size=10, validation_split=0.2)

Epoch 1/10
[1m1186/1186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.9332 - loss: 0.2797 - val_accuracy: 0.9733 - val_loss: 0.1232
Epoch 2/10
[1m1186/1186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9688 - loss: 0.1404 - val_accuracy: 0.9733 - val_loss: 0.1172
Epoch 3/10
[1m1186/1186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9655 - loss: 0.1431 - val_accuracy: 0.9737 - val_loss: 0.1163
Epoch 4/10
[1m1186/1186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9688 - loss: 0.1346 - val_accuracy: 0.9737 - val_loss: 0.1166
Epoch 5/10
[1m1186/1186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9690 - loss: 0.1322 - val_accuracy: 0.9744 - val_loss: 0.1162
Epoch 6/10
[1m1186/1186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9679 - loss: 0.1351 - val_accuracy: 0.9744 - val_loss: 0.1154
Epoch 7/10
[1m1

In [44]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")
print(f"Test Loss: {loss}")

[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 880us/step - accuracy: 0.9690 - loss: 0.1391
Test Accuracy: 0.9676113128662109
Test Loss: 0.13830575346946716
