In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from catboost import CatBoostClassifier, CatBoostRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
import os

base_dir = "./inputs"
files = os.listdir(base_dir)
files

['sample_submission.csv', 'test.csv', 'train.csv']

In [4]:
train_df = pd.read_csv(os.path.join(base_dir, files[2])).set_index("id")
test_df = pd.read_csv(os.path.join(base_dir, files[1])).set_index("id")

In [6]:
def predictive_column_imputer(df):
    df = df.copy()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    label_encoders = {col: LabelEncoder() for col in cat_cols}

    # Encode categorical columns
    for col in cat_cols:
        non_null_mask = df[col].notnull()
        df.loc[non_null_mask, col] = label_encoders[col].fit_transform(df.loc[non_null_mask, col])
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df = df.apply(pd.to_numeric, errors='coerce')

    for column in df.columns:
        if df[column].isnull().sum() > 0:
            print(f"Imputing column: {column}")

            # Split data into training and test
            not_null_mask = df[column].notnull()
            null_mask = df[column].isnull()

            y_train = df.loc[not_null_mask, column]
            X_train = df.loc[not_null_mask].drop(columns=[column])
            X_test = df.loc[null_mask].drop(columns=[column])

            # Impute only if there's enough data
            if len(X_train) < 10 or len(X_test) == 0:
                print(f"Skipping {column} due to insufficient data")
                continue

            # Scale numerical features
            scaler = StandardScaler()
            X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index)
            X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index)

            # Decide model type
            if y_train.dtype.kind in 'iO':  # int or object -> classification
                model = CatBoostClassifier(verbose=0)
            else:
                model = CatBoostRegressor(verbose=0)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Fill missing values
            df.loc[null_mask, column] = y_pred

    # Optionally, reverse label encoding
    for col in cat_cols:
        if df[col].isnull().sum() == 0:
            df[col] = df[col].astype(int)
            df[col] = label_encoders[col].inverse_transform(df[col])

    return df

In [7]:
train_df = predictive_column_imputer(train_df)
test_df = predictive_column_imputer(test_df)

Imputing column: Time_spent_Alone
Imputing column: Stage_fear
Imputing column: Social_event_attendance
Imputing column: Going_outside
Imputing column: Drained_after_socializing
Imputing column: Friends_circle_size
Imputing column: Post_frequency
Imputing column: Time_spent_Alone
Imputing column: Stage_fear
Imputing column: Social_event_attendance
Imputing column: Going_outside
Imputing column: Drained_after_socializing
Imputing column: Friends_circle_size
Imputing column: Post_frequency


In [8]:
train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.000000,No,6.0,4.0,No,15.0,5.000000,Extrovert
1,1.000000,No,7.0,3.0,No,10.0,8.000000,Extrovert
2,6.000000,Yes,1.0,0.0,Yes,3.0,0.000000,Introvert
3,3.000000,No,7.0,3.0,No,11.0,5.000000,Extrovert
4,1.000000,No,4.0,4.0,No,13.0,6.219460,Extrovert
...,...,...,...,...,...,...,...,...
18519,3.000000,No,7.0,3.0,No,9.0,7.000000,Extrovert
18520,1.000000,No,6.0,7.0,No,6.0,5.000000,Extrovert
18521,7.000000,Yes,1.0,1.0,Yes,1.0,1.418785,Introvert
18522,7.092889,Yes,1.0,0.0,Yes,5.0,2.000000,Introvert


In [9]:
test_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
18524,3.000000,No,7.0,4.0,No,6.0,6.560516
18525,7.275027,Yes,0.0,0.0,Yes,5.0,1.000000
18526,3.000000,No,5.0,6.0,No,15.0,9.000000
18527,3.000000,No,4.0,4.0,No,5.0,6.000000
18528,9.000000,Yes,1.0,2.0,Yes,1.0,1.000000
...,...,...,...,...,...,...,...
24694,3.000000,No,5.0,5.0,No,9.0,6.000000
24695,8.000000,Yes,2.0,1.0,Yes,0.0,0.000000
24696,2.000000,No,4.0,3.0,No,9.0,7.000000
24697,3.000000,No,4.0,4.0,No,11.0,9.000000


In [11]:
X_train = train_df.drop("Personality",inplace=False,axis=1)
X_test = test_df
y_train = train_df["Personality"]

In [13]:
categories = X_train.select_dtypes(include=object).columns
for column in categories:
    X_train[column] = LabelEncoder().fit_transform(X_train[column])
    X_test[column] = LabelEncoder().fit_transform(X_test[column])
y_train = LabelEncoder().fit_transform(y_train)

In [19]:
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input

# Define the model
model = Sequential([
    Input(shape=(7,)),  # Input layer with 7 features
    Dense(10, activation='relu'),  # 10 hidden layers
    Dense(1, activation='sigmoid')  # Output layer with sigmoid activation for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model and save the history
history = model.fit(X_train, y_train, epochs=10, batch_size=10, validation_split=0.2)

Epoch 1/10
[1m1482/1482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - accuracy: 0.8947 - loss: 0.3063 - val_accuracy: 0.9733 - val_loss: 0.1279
Epoch 2/10
[1m1482/1482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9688 - loss: 0.1392 - val_accuracy: 0.9730 - val_loss: 0.1226
Epoch 3/10
[1m1482/1482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9670 - loss: 0.1406 - val_accuracy: 0.9730 - val_loss: 0.1204
Epoch 4/10
[1m1482/1482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9671 - loss: 0.1417 - val_accuracy: 0.9730 - val_loss: 0.1180
Epoch 5/10
[1m1482/1482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9681 - loss: 0.1372 - val_accuracy: 0.9727 - val_loss: 0.1185
Epoch 6/10
[1m1482/1482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9672 - loss: 0.1382 - val_accuracy: 0.9730 - val_loss: 0.1170
Epoch 7/10
[1m1

In [21]:
predictions = model.predict(X_test)
predictions

[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 711us/step


array([[0.01967195],
       [0.96617806],
       [0.044902  ],
       ...,
       [0.01426387],
       [0.02438053],
       [0.97188133]], dtype=float32)

In [24]:
labels = ['Introvert' if p >= 0.5 else 'Extrovert' for p in predictions]
labels

['Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Introvert',
 'Introvert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Introvert',
 'Introvert',
 'Introvert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Extrovert',
 'Extrovert',
 'Introvert',
 'Intr

In [25]:
submission_test = pd.read_csv("./inputs/test.csv")
output = pd.DataFrame({'id': submission_test.id, 'Personality': labels})
output.to_csv('./outputs/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
