In [36]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from catboost import CatBoostClassifier

base_dir = "./inputs"
files = os.listdir(base_dir)
files

['sample_submission.csv', 'test.csv', 'train.csv']

In [37]:
train_df = pd.read_csv(os.path.join(base_dir, files[2])).set_index("id")
train_df = train_df.dropna()
target = train_df["Personality"]
train_df.drop("Personality", inplace=True, axis=1)
train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,No,6.0,4.0,No,15.0,5.0
1,1.0,No,7.0,3.0,No,10.0,8.0
3,3.0,No,7.0,3.0,No,11.0,5.0
7,2.0,No,8.0,3.0,No,4.0,5.0
9,1.0,No,8.0,6.0,No,14.0,9.0
...,...,...,...,...,...,...,...
18509,1.0,No,3.0,4.0,No,15.0,4.0
18511,0.0,No,4.0,5.0,No,11.0,4.0
18514,6.0,No,5.0,3.0,No,10.0,4.0
18519,3.0,No,7.0,3.0,No,9.0,7.0


In [38]:
# Encoding categorical columns
categories = train_df.select_dtypes(include=object).columns

for column in categories:
    train_df[column] = LabelEncoder().fit_transform(train_df[column])

target = LabelEncoder().fit_transform(target)
train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,0,6.0,4.0,0,15.0,5.0
1,1.0,0,7.0,3.0,0,10.0,8.0
3,3.0,0,7.0,3.0,0,11.0,5.0
7,2.0,0,8.0,3.0,0,4.0,5.0
9,1.0,0,8.0,6.0,0,14.0,9.0
...,...,...,...,...,...,...,...
18509,1.0,0,3.0,4.0,0,15.0,4.0
18511,0.0,0,4.0,5.0,0,11.0,4.0
18514,6.0,0,5.0,3.0,0,10.0,4.0
18519,3.0,0,7.0,3.0,0,9.0,7.0


In [39]:
# Scale the non-categorical columns
non_categories = train_df.select_dtypes(include=["number"]).columns

train_df[non_categories] = StandardScaler().fit_transform(train_df[non_categories])

train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-1.001004,-0.447661,0.122633,-0.236003,-0.447819,1.595076,-0.184829
1,-0.620847,-0.447661,0.507691,-0.766592,-0.447819,0.367511,0.922299
3,0.139467,-0.447661,0.507691,-0.766592,-0.447819,0.613024,-0.184829
7,-0.240690,-0.447661,0.892749,-0.766592,-0.447819,-1.105568,-0.184829
9,-0.620847,-0.447661,0.892749,0.825176,-0.447819,1.349563,1.291341
...,...,...,...,...,...,...,...
18509,-0.620847,-0.447661,-1.032539,-0.236003,-0.447819,1.595076,-0.553872
18511,-1.001004,-0.447661,-0.647482,0.294587,-0.447819,0.613024,-0.553872
18514,1.279938,-0.447661,-0.262424,-0.766592,-0.447819,0.367511,-0.553872
18519,0.139467,-0.447661,0.507691,-0.766592,-0.447819,0.121998,0.553256


In [40]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10189 entries, 0 to 18523
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           10189 non-null  float64
 1   Stage_fear                 10189 non-null  float64
 2   Social_event_attendance    10189 non-null  float64
 3   Going_outside              10189 non-null  float64
 4   Drained_after_socializing  10189 non-null  float64
 5   Friends_circle_size        10189 non-null  float64
 6   Post_frequency             10189 non-null  float64
dtypes: float64(7)
memory usage: 636.8 KB


In [41]:
X_train, X_test, y_train, y_test = train_test_split(train_df, target, test_size=0.2, random_state=42)

In [42]:
model = CatBoostClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)
matrix = confusion_matrix(y_test, predictions)
print(accuracy)
print(report)
print(matrix)

Learning rate set to 0.025236
0:	learn: 0.6513116	total: 2.34ms	remaining: 2.34s
1:	learn: 0.6138189	total: 5.46ms	remaining: 2.73s
2:	learn: 0.5791170	total: 8.1ms	remaining: 2.69s
3:	learn: 0.5470065	total: 10.6ms	remaining: 2.65s
4:	learn: 0.5168885	total: 13.4ms	remaining: 2.66s
5:	learn: 0.4892155	total: 16.2ms	remaining: 2.69s
6:	learn: 0.4628688	total: 19.2ms	remaining: 2.73s
7:	learn: 0.4390017	total: 21.8ms	remaining: 2.7s
8:	learn: 0.4175089	total: 24.1ms	remaining: 2.65s
9:	learn: 0.3976507	total: 26.5ms	remaining: 2.62s
10:	learn: 0.3793038	total: 28.8ms	remaining: 2.59s
11:	learn: 0.3617003	total: 31.5ms	remaining: 2.59s
12:	learn: 0.3449522	total: 34.1ms	remaining: 2.59s
13:	learn: 0.3305912	total: 36.9ms	remaining: 2.6s
14:	learn: 0.3172970	total: 39.4ms	remaining: 2.58s
15:	learn: 0.3050287	total: 41.6ms	remaining: 2.56s
16:	learn: 0.2935834	total: 44.3ms	remaining: 2.56s
17:	learn: 0.2832216	total: 46.3ms	remaining: 2.52s
18:	learn: 0.2731868	total: 48.8ms	remaining: 2

In [47]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input

# Define the model
model = Sequential([
    Input(shape=(7,)),
    Dense(10, activation='relu'),  # Input layer with 2 features
    Dense(1, activation='sigmoid')  # Output layer with sigmoid activation for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model and save the history
history = model.fit(X_train, y_train, epochs=10, batch_size=10, validation_split=0.2)

Epoch 1/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.7230 - loss: 0.4897 - val_accuracy: 0.9706 - val_loss: 0.1418
Epoch 2/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9629 - loss: 0.1631 - val_accuracy: 0.9700 - val_loss: 0.1288
Epoch 3/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9658 - loss: 0.1425 - val_accuracy: 0.9700 - val_loss: 0.1265
Epoch 4/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9579 - loss: 0.1646 - val_accuracy: 0.9700 - val_loss: 0.1258
Epoch 5/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9618 - loss: 0.1536 - val_accuracy: 0.9700 - val_loss: 0.1252
Epoch 6/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9594 - loss: 0.1630 - val_accuracy: 0.9700 - val_loss: 0.1253
Epoch 7/10
[1m652/652[0m 

In [48]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")
print(loss)

[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9581 - loss: 0.1645
Test Accuracy: 0.9578017592430115
0.16321192681789398
