In [21]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

base_dir = "./inputs"
files = os.listdir(base_dir)
files

['sample_submission.csv', 'test.csv', 'train.csv']

In [22]:
train_df = pd.read_csv(os.path.join(base_dir, files[2])).set_index("id")
train_df = train_df.dropna()
target = train_df["Personality"]
train_df.drop("Personality", inplace=True, axis=1)
train_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.0,No,6.0,4.0,No,15.0,5.0
1,1.0,No,7.0,3.0,No,10.0,8.0
3,3.0,No,7.0,3.0,No,11.0,5.0
7,2.0,No,8.0,3.0,No,4.0,5.0
9,1.0,No,8.0,6.0,No,14.0,9.0
...,...,...,...,...,...,...,...
18509,1.0,No,3.0,4.0,No,15.0,4.0
18511,0.0,No,4.0,5.0,No,11.0,4.0
18514,6.0,No,5.0,3.0,No,10.0,4.0
18519,3.0,No,7.0,3.0,No,9.0,7.0


In [23]:
test_df = pd.read_csv(os.path.join(base_dir, files[1])).set_index("id")
test_df

Unnamed: 0_level_0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
18524,3.0,No,7.0,4.0,No,6.0,
18525,,Yes,0.0,0.0,Yes,5.0,1.0
18526,3.0,No,5.0,6.0,No,15.0,9.0
18527,3.0,No,4.0,4.0,No,5.0,6.0
18528,9.0,Yes,1.0,2.0,Yes,1.0,1.0
...,...,...,...,...,...,...,...
24694,3.0,No,5.0,5.0,No,9.0,6.0
24695,8.0,Yes,2.0,1.0,Yes,0.0,0.0
24696,2.0,No,4.0,3.0,No,9.0,7.0
24697,3.0,No,4.0,4.0,No,11.0,9.0


In [24]:
# Encode categorical columns

categories = train_df.select_dtypes(include=["object"]).columns

for column in categories:
    train_df[column] = LabelEncoder().fit_transform(train_df[column])
    test_df[column] = LabelEncoder().fit_transform(test_df[column])


In [25]:
# Scaling non_categorical columns

non_categories = train_df.select_dtypes(include=["number"]).columns
scaler = StandardScaler()

train_df[non_categories] = scaler.fit_transform(train_df[non_categories])
test_df[non_categories] = scaler.fit_transform(test_df[non_categories])

In [26]:
X_train = train_df
y_train = target
X_test = test_df

In [28]:
model = CatBoostClassifier()
model.fit(X_train,y_train)
predictions = model.predict(X_test)
predictions

Learning rate set to 0.027759
0:	learn: 0.6480454	total: 5.69ms	remaining: 5.68s
1:	learn: 0.6077566	total: 10.5ms	remaining: 5.25s
2:	learn: 0.5706637	total: 16.2ms	remaining: 5.38s
3:	learn: 0.5365369	total: 22.4ms	remaining: 5.59s
4:	learn: 0.5049338	total: 27.9ms	remaining: 5.56s
5:	learn: 0.4760129	total: 33.4ms	remaining: 5.53s
6:	learn: 0.4487297	total: 39.1ms	remaining: 5.54s
7:	learn: 0.4243022	total: 44.7ms	remaining: 5.54s
8:	learn: 0.4024303	total: 50.7ms	remaining: 5.58s
9:	learn: 0.3823277	total: 56.2ms	remaining: 5.57s
10:	learn: 0.3640281	total: 61.9ms	remaining: 5.57s
11:	learn: 0.3465507	total: 67.4ms	remaining: 5.55s
12:	learn: 0.3301288	total: 73.1ms	remaining: 5.55s
13:	learn: 0.3160629	total: 79.6ms	remaining: 5.6s
14:	learn: 0.3027394	total: 86.8ms	remaining: 5.7s
15:	learn: 0.2900656	total: 93.3ms	remaining: 5.74s
16:	learn: 0.2793013	total: 97.8ms	remaining: 5.65s
17:	learn: 0.2694050	total: 104ms	remaining: 5.67s
18:	learn: 0.2601905	total: 109ms	remaining: 5.

array(['Extrovert', 'Extrovert', 'Extrovert', ..., 'Extrovert',
       'Extrovert', 'Extrovert'], dtype=object)

In [29]:
submission_test = pd.read_csv("./inputs/test.csv")
output = pd.DataFrame({'id': submission_test.id, 'Personality': predictions})
output.to_csv('./outputs/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
