In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
# Load the competition files into DataFrames.
#
# Kaggle notebook: read from these paths instead of the local ones below.
#   /kaggle/input/playground-series-s5e7/train.csv
#   /kaggle/input/playground-series-s5e7/test.csv
#   /kaggle/input/playground-series-s5e7/sample_submission.csv
#
# Personal notebook: the same three files live in the local Data/ folder.
train_df, test_df, sample_subm = map(
    pd.read_csv,
    ('Data/train.csv', 'Data/test.csv', 'Data/sample_submission.csv'),
)

# Shape check for the two frames used by the rest of the notebook.
print(f'Train-{train_df.shape}\nTest-{test_df.shape}')

Train-(18524, 9)
Test-(6175, 8)


In [5]:
# Preview the first rows of the raw training data (features plus the Personality label).
train_df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


## EDA

In [6]:
# Train: column dtypes / non-null counts, missing values per column, numeric summary.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare -- wrapping it in print() only appends a stray "None" line.
train_df.info()
print(train_df.isna().sum())
print(train_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB
None
id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Frien

In [7]:
# Test: column dtypes / non-null counts, missing values per column, numeric summary.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare -- wrapping it in print() only appends a stray "None" line.
test_df.info()
print(test_df.isna().sum())
print(test_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6175 entries, 0 to 6174
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         6175 non-null   int64  
 1   Time_spent_Alone           5750 non-null   float64
 2   Stage_fear                 5577 non-null   object 
 3   Social_event_attendance    5778 non-null   float64
 4   Going_outside              5709 non-null   float64
 5   Drained_after_socializing  5743 non-null   object 
 6   Friends_circle_size        5825 non-null   float64
 7   Post_frequency             5767 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 386.1+ KB
None
id                             0
Time_spent_Alone             425
Stage_fear                   598
Social_event_attendance      397
Going_outside                466
Drained_after_socializing    432
Friends_circle_size          350
Post_frequency               408
d

## Cleaning

In [8]:
# Fixing NA fields
def na_fill(df):
    """Fill every missing value in ``df`` and return the frame.

    Numeric columns are filled with that column's own mean; object-dtype
    columns are filled with the literal string ``'No'``.

    The frame is modified in place and the very same object is returned, so
    the variable the caller passed in also sees the filled values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.
    """
    num_cols = df.select_dtypes(include=['number']).columns
    obj_cols = df.select_dtypes(include=['object']).columns
    for n_col in num_cols:
        # Assign the filled column back rather than calling
        # ``df[col].fillna(..., inplace=True)``: that form operates on a
        # temporary Series, is deprecated in pandas 2.x and does not update
        # ``df`` at all once Copy-on-Write is enabled.
        df[n_col] = df[n_col].fillna(df[n_col].mean())
    for o_col in obj_cols:
        df[o_col] = df[o_col].fillna('No')
    return df

In [9]:
# Fill both frames first, then report the remaining missing-value counts
# (every count should now be zero).
train_filled = na_fill(train_df)
test_filled = na_fill(test_df)

train_missing = train_filled.isna().sum()
test_missing = test_filled.isna().sum()
print(f'---Train---\n{train_missing}\n')
print(f'---Test---\n{test_missing}')

---Train---
id                           0
Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
Personality                  0
dtype: int64

---Test---
id                           0
Time_spent_Alone             0
Stage_fear                   0
Social_event_attendance      0
Going_outside                0
Drained_after_socializing    0
Friends_circle_size          0
Post_frequency               0
dtype: int64


In [10]:
# Mapping objects to numericals
# List the columns still stored as strings (object dtype); these need a numeric
# encoding before modelling. select_dtypes returns them directly, whereas
# describe() would compute summary statistics only to read the column names and
# raises when no object columns are left.
map_cols = train_filled.select_dtypes(include=['object']).columns
train_filled[map_cols].head()

Unnamed: 0,Stage_fear,Drained_after_socializing,Personality
0,No,No,Extrovert
1,No,No,Extrovert
2,Yes,No,Introvert
3,No,No,Extrovert
4,No,No,Extrovert


In [11]:
# Encode the Yes/No answers and the Personality label as 0/1 integers.
yes_no_codes = {'Yes': 1, 'No': 0}
personality_codes = {'Extrovert': 1, 'Introvert': 0}


def _encode_labels(frame, column, codes):
    """Replace the string labels in ``frame[column]`` with their codes, in place.

    A column that is already numeric is left untouched, so re-running this
    cell is harmless (mapping an already-encoded column again would turn
    every value into NaN). Missing values stay missing. A non-missing value
    that has no entry in ``codes`` raises ``ValueError`` instead of silently
    becoming NaN.
    """
    if pd.api.types.is_numeric_dtype(frame[column]):
        return  # already encoded by a previous run of this cell
    encoded = frame[column].map(codes)
    unknown = frame[column].notna() & encoded.isna()
    if unknown.any():
        bad_values = frame.loc[unknown, column].unique().tolist()
        raise ValueError(f'Unexpected values in {column!r}: {bad_values}')
    frame[column] = encoded


_encode_labels(train_filled, 'Stage_fear', yes_no_codes)
_encode_labels(train_filled, 'Drained_after_socializing', yes_no_codes)
_encode_labels(train_filled, 'Personality', personality_codes)

# The test frame has no Personality column, so only the two Yes/No features apply.
_encode_labels(test_filled, 'Stage_fear', yes_no_codes)
_encode_labels(test_filled, 'Drained_after_socializing', yes_no_codes)

In [12]:
# Preview the test frame after NA filling and Yes/No encoding.
test_filled.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,0,7.0,4.0,0,6.0,5.028958
1,18525,3.11687,1,0.0,0.0,1,5.0,1.0
2,18526,3.0,0,5.0,6.0,0,15.0,9.0
3,18527,3.0,0,4.0,4.0,0,5.0,6.0
4,18528,9.0,1,1.0,2.0,1,1.0,1.0


## Modeling Preparation

In [13]:
# Training side: the label vector, and the feature matrix without the row id
# and without the label itself.
non_feature_cols = ['id', 'Personality']
target = train_filled['Personality']
rdy_data = train_filled.drop(columns=non_feature_cols)

# Test side: test_rdy is the feature matrix passed to the model (id removed);
# test_final is a separate copy that keeps id for building the submission.
test_rdy = test_filled.drop(columns='id')
test_final = test_filled.copy()

In [25]:
# Sanity check: feature matrix dimensions (rows, feature columns).
rdy_data.shape

(18524, 7)

In [24]:
# Sanity check: the label vector should have one entry per row of rdy_data.
target.shape

(18524,)

In [16]:
# Hold out 20% of the labelled rows for a local accuracy check; the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    rdy_data,
    target,
    test_size=0.2,
    random_state=20,
)

# Report the shape of each piece, one "name / shape" pair per paragraph.
split_parts = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
print('\n\n'.join(f'{name}\n{part.shape}' for name, part in split_parts.items()))

X_train
(14819, 7)

X_test
(3705, 7)

y_train
(14819,)

y_test
(3705,)


## Model Creation

In [17]:
# Baseline: logistic regression fitted on the training split, then scored on
# the held-out split in the following cell.
lr = LogisticRegression(random_state=21)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

## Metric Check

In [18]:
# Hold-out accuracy, expressed as a percentage.
100 * accuracy_score(y_test, y_pred)

96.49122807017544

## Official Run

In [27]:
# Hold-out accuracy is acceptable, so refit on every labelled row and
# predict the competition test set.
lr_final = LogisticRegression(random_state=49)
lr_final.fit(rdy_data, target)
lr_final_pred = lr_final.predict(test_rdy)

## Test Dataset Update/Remap

In [37]:
# Attach the predictions to the test rows and turn the 0/1 codes back into labels.
# The predictions are placed on test_final's own index so they line up
# row-for-row; wrapping them in a fresh DataFrame would align on index labels
# and leave NaN wherever those labels differ.
label_names = {1: 'Extrovert', 0: 'Introvert'}
test_final['Personality'] = pd.Series(lr_final_pred, index=test_final.index).map(label_names)

# Select the submission columns by name rather than by position, so the result
# does not depend on the column order of test_final.
test_submit = test_final[['id', 'Personality']]
test_submit.head()

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert


In [38]:
# Count missing values per submission column; both counts should be zero.
# NOTE(review): the "was 2470" remark presumably records a NaN count seen in an
# earlier run of this notebook -- confirm with the author.
test_submit.isna().sum() #was 2470

id             0
Personality    0
dtype: int64

In [39]:
# Write the submission file; index=False keeps the DataFrame row index out of the CSV.
test_submit.to_csv('submission.csv', index=False)