# Playground Series - Introvert vs. Extrovert

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the three competition files: the training rows (which include the
# Personality target used below), the test rows to predict on, and the
# sample submission that is later copied as the submission template.
df = pd.read_csv(filepath_or_buffer='../playground-series-s5e7/train.csv')
full_x_test = pd.read_csv(filepath_or_buffer='../playground-series-s5e7/test.csv')
full_y_test = pd.read_csv(filepath_or_buffer='../playground-series-s5e7/sample_submission.csv')

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [None]:
# Print the column dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Summary statistics for the columns pandas describes by default.
df.describe()

In [None]:
# Percentage of missing values in each column.
df.isna().mean().mul(100)

## Correlation between Numeric Features

In [None]:
# Pairwise correlation of the numeric columns, drawn as a heatmap.
# `id` is numeric but is treated as a non-feature elsewhere in this notebook
# (removed from the numeric column list and dropped from the frame), so it is
# excluded here as well. errors='ignore' keeps the cell re-runnable after
# `id` has already been dropped from df.
numeric_features = df.select_dtypes(include='number').drop(columns=['id'], errors='ignore')
corr = numeric_features.corr()
sns.heatmap(corr)

## Bar-Plot

In [None]:
# 2x2 grid comparing the two Personality classes on four features.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
panels = axes.ravel()

# Numeric features: one bar per Personality class showing the mean value.
# Personality is passed as `x` (rather than as `hue` with no `x`) because
# older seaborn releases reject `hue` unless both `x` and `y` are given.
numeric_features = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside']
for ax, feature in zip(panels, numeric_features):
    sns.barplot(data=df, x='Personality', y=feature, ax=ax)
    ax.set_title(feature)

# Drained_after_socializing is handled as a categorical column later in this
# notebook (it is ordinal-encoded), so a mean bar is not meaningful for it;
# show the number of rows per answer, split by Personality, instead.
sns.countplot(data=df, x='Drained_after_socializing', hue='Personality', ax=panels[3])
panels[3].set_title('Drained_after_socializing')

fig.tight_layout()

In [None]:
# Names of the non-object (numeric) columns, excluding `id`.
# `id` is filtered inside the comprehension instead of with list.remove(),
# which raises ValueError when the cell is re-run after `id` has been
# dropped from df.
numeric_col = [col for col in df.columns if col != 'id' and df[col].dtype != 'object']
numeric_col

## Dist-Plot

In [None]:
# One distribution plot (histogram plus KDE curve) per numeric feature.
# sns.distplot is deprecated, so the plot is drawn with histplot using the
# settings seaborn documents as its replacement: a density-normalised
# histogram with a KDE overlay extended 3 bandwidths past the data.
for col in numeric_col:
    sns.histplot(x=df[col], kde=True, stat='density', kde_kws=dict(cut=3))
    plt.title(col)
    plt.tight_layout()
    plt.show()

## Box-Plot

In [None]:
# One horizontal box plot per numeric feature, each in its own figure.
for col in numeric_col:
    axis = sns.boxplot(x=df[col])
    axis.set_title(col)
    plt.tight_layout()
    plt.show()

## Removing extra features

In [None]:
# Drop the `id` column from the training frame.
df = df.drop(labels=['id'], axis='columns')

In [None]:
# Number of training rows per Personality class.
df['Personality'].value_counts()

## Removing Outlier Rows

In [None]:
def remove_outliers_iqr(df, col, factor=1.5):
    """Return the rows of ``df`` whose ``col`` value is not an IQR outlier.

    A value counts as an outlier when it lies outside the closed interval
    ``[Q1 - factor * IQR, Q3 + factor * IQR]``, where Q1 and Q3 are the 25th
    and 75th percentiles of ``col`` and ``IQR = Q3 - Q1``.

    Rows where ``col`` is missing are kept. A missing value is not an
    outlier, and a plain ``>=`` / ``<=`` mask would silently discard those
    rows because comparisons with NaN evaluate to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    col : str
        Name of the numeric column the rule is applied to.
    factor : float, default 1.5
        Multiple of the IQR added beyond the quartiles to form the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall inside the fences or have a missing
        ``col`` value, with their original index labels.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)
    # Series.between is inclusive on both ends, matching the >= / <= fences.
    keep = values.between(q1 - margin, q3 + margin) | values.isna()
    return df[keep]
# Apply the IQR filter to Time_spent_Alone only; no other column is filtered.
df = remove_outliers_iqr(df=df, col='Time_spent_Alone')

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

# Separate the Personality target from the feature columns.
y = df['Personality']
x = df.drop(columns='Personality')

# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=12,
)

In [None]:
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Encoding Target Column

In [None]:
# Learn the label -> integer mapping from the training targets only, then
# apply that same mapping to both splits.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
# Columns routed to the categorical branch of the preprocessing step.
cat_cols = ['Drained_after_socializing', 'Stage_fear']
# Every feature column whose dtype in df is not `object` goes to the
# numeric branch.
numeric_col = [
    name
    for name in x_train.columns
    if df.dtypes[name] != 'object'
]

## Transforming the features

In [None]:
# Numeric branch: fill missing values with SimpleImputer's default strategy.
numeric_steps = Pipeline(steps=[('impute', SimpleImputer())])
numeric_trf = ('num_trf', numeric_steps, numeric_col)

# Categorical branch: fill missing values with the most frequent category,
# then map each category to an ordinal code.
cat_steps = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder()),
])
cat_trf = ('cat', cat_steps, cat_cols)

In [None]:
# Route each column list through its own branch; any column not listed in
# either branch is passed through unchanged.
transformer = ColumnTransformer(
    [numeric_trf, cat_trf],
    remainder='passthrough',
)

## Model Training — XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

# Hand-set hyper-parameters for the baseline classifier.
xgb_params = {
    'n_estimators': 200,
    'max_depth': 3,
    'learning_rate': 0.1,
    'gamma': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'n_jobs': -1,
    'random_state': 12,
}
xgb = XGBClassifier(**xgb_params)

In [None]:
# Chain preprocessing and the classifier so both are fitted together.
pipe = Pipeline(steps=[('transformer', transformer), ('clf', xgb)])

In [None]:
# Fit on the training split, then predict the held-out split.
y_pred = pipe.fit(x_train, y_train).predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score,ConfusionMatrixDisplay

In [None]:
# Fraction of held-out rows predicted correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Confusion matrix of the held-out predictions (axes use the encoded labels).
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)

## Using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over XGBoost hyper-parameters, scored by 5-fold
# cross-validated accuracy on the training split. Keys use the `clf__`
# prefix so they reach the classifier step of the pipeline.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'clf__subsample': [0.6, 0.8, 1.0],
    'clf__colsample_bytree': [0.6, 0.8, 1.0],
    'clf__gamma': [0, 1, 5],
}

# `use_label_encoder` is not passed: it is deprecated in XGBoost and only
# produces an "unused parameter" warning on current releases. The targets
# are already integer-encoded by `le`, so no built-in encoding is needed.
# random_state matches the baseline model so that the row/column subsampling
# tried by the grid is reproducible between runs.
xgb = XGBClassifier(eval_metric='logloss', random_state=12)
pipe_xgb_grid = Pipeline([
    ('transformer', transformer),
    ('clf', xgb),
])
grid_xgb = GridSearchCV(
    estimator=pipe_xgb_grid,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid_xgb.fit(x_train, y_train)

# Evaluate the refitted best estimator on the held-out split. The score is
# printed explicitly because a notebook cell only echoes its last
# expression, so a bare accuracy_score(...) call here would be discarded.
y_pred = grid_xgb.predict(x_test)
print('Hold-out accuracy:', accuracy_score(y_test, y_pred))
grid_xgb.best_params_

## Testing the Original Test Dataset

In [None]:
# Remove the `id` column from the competition test frame in place, so its
# columns match the features the pipeline was trained on.
full_x_test.drop(labels=['id'], axis='columns', inplace=True)

In [None]:
# Re-fit the label encoder on the full target column and encode it.
y_encode = le.fit(y).transform(y)

In [None]:
# Re-train the baseline pipeline on every labelled row (train + hold-out).
pipe.fit(X=x, y=y_encode)

In [None]:
# Predict encoded classes for the competition test rows, then map the
# integer codes back to the original Personality labels.
test_preds = pipe.predict(X=full_x_test)
test_preds_original = le.inverse_transform(y=test_preds)

## Submission

In [None]:
# Build the submission from the sample file, replacing its Personality
# column with the predicted labels, and write it without the index.
submission = full_y_test.assign(Personality=test_preds_original)
submission.to_csv('final_submission.csv', index=False)