# Import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Import Data Set and Early Exploration

In [2]:
# Load the data set from a path relative to the notebook's working directory.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so the CSV
# must be placed next to the notebook (or the path adjusted) before running the rest.
df = pd.read_csv('recruitment_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'recruitment_data.csv'

In [None]:
df.head()  # Show the first 5 rows of the data set

In [None]:
df.info()  # Column names, non-null counts and dtypes of the data set

In [None]:
#Separate the column names into numerical and categorical features
numerical = list(df.select_dtypes(include="number").columns)    # names of the numeric columns
categorical = list(df.select_dtypes(include='object').columns)  # names of the object-dtype columns
for column_group in (numerical, categorical):
    print(column_group)

# Check for Missing Value and Duplicated Row

In [None]:
#Share of missing values per column, shown as a horizontal bar chart
missing_pct = df.isna().sum().div(len(df)).mul(100)  # null count / row count, as a percentage
missing_ax = missing_pct.plot(kind='barh')
missing_ax.set_xlabel('Percentage of Missing Value (%)')
missing_ax.set_ylabel('Column Name')
missing_ax.set_title('Missing Value')

In [None]:
# Count the rows that DataFrame.duplicated() flags as repeats of an earlier row
df.duplicated().sum()

**Action:** Because only a small fraction of the data is missing (at most 8% null values in any column), we drop the rows that contain missing values.

In [None]:
#Drop every row that contains at least one missing value
# Rebinding the name instead of mutating in place gives the same frame under `df`
df = df.dropna(axis=0)

In [None]:
# Re-check the null count per column after dropping the incomplete rows
df.isnull().sum()

In [None]:
# (rows, columns) of the data set after removing rows with missing values
df.shape

In [None]:
# Number of distinct Serial_no values; compare with the row count to see whether
# the column is just a per-row identifier
df['Serial_no'].nunique()

In [None]:
#Serial_no is unique in every row, so it is only an identifier and carries no
#information for the analysis; remove it
df = df.drop(columns=['Serial_no'])

In [None]:
# Preview the columns that remain in use after the clean-up
df.head()

# Simple Exploratory Data Analysis

## Univariate Analysis

### Categorical Column 

In [None]:
#Value Counts of Categorical Column
# One bar chart per categorical column, laid out side by side in a single row
fig = plt.figure(figsize=(20,7))
ax_value = []  # collects the axes of each panel, in column order
for position, column in enumerate(categorical, start=1):
    ax = fig.add_subplot(1, len(categorical), position)
    df[column].value_counts().plot(kind='bar', color='red', ax=ax)
    ax.set_title(f'count of {column}')
    ax.set_xlabel(f'{column} Column')
    ax.set_ylabel('Value Counts')
    ax_value.append(ax)
# Adjust the spacing once, after every panel has been drawn
fig.tight_layout()
        

**Conclusion:** Most of the candidates have no prior Python experience and no internship experience. The candidate pool is dominated by males, and most of the candidates hold a degree (Graduate).

Based on the target (Recruitment_Status), we can see that the data set is imbalanced. Thus, imbalanced-data handling must be performed before training the model.

In [None]:
# Statistical summary of the categorical columns
df[categorical].describe()

### Numerical Column

In [None]:
#Serial_no has already been dropped, so rebuild the list of numerical column
#names from the current frame
numerical_new = [column for column in df.select_dtypes(include='number').columns]
numerical_new

In [None]:
#Distribution Plot for Numerical Column
# One panel per numerical column, laid out side by side in a single row
fig = plt.figure(figsize=(20,7))
ax_value = []  # collects the axes of each panel, in column order
for position, column in enumerate(numerical_new, start=1):
    ax = fig.add_subplot(1, len(numerical_new), position)
    # Overlay the KDE curve and a normalised histogram on the same axes
    df[column].plot(kind='kde', color='Blue', ax=ax)
    df[column].plot(kind='hist', density=True, bins=16, color='orange', ax=ax)
    ax.set_title(f'{column} Distribution')
    ax.set_xlabel(f'{column} Column')
    # Both layers are drawn on a density scale (kde, hist with density=True), not as counts
    ax.set_ylabel('Density')
    ax_value.append(ax)
# Adjust the spacing once, after every panel has been drawn
fig.tight_layout()
        

**Conclusion:** The Experience Years and Offer History columns are discrete numerical variables, while Score and Salary (continuous variables) appear to have skewed distributions. Thus, we expect outliers in the Score and Salary columns.

In [None]:
# Statistical summary of the numerical features
df[numerical_new].describe()

In [None]:
#Boxplot for Continuous Numerical Column
fig = plt.figure(figsize=(20,15))
continous_numerical = ['Score', 'Salary * 10E4']
ax_value = []  # collects the axes of each panel, in column order
for position, column in enumerate(continous_numerical, start=1):
    ax = fig.add_subplot(1, len(continous_numerical), position)
    df[column].plot(kind='box', color='orange', ax=ax)
    ax.set_title(f'{column} Box Plot')
    ax_value.append(ax)
# Adjust the spacing once, after every panel has been drawn
fig.tight_layout()

**Possible Action:** Since Score is derived from the candidates' test and interview scores, it makes sense that some candidates are exceptionally strong. For Salary, it also makes sense that some candidates had a higher previous salary because of their experience. So the outliers in these two columns will be kept for modelling.

## Correlation Matrix

In [None]:
#Plot the Correlation Matrix for Numerical Column
plt.figure(figsize=(15,10))
# The categorical columns are still strings at this point. Select the numeric columns
# explicitly: pandas >= 2.0 no longer drops non-numeric columns in corr() and raises instead.
correlation = df.select_dtypes('number').corr()
sns.heatmap(correlation, annot=True, fmt='.2f')

**Conclusion:** No column has a correlation value greater than 0.8 with another column. Thus, we can carry on with all the features.

# Pre Processing Categorical Variable (Categorical ---> Numerical)

## Python_exp and Internship Column

In [None]:
# Check the column dtypes before encoding the categorical columns
df.info()

In [None]:
# Distinct values of the Internship column
df['Internship'].unique()

In [None]:
# Distinct values of the Python_exp column
df['Python_exp'].unique()

In [None]:
#Encoding for the Python_exp and Internship columns (Yes -> 1, No -> 0)
y_n_enc = {'Yes': 1, 'No': 0}
# Series.map yields NaN for any value that is not a key of the mapping, so running this
# cell a second time on the already-encoded column would turn every value into NaN
df = df.assign(Internship=df['Internship'].map(y_n_enc))
df.info()

In [None]:
# Apply the same Yes/No mapping (y_n_enc) to the Python_exp column
df = df.assign(Python_exp=df['Python_exp'].map(y_n_enc))
df.info()

## Recruitment_Status Column

In [None]:
# Distinct values of the Recruitment_Status column
df['Recruitment_Status'].unique()

In [None]:
#Binary encoding for the Recruitment_Status column (Y -> 1, N -> 0)
rec_enc = {'Y': 1, 'N': 0}
# Values outside the mapping become NaN, so this cell must only run once on the raw column
df = df.assign(Recruitment_Status=df['Recruitment_Status'].map(rec_enc))
df.info()

## Gender Column

In [None]:
# Distinct values of the Gender column
df['Gender'].unique()

In [None]:
#Binary encoding for the Gender column (Male -> 1, Female -> 0)
gen_enc = {'Male': 1, 'Female': 0}
# Values outside the mapping become NaN, so this cell must only run once on the raw column
df = df.assign(Gender=df['Gender'].map(gen_enc))
df.info()

## Education Column

In [None]:
# Distinct values of the Education column
df['Education'].unique()

In [None]:
#Binary encoding for the Education column (Graduate -> 1, Not Graduate -> 0)
ed_enc = {'Graduate': 1, 'Not Graduate': 0}
# Values outside the mapping become NaN, so this cell must only run once on the raw column
df = df.assign(Education=df['Education'].map(ed_enc))
df.info()

## Location Column 

In [None]:
# Distinct values of the Location column
df['Location'].unique()

**Possible Action:** Candidates who live in urban areas have a higher tendency to become top candidates (urban education is better than rural education, and urban areas are also more competitive). Thus, we should perform ordinal encoding for this column.

Rank for Ordinal Encoding: 

Urban (Highest) -> Semiurban -> Rural

In [None]:
#Ordinal encoding for the Location column: Urban (3) > Semiurban (2) > Rural (1)
loc_enc = {'Urban': 3, 'Semiurban': 2, 'Rural': 1}
# Values outside the mapping become NaN, so this cell must only run once on the raw column
df = df.assign(Location=df['Location'].map(loc_enc))
df.info()

In [None]:
#Download Preprocessed Data Set
from IPython.display import FileLink
# Import a module to delete the file
import os
# Create a download function
def csv_download_link(df, csv_file_name, delete_prompt=True):
    """Write a data frame to a CSV file and display a download link to it in the notebook.

    Parameters
    ----------
    df : pandas data frame
        Data to export; it is written without the index column.
    csv_file_name : str
        Path of the CSV file to create.
    delete_prompt : bool
        When True (the default), wait until the user presses enter and then
        delete the CSV file again. When False, the file is left on disk.
    """
    df.to_csv(csv_file_name, index=False)
    # `display` is not imported in this cell; presumably the built-in that IPython
    # provides inside notebook sessions
    display(FileLink(csv_file_name))
    if delete_prompt:
        # The typed text is irrelevant: input() only pauses until the user confirms
        input('Press enter to delete the file after you have downloaded it.')
        os.remove(csv_file_name)


# Use the function to display a download link for the preprocessed data set
csv_download_link(df, 'cleaned Recruitment.csv')

# Modelling and Best Model Selection

In [None]:
#Split Target and Feature
X = df.copy()                    # Features: every column except the target
Y = X.pop('Recruitment_Status')  # Target; pop also removes the column from X

In [None]:
#Perform SMOTE Imbalanced Data Handling
from imblearn.over_sampling import SMOTE

# Fix the seed so the synthetic samples, and every score computed from them, are reproducible
# NOTE(review): the resampling runs on the full data set, before the train/test split.
# Synthetic points interpolated from rows that later land in the test set can end up in
# the training set, so the evaluation scores may be optimistic. Consider splitting first
# and resampling only the training portion.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, Y)

In [None]:
# Shapes of the feature matrix and target after SMOTE resampling
X_res.shape, y_res.shape

In [None]:
#Split Train Test Data
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled rows for testing (80% train); the fixed seed keeps the split repeatable
split_parts = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Shapes of the training features and training target
X_train.shape, y_train.shape

In [None]:
#Multiple Modelling and Evaluate using AUC Score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score

# Seed the stochastic estimators so the comparison table is reproducible between runs
model_alg = [RandomForestClassifier(random_state=42),
             DecisionTreeClassifier(random_state=42),
             KNeighborsClassifier(),
             LogisticRegression()
             ]

# Collect one record per model and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
eval_records = []
for model in model_alg:
    model.fit(X_train, y_train)
    algorithm = type(model).__name__  # class name of the estimator, e.g. 'RandomForestClassifier'
    # Second column of predict_proba: probability of the class listed second in model.classes_
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    eval_records.append({'Algorithm': algorithm, 'AUC Score': auc})

df_mod_eval = pd.DataFrame(eval_records, columns=['Algorithm', 'AUC Score'])
df_mod_eval

**Conclusion:** The Random Forest Classifier gives the best model for the classification.

# Model Evaluation

In [None]:
# Refit the selected model; the fixed seed makes the reported metrics reproducible between runs
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
#Classification Report
from sklearn.metrics import classification_report

# Predicted labels for the held-out test set, summarised per class
y_pred = rfc.predict(X_test)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

In [None]:
#ROC Curve
from sklearn.metrics import roc_curve, roc_auc_score

# Second column of predict_proba: probability of the class listed second in rfc.classes_
y_pred_proba = rfc.predict_proba(X_test)[:, 1]

# Actual labels and predicted probabilities side by side, indexed like y_test
df_actual_predicted = pd.DataFrame(
    {'y_actual': np.array(y_test), 'y_pred_proba': y_pred_proba},
    index=y_test.index,
)
actual_labels = df_actual_predicted['y_actual']
predicted_scores = df_actual_predicted['y_pred_proba']
fpr, tpr, tr = roc_curve(actual_labels, predicted_scores)
auc = roc_auc_score(actual_labels, predicted_scores)

plt.plot(fpr, tpr, label='AUC = %0.4f' % auc)
# Diagonal reference line (TPR equal to FPR)
plt.plot(fpr, fpr, linestyle='--', color='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predict first, then tabulate actual vs. predicted labels in the order of rfc.classes_
target_pred = rfc.predict(X_test)
cm = confusion_matrix(y_test, target_pred, labels=rfc.classes_)

# Draw the matrix on an explicitly sized axes
fig, ax = plt.subplots(figsize=(10,7))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rfc.classes_)
disp.plot(ax=ax)
plt.show()

**Conclusion :** The Model has Sufficient Accuracy (88%)

In [None]:
# Feature Importance
arr_feature_importances = rfc.feature_importances_
arr_feature_names = X_train.columns.values

# Pair every feature name with its importance score
df_feature_importance = pd.DataFrame({
    'feature': arr_feature_names,
    'importance': arr_feature_importances,
})
# Highest importance first, with the feature name as the index so it labels the bars
df_all_features = (
    df_feature_importance
    .sort_values(by='importance', ascending=False)
    .set_index('feature')
)
importance_ax = df_all_features.plot(kind='barh')
importance_ax.set_title('Feature Importance')
importance_ax.set_xlabel('Importance Score')
importance_ax.set_ylabel('Feature')