In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import warnings
# Suppress every warning raised for the rest of the notebook.
# NOTE(review): with no category given this filter hides all warning classes,
# including deprecation and solver-convergence warnings emitted by the plotting
# and modelling libraries used below -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the employee CSV from the Kaggle input directory into a DataFrame
DATA_PATH = "/kaggle/input/employee-future-prediction/Employee.csv"
df = pd.read_csv(DATA_PATH)

## Exploratory Data Analysis

In [None]:
# Preview the top five records of the raw data
df.head(n=5)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Dataset dimensions as a (n_rows, n_columns) tuple
df.shape

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row
df.duplicated(keep="first").sum()

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each
df = df.drop_duplicates(keep="first")

In [None]:
# Re-count duplicated rows to confirm none are left after the drop
df.duplicated(keep="first").sum()

## Data Visualization

In [None]:
# Distribution of the Age column: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement and draws the same kind of figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(x=df["Age"], kde=True, stat="density", ax=ax)
ax.set_title("Age", size=15)
plt.show()

In [None]:
# Number of rows per Education level.
# The column is passed as the keyword argument `x`: newer seaborn releases treat
# the first positional argument as `data`, so a positional Series no longer
# selects the variable to count.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x=df["Education"], ax=ax)
ax.set_title("Education", size=15)
plt.show()

In [None]:
# Histogram of the City column, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(df["City"], ax=ax)
ax.set_title("City", size=15)
plt.show()

In [None]:
# Scatter of PaymentTier (y) against ExperienceInCurrentDomain (x)
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x="ExperienceInCurrentDomain", y="PaymentTier", ax=ax)
ax.set_title("Payment Tier by Experience In Current Domain", size=15)
plt.show()

In [None]:
# Bar chart of Age aggregated per Gender (seaborn's default estimator and error bars)
fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(data=df, x="Gender", y="Age", ax=ax)
ax.set_title("Age by Gender", size=15)
plt.show()

## Handling Categorical Variables

In [None]:
# Replace each listed categorical column with integer codes.
# A separate LabelEncoder is fitted per column and kept in `encoders`, so every
# mapping can still be inspected (`encoders[col].classes_`) or reversed
# (`encoders[col].inverse_transform(...)`) later. Refitting one shared encoder
# would keep only the mapping of the last column.
# NOTE(review): LabelEncoder assigns codes in sorted label order, which imposes an
# ordering on the categories -- confirm that is acceptable for nominal columns.
columns = ["Education", "EverBenched", "Gender", "City"]
encoders = {}
for col in columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

In [None]:
# Show the first rows again to check the encoded categorical columns
df.head()

In [None]:
# Show the last rows of the encoded frame
df.tail()

In [None]:
# Separate the target column (LeaveOrNot) from the predictor columns
y = df["LeaveOrNot"]
X = df.drop(columns="LeaveOrNot")

## Splitting The Data

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Random Forest

In [None]:
# Random-forest baseline.
# random_state seeds the bootstrap sampling and per-split feature selection so
# the reported accuracy is the same on every run; without it the score changes
# between runs even though the train/test split itself is seeded.
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_leaf_nodes=100,
    min_samples_leaf=0.001,   # float -> fraction of the training samples
    min_samples_split=0.01,   # float -> fraction of the training samples
    n_jobs=-1,
    random_state=42,
)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

## Logistic Regression

In [None]:
# Logistic-regression baseline: configure the estimator, fit it on the training
# split, then score its predictions on the held-out split.
log_model = LogisticRegression(max_iter=1000, C=0.8, n_jobs=-1)
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)
accuracy_score(y_test, y_pred)

## KNN

In [None]:
# k-nearest-neighbours baseline.
# KNN classifies by distance, so any column with a larger numeric range would
# dominate the neighbour search; features are therefore standardised first.
# The scaler sits inside the pipeline so it is fitted on the training split only
# and the same transformation is re-applied at predict time.
knn_model = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=4, n_jobs=-1, leaf_size=10),
)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
accuracy_score(y_test, y_pred)

## Support Vector Classifier (SVC)

In [None]:
# Support-vector classifier baseline.
# SVMs are not scale invariant, so the features are standardised inside the
# pipeline (fitted on the training split only) before reaching the classifier.
# NOTE(review): max_iter=1000 is a hard cap on solver iterations; the notebook
# silences all warnings, so an early stop would go unnoticed -- confirm the cap
# (and the very small C) are intended.
svc_model = make_pipeline(
    StandardScaler(),
    SVC(C=0.001, max_iter=1000),
)
svc_model.fit(X_train, y_train)
y_pred = svc_model.predict(X_test)
accuracy_score(y_test, y_pred)