In [None]:
# =============================================================================
# # OPTIMUM TRAINING PROCESS MODEL
# =============================================================================

# IMPORT THE LIBRARIES REQUIRED FOR THE MODEL

import os
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBRegressor

In [None]:
# =============================================================================
# # STEP 1: IDENTIFICATION OF MISSING DATA
# =============================================================================

# Get Data
#
# The three CSV exports are read from DATA_DIR. It defaults to the folder the
# notebook was originally written against and can be redirected, without
# editing the notebook, by setting the HR_DATA_DIR environment variable before
# the kernel starts.
DATA_DIR = Path(os.environ.get("HR_DATA_DIR", r"C:\Users\erdal\Desktop"))

train_data = pd.read_csv(DATA_DIR / "training_and_development_data.csv")
employee_data = pd.read_csv(DATA_DIR / "employee_data.csv")
employee_data2 = pd.read_csv(DATA_DIR / "employee_engagement_survey_data.csv")

# Put the three tables side by side. With axis=1 the rows are matched on the
# index (the default positional index created by read_csv), not on an
# employee key.
# NOTE(review): this assumes all three files list the same employees in the
# same row order -- confirm, otherwise rows from different people are combined.
df = pd.concat([employee_data, employee_data2, train_data], axis=1)

In [None]:
# Check Data

# Only the last expression of a cell is echoed, so the shapes are printed
# explicitly; as bare expressions in the middle of the cell they were discarded.
print("shape before sampling:", df.shape)

# Draw 25000 rows with replacement (seeded, so the draw is repeatable on a
# fresh kernel).
# NOTE(review): replace=True means the same source row can appear several
# times; the later train_test_split can then put copies of one row in both
# the training and the test set, which inflates test scores. Re-running this
# cell also re-samples the already sampled frame.
df = df.sample(n=25000, replace=True, random_state=42)

print("shape after sampling:", df.shape)

# Column dtypes and non-null counts (info() prints by itself).
df.info()

# Number of missing values per column; shown as the cell result.
df.isnull().sum()

In [None]:
# Delete Unnecessary Data
#
# Columns that are not used as model inputs are removed in three successive
# batches; each batch is dropped from `df` in place, in the order listed.
_columns_to_drop = (
    ['EmpID', 'FirstName', 'LastName', "StartDate", "ExitDate", "ADEmail",
     "PayZone", "TerminationType", "TerminationDescription"],
    ['Employee ID', 'Survey Date', 'DOB', "State", "JobFunctionDescription",
     "RaceDesc", "MaritalDesc", "Engagement Score"],
    ["Title", 'Supervisor', 'EmployeeClassificationType', 'Division',
     'BusinessUnit', "Location", "GenderCode", "Current Employee Rating",
     "Performance Score", "Training Date", "Training Cost"],
)

for _batch in _columns_to_drop:
    df.drop(columns=_batch, inplace=True)

# Show the remaining columns and their dtypes.
df.info()

In [None]:
# Target Analysis

# Frequency of each duration value plus the observed range. These are printed
# because bare expressions in the middle of a cell produce no output.
duration = df['Training Duration(Days)']
print(duration.value_counts())
print("max:", duration.max())
print("min:", duration.min())

# Optional: collapse the target into two equal-width bins (use this if binary
# classification is wanted instead).
# df['Training Duration(Days)'] = pd.cut(df['Training Duration(Days)'], bins=2, labels=[0,1])

sns.countplot(x='Training Duration(Days)', data=df)
plt.show()

In [None]:
# Exploratory Data Analysis


def _count_plot(data, column, title, xlabel=None, hue=None, palette='muted',
                figsize=(8, 6), rotate_ticks=False):
    """Draw one seaborn count plot on a new figure and show it.

    Parameters
    ----------
    data : DataFrame holding the column(s) to plot.
    column : name of the column whose values are counted along the x axis.
    title : figure title.
    xlabel : x-axis label; the column name is used when omitted.
    hue : optional column name used to split every bar by category.
    palette : seaborn palette name passed to ``sns.countplot``.
    figsize : (width, height) of the figure in inches.
    rotate_ticks : rotate the x tick labels by 45 degrees when True.
    """
    plt.figure(figsize=figsize)
    sns.countplot(x=column, data=data, hue=hue, palette=palette)
    plt.title(title)
    plt.xlabel(column if xlabel is None else xlabel)
    plt.ylabel('Count')
    if rotate_ticks:
        plt.xticks(rotation=45)
    plt.show()


# One distribution plot per categorical / score column:
# (column, title, x-axis label, rotate tick labels)
_DISTRIBUTION_PLOTS = [
    ('EmployeeStatus', 'Distribution of Employee Status', 'Employee Status', True),
    ('EmployeeType', 'Distribution of Employee Type', 'Employee Type', True),
    ('DepartmentType', 'Distribution of Department Type', 'DepartmentType', True),
    ('Satisfaction Score', 'Distribution of Satisfaction Score', 'Satisfaction Score', False),
    ('Work-Life Balance Score', 'Distribution of Work-Life Balance Score', 'Work-Life Balance Score', False),
    ('Training Program Name', 'Distribution of Training Program Name', 'Training Program Name', True),
    ('Training Type', 'Distribution of Training Type', 'Training Type', True),
    ('Training Outcome', 'Distribution of Training Outcome', 'Training Outcome', True),
]

for _column, _title, _xlabel, _rotate in _DISTRIBUTION_PLOTS:
    _count_plot(df, _column, _title, xlabel=_xlabel, rotate_ticks=_rotate)

# These two columns are summarised as tables rather than plots. They are
# printed because a bare expression in the middle of a cell shows nothing.
for _column in ("LocationCode", "Trainer"):
    print(df[_column].value_counts())

# Grouped comparisons: counts of the first column, split by the second.
_GROUPED_PLOTS = [
    ('Training Outcome', 'DepartmentType'),           # outcome, split by department
    ('DepartmentType', 'Training Outcome'),           # department, split by outcome
    ('Training Duration(Days)', 'EmployeeType'),      # duration, split by employee type
    ('Training Duration(Days)', 'Training Type'),     # duration, split by training type
]

for _column, _hue in _GROUPED_PLOTS:
    _count_plot(df, _column, '{0} by {1}'.format(_column, _hue),
                hue=_hue, palette='dark', figsize=(15, 5))

In [None]:
# Finding The Missing Data

# Visualise the missing values of every column with a missingno bar chart.
msno.bar(df, figsize=(10, 8), color='y')

# Count of null entries per column; shown as the cell result.
df.isnull().sum()

In [None]:

# =============================================================================
# # STEP 2: CONVERSION OF CATEGORICAL DATA TO NUMERIC
# =============================================================================

num_col = df.select_dtypes(include=['int', 'float']).columns
obj_col = df.select_dtypes(include=['object']).columns

# Fit a separate LabelEncoder for every object column and keep it, so the
# integer codes can be mapped back to the original categories later through
# label_encoders[column].inverse_transform(...). Refitting one shared encoder
# would overwrite the mapping of every column but the last.
label_encoders = {}
transformed = LabelEncoder()  # stays defined even when there is no object column

for col in obj_col:
    transformed = LabelEncoder()
    df[col] = transformed.fit_transform(df[col])
    label_encoders[col] = transformed
  

In [None]:
# Look at the pairwise relationships between the variables.

correlation_matrix = df.corr()

# Annotated heatmap of the correlation coefficients, two decimals per cell.
_fig, _ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=_ax,
)
_ax.set_title('Corelation Matrix')
plt.show()

In [None]:
# =============================================================================
# # STEP 3: FINDING OUTLIER VALUES
# =============================================================================

lof = LocalOutlierFactor()
# fit_predict labels every row: -1 for an outlier, 1 for an inlier.
outliers = lof.fit_predict(df)

# Report the number and share of flagged rows instead of eyeballing a slice of
# raw labels (the previous slice [1:1001] also skipped the first row).
n_outliers = int((outliers == -1).sum())
print("outliers flagged by LOF: {0} of {1} rows ({2:.2%})".format(
    n_outliers, len(outliers), n_outliers / len(outliers)))

# Original conclusion: there are not many outliers, so the analysis continues
# without removing any rows. Check the share printed above before relying on it.

In [None]:
# =============================================================================
# # STEP 4: IRREGULAR DATA DETECTION
# =============================================================================

# Separate the target column from the feature columns.
_target_column = 'Training Duration(Days)'
y = df[_target_column]
X = df.drop(columns=[_target_column])

# Oversample with SMOTE (seeded), then show how many rows each target value has.
# NOTE(review): the resampling is applied to the whole data set, before the
# train/test split further down, so the test set also contains resampled rows.
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

_class_counts = pd.crosstab(y, columns='count')
print("Distrubution of classes:\n", _class_counts)

In [None]:
# =============================================================================
# # STEP 5: PCA
# =============================================================================

# Standardise the features, then project them onto two principal components.
scaler = StandardScaler()
scaler.fit(X)
X_scl = scaler.transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scl)

# (n_samples, 2) is shown as the cell result.
X_pca.shape

In [None]:
# =============================================================================
# # STEP 6: TRAIN-TEST SPLIT
# =============================================================================

# Hold out 20% of the PCA-projected rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# =============================================================================
# # STEP 7: LOGISTIC REGRESSION
# =============================================================================

# Logistic regression with default settings, fitted on the training split.
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

In [None]:
# =============================================================================
# # MODEL EVALUATION
# =============================================================================

# Accuracy on both splits, followed by the per-class report for the test split.

y_pred = log_model.predict(X_test)

train_accuracy = accuracy_score(y_train, log_model.predict(X_train))
test_accuracy = accuracy_score(y_test, y_pred)

print("train_accuracy: ", train_accuracy)
print("test_accuracy: ", test_accuracy)

_report = classification_report(y_test, y_pred)
print(_report)

In [None]:
# ROC CURVE
#
# Everything is drawn on one explicit Axes. from_estimator / from_predictions
# already draw the curve, so calling .plot() on the returned display again
# would open a second figure.

fig, ax = plt.subplots()

if len(log_model.classes_) == 2:
    # Binary target: scikit-learn can build the curve straight from the model.
    roc_display = RocCurveDisplay.from_estimator(log_model, X_test, y_test, ax=ax)
else:
    # from_estimator only accepts binary classifiers, so for a multiclass
    # target draw one "this class vs the rest" curve per class, using the
    # predicted probability of that class as the score.
    class_scores = log_model.predict_proba(X_test)
    y_test_labels = np.asarray(y_test)
    for position, label in enumerate(log_model.classes_):
        roc_display = RocCurveDisplay.from_predictions(
            (y_test_labels == label).astype(int),
            class_scores[:, position],
            name='class {0} vs rest'.format(label),
            ax=ax,
        )

# Diagonal = performance of a random classifier.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_title('ROC Curve')
plt.show()

In [None]:
# AUC
#
# ROC AUC has to be computed from scores/probabilities: hard labels from
# predict() reduce the curve to a single operating point, and roc_auc_score
# rejects them outright for a multiclass target.

y_proba = log_model.predict_proba(X_test)

if y_proba.shape[1] == 2:
    # Binary target: score = probability of the positive (second) class.
    roc_auc = roc_auc_score(y_test, y_proba[:, 1])
else:
    # Multiclass target: one-vs-rest AUC, averaged over the classes.
    roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr',
                            labels=log_model.classes_)

# Kept because the confusion-matrix cell reads y_pred.
y_pred = log_model.predict(X_test)

print("ROC AUC Score:", roc_auc)

In [None]:
# CONFUSION MATRIX

def plot_confusion_matrix(y_true, y_pred):
    """Plot the confusion matrix of a set of predictions as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like of the true class labels.
    y_pred : array-like of the predicted class labels, same length as y_true.

    The accuracy (rounded to two decimals) is shown in the title. Rows are the
    true labels and columns the predicted labels; both axes are ticked with
    the actual class labels rather than their 0..k-1 positions.
    """
    # Sorted union of the labels that occur in either array. This is the same
    # ordering confusion_matrix uses by default, so the matrix is unchanged and
    # the tick labels line up with its rows and columns.
    labels = np.unique(np.concatenate([np.asarray(y_true).ravel(),
                                       np.asarray(y_pred).ravel()]))
    acc = round(accuracy_score(y_true, y_pred), 2)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    sns.heatmap(cm, annot=True, fmt=".0f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix (Accuracy Score: {0})'.format(acc), size=10)
    plt.show()

plot_confusion_matrix(y_test, y_pred)

In [None]:
# =============================================================================
# # STEP 8: MULTIPLE LINEAR REGRESSION
# =============================================================================

# Ordinary least-squares regression with default settings on the training split.
reg_model = LinearRegression()
reg_model.fit(X=X_train, y=y_train)


In [None]:
# =============================================================================
# # MODEL EVALUATION
# =============================================================================

# R^2 score
# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy; the variable names are kept for the cells
# that may read them.

y_pred = reg_model.predict(X_test)

train_accuracy = reg_model.score(X_train, y_train)
test_accuracy = reg_model.score(X_test, y_test)

print("train_accuracy: ", train_accuracy)
print("test_accuracy: ", test_accuracy)

# RMSE on the test split. The predictions were made for X_test, so they must
# be compared with y_test; comparing them with the full y fails because the
# two arrays have different lengths.
np.sqrt(mean_squared_error(y_test, y_pred))


In [None]:
# =============================================================================
# # STEP 9: SVM
# =============================================================================

# Support vector classifier with a linear kernel, fitted on the training split.
svm_model = SVC(kernel='linear')
svm_model.fit(X=X_train, y=y_train)

# Share of correctly predicted test rows.
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("accuracy: ", accuracy)


In [None]:
# =============================================================================
# # STEP 10: SVM CLASSIFICATION
# =============================================================================

# Linear-kernel support vector classifier with a fixed random_state.
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

# Recompute the accuracy for THIS model. Without this line the cell prints the
# value left over from the previous SVM cell, or fails on a fresh kernel if
# that cell was not run.
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: ", accuracy)

In [None]:
# =============================================================================
# # STEP 11: XG-BOOST
# =============================================================================

xg_model = XGBRegressor()
xg_model.fit(X_train, y_train, verbose=False)

y_pred = xg_model.predict(X_test)

# Evaluate the model. Arguments are passed as (y_true, y_pred); MAE is
# symmetric, so the value is the same, but the order now matches the API.
print("MAE: " + str(mean_absolute_error(y_test, y_pred)))

# score() is called on a regressor here, so these are R^2 values rather than
# classification accuracies; the variable names are kept for compatibility.
train_accuracy = xg_model.score(X_train, y_train)
test_accuracy = xg_model.score(X_test, y_test)

print("train_accuracy: ", train_accuracy)
print("test_accuracy: ", test_accuracy)

# RMSE on the test split. y_pred holds predictions for X_test only, so it is
# compared with y_test; using the full y fails because the lengths differ.
np.sqrt(mean_squared_error(y_test, y_pred))


In [None]:
# =============================================================================
# # STEP 12: ML MODELS PERFORMANCE COMPARISON
# =============================================================================