## Let's load the dataset and view it

In [None]:
import pandas as pd

# Read the raw training set; the path is relative to the notebook's working directory.
DATA_PATH = "aug_train.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows to sanity-check the parsed columns.
df.head()

## Let's check for missing values and inspect the DataFrame info

In [None]:
# Column dtypes, non-null counts and memory usage (printed by pandas itself).
df.info()

# Number of missing entries per column; as the cell's last expression it is
# rendered as the cell output.
df.isna().sum()


In [None]:
# Clean and fill missing data.
#
# 'experience' is handled as text with two open-ended buckets. Map them to numeric
# stand-ins ('>20' -> 21, '<1' -> 0.5) so the whole column can be cast to float.
experience_numeric = (
    df['experience']
    .replace('>20', '21')
    .replace('<1', '0.5')
    .astype(float)
)
# Assign the filled Series back instead of calling fillna(inplace=True) on
# df['experience']: the in-place form operates on a column selected from df,
# which pandas flags as chained assignment and which does not update df once
# Copy-on-Write is active.
df['experience'] = experience_numeric.fillna(experience_numeric.median())

# Fill categorical nulls with an explicit 'Unknown' level so that missingness is
# kept as its own category rather than dropped.
# NOTE(review): 'gender' and 'enrolled_university' are label-encoded later in this
# notebook but are not filled here - confirm whether they can contain nulls.
cat_cols = ['education_level', 'major_discipline', 'company_type', 'last_new_job', 'company_size']
df[cat_cols] = df[cat_cols].fillna('Unknown')


In [None]:
# visualize missing data
import seaborn as sns
import matplotlib.pyplot as plt


# One heatmap row per DataFrame column, one cell per record; highlighted cells
# are nulls. This cell comes after the fill step, so it shows whatever is still
# missing at this point.
fig, ax = plt.subplots(figsize=(10, 6))
missing_mask = df.isnull().T
sns.heatmap(missing_mask, cbar=False, cmap="Reds", ax=ax)
ax.set_title("Missing Values")
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns to convert to integer codes.
label_cols = ['gender', 'relevent_experience', 'enrolled_university',
              'education_level', 'major_discipline', 'company_type',
              'last_new_job', 'company_size', 'city']

# Fit a separate encoder per column and keep each one, so the category <-> code
# mapping can be recovered later (inverse_transform) or re-applied to new data.
# Re-fitting a single shared encoder would retain only the last column's mapping.
label_encoders = {}
for col in label_cols:
    # `le` is still bound to the last column's encoder after the loop.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the two numeric features.
#
# The scaler's min/max are learned from the training rows only, so the held-out
# rows do not leak into preprocessing. train_test_split is deterministic for a
# given row count, test_size and random_state, so calling it here with the same
# arguments as the train/test split cell (test_size=0.2, random_state=42) picks
# the same training rows. Keep the two calls in sync.
from sklearn.model_selection import train_test_split

num_cols = ['training_hours', 'experience']
train_rows, _held_out_rows = train_test_split(df[num_cols], test_size=0.2, random_state=42)

scaler = MinMaxScaler().fit(train_rows)
# Held-out values outside the training min/max will fall outside [0, 1];
# that is expected when the scaler is fit on the training rows only.
df[num_cols] = scaler.transform(df[num_cols])


In [None]:
from sklearn.model_selection import train_test_split

# Separate features and target.
#
# A row identifier carries no predictive signal, and as an unscaled integer it
# would dominate the distance computation in KNN, so it is excluded from X.
# NOTE(review): 'enrollee_id' is assumed to be the identifier column of
# aug_train.csv - confirm against the CSV header. errors='ignore' makes the drop
# a no-op if the column is absent.
id_cols = ['enrollee_id']
X = df.drop(columns=['target']).drop(columns=id_cols, errors='ignore')
y = df['target']

# Train/test split: 80/20 hold-out with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train.
# random_state is fixed because the tree's split search is randomized (features
# are permuted at each split); without a seed the metrics below can change
# between runs on identical data.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the held-out split.
y_pred_dt = dt_model.predict(X_test)

# Evaluate: per-class precision/recall/F1, then the raw confusion matrix.
print("Decision Tree Results:\n")
print(classification_report(y_test, y_pred_dt))
print(confusion_matrix(y_test, y_pred_dt))


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the training split
# (fit() returns the estimator itself, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict on the held-out split.
y_pred_knn = knn.predict(X_test)

# Per-class precision/recall/F1, then the raw confusion matrix.
knn_report = classification_report(y_test, y_pred_knn)
knn_matrix = confusion_matrix(y_test, y_pred_knn)
print("KNN Results:\n")
print(knn_report)
print(knn_matrix)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the iteration cap raised to 1000.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict on the held-out split.
y_pred_lr = lr.predict(X_test)

# Header, per-class report and confusion matrix, each on its own line.
print(
    "Logistic Regression Results:\n",
    classification_report(y_test, y_pred_lr),
    confusion_matrix(y_test, y_pred_lr),
    sep="\n",
)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Metric name -> scoring function; the insertion order fixes the column order
# of the comparison table.
_METRIC_FUNCS = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

# Column-oriented accumulator: one list per table column, one entry per model.
results = {"Model": [], **{metric_name: [] for metric_name in _METRIC_FUNCS}}


def evaluate_model(name, y_true, y_pred):
    """Append one model's scores to the module-level ``results`` table.

    Args:
        name: Label stored in the "Model" column.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, passed to each metric together with ``y_true``.

    Returns:
        None. ``results`` is extended in place.
    """
    results["Model"].append(name)
    for metric_name, metric_fn in _METRIC_FUNCS.items():
        results[metric_name].append(metric_fn(y_true, y_pred))


# Score every fitted model against the same held-out labels.
for model_name, model_pred in (
    ("Decision Tree", y_pred_dt),
    ("KNN", y_pred_knn),
    ("Logistic Regression", y_pred_lr),
):
    evaluate_model(model_name, y_test, model_pred)

results_df = pd.DataFrame(results)
results_df
# Optional bar-chart comparison (currently disabled):
# results_df.set_index("Model").plot(kind="bar", figsize=(10, 6), ylim=(0, 1), title="Model Comparison")
# plt.ylabel("Score")
# plt.xticks(rotation=0)
# plt.show()



In [None]:
# Box plot of training_hours as a visual outlier check. Earlier in this
# notebook the column is min-max scaled, so the axis shows scaled values.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=df['training_hours'], ax=ax)
ax.set_title("Training Hours - Aykırı Değer Kontrolü")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix for the decision tree, computed on the held-out split.
cm_display = ConfusionMatrixDisplay.from_estimator(dt_model, X_test, y_test)
# Set the title on the axes the display drew on.
cm_display.ax_.set_title("Confusion Matrix - Decision Tree")
plt.show()
