In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:

df = pd.read_csv("jobs_dataset.csv")

# 3️⃣ Explore dataset
print("✅ Dataset Loaded Successfully!")
print("Shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nMissing values:\n", df.isnull().sum())


✅ Dataset Loaded Successfully!
Shape: (735, 14)

Columns: ['company', 'rating', 'location', 'positionName', 'description', 'salary', 'url', 'jobType/0', 'jobType/1', 'jobType/2', 'jobType/3', 'searchInput/country', 'searchInput/position', 'externalApplyLink']

Missing values:
 company                   0
rating                    0
location                  0
positionName              0
description               0
salary                  229
url                       0
jobType/0               234
jobType/1               716
jobType/2               734
jobType/3               734
searchInput/country       0
searchInput/position      0
externalApplyLink       182
dtype: int64


In [15]:
target_col = df.columns[-1]  # آخر عمود

print("\nSelected target column:", target_col)

# 5️⃣ تقسيم البيانات إلى Features و Target
X = df.drop(columns=[target_col])
y = df[target_col]


Selected target column: externalApplyLink


In [16]:
# 4️⃣ اختار العمود الهدف (اللي عايز تتنبأ به)
# غيّر الاسم حسب الملف بتاعك
target_col = df.columns[-1]  # آخر عمود
# مثال لو عندك عمود اسمه "salary" استخدم:  target_col = "salary"

print("\nSelected target column:", target_col)

# 5️⃣ تقسيم البيانات إلى Features و Target
X = df.drop(columns=[target_col])
y = df[target_col]

# لو الهدف عددي (مثلاً الراتب) نحوله لتصنيف (Low / Medium / High)
if y.dtype.kind in 'ifu' and y.nunique() > 20:
    print("\nTarget is numeric → converting to 3 categories (low, mid, high)")
    y = pd.qcut(y, q=3, labels=["Low", "Mid", "High"], duplicates='drop')

# حذف الصفوف اللي فيها قيم مكررة
df = df.drop_duplicates()

# 6️⃣ تحديد أنواع الأعمدة
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print("\nNumeric columns:", numeric_cols)
print("Categorical columns:", cat_cols)



Selected target column: externalApplyLink

Numeric columns: ['rating']
Categorical columns: ['company', 'location', 'positionName', 'description', 'salary', 'url', 'jobType/0', 'jobType/1', 'jobType/2', 'jobType/3', 'searchInput/country', 'searchInput/position']


In [17]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols)
])

if y.dtype == object or str(y.dtype).startswith('category'):
    le = LabelEncoder()
    y = le.fit_transform(y.astype(str))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y if len(np.unique(y)) > 1 else None
)

X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

print("\n✅ Preprocessing completed successfully!")
print("Training features shape:", X_train_prep.shape)


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train_prep, y_train)
    preds = model.predict(X_test_prep)
    
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds, average='weighted', zero_division=0)
    rec = recall_score(y_test, preds, average='weighted', zero_division=0)
    f1 = f1_score(y_test, preds, average='weighted', zero_division=0)
    
    results[name] = {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    }
    
    print(f"\n📊 {name}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")

results_df = pd.DataFrame(results).T
print("\n===== Model Performance Summary =====")
print(results_df)

results_df.plot(kind='bar', figsize=(10, 5))
plt.title("Model Performance Comparison")
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.grid(True, axis='y')
plt.show()

best_model = results_df['F1'].idxmax()
print(f"\n🏆 Best Model: {best_model}")

best_clf = models[best_model]
y_pred_best = best_clf.predict(X_test_prep)

cm = confusion_matrix(y_test, y_pred_best)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f"Confusion Matrix - {best_model}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
