In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.decomposition import TruncatedSVD

# ===========================
# 1) Load Excel data
# ===========================
# The workbook is referenced by a bare file name, so it has to be located in
# the same folder the notebook is run from.
file_path = "Odunc_Guncel_Kategorili_Liste.xlsx"
df = pd.read_excel(file_path)

# Quick sanity checks on the raw frame: size, a preview, missing values,
# fully duplicated rows, and the class balance of the target column.
print(f"Data shape: {df.shape}")

print("\nFirst 5 rows:")
display(df.head(5))

print("\nMissing values per column:", df.isna().sum(), sep="\n")

print(f"\nDuplicated rows: {df.duplicated().sum()}")

print("\nTarget distribution (LC_Kategori):", df["LC_Kategori"].value_counts(), sep="\n")

# ===========================
# 2) Features and target
# ===========================
target_col = "LC_Kategori"

# Remove the student identifier column. errors="ignore" turns this into a
# plain copy when the column is not present, so `df` itself is never modified.
df_model = df.drop(columns=["ÖğrenciKimlik-no"], errors="ignore")

X = df_model.drop(columns=[target_col])
y = df_model[target_col]

# NOTE(review): in the previewed rows the "Kitap-ID" prefix appears to mirror
# the target (e.g. "GY151" -> "Genel Yapıtlar", "H9821" -> "Hukuk"). Confirm
# whether keeping this column is intended or whether it leaks the label.

# Split the columns by dtype. Every numeric dtype counts as numeric (not only
# int64/float64), and every remaining column is treated as categorical, so no
# feature is left out of both lists and silently dropped by the
# ColumnTransformer built from them.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = [col for col in X.columns if col not in numeric_features]

print("\nNumeric features:", numeric_features)
print("Categorical features:", categorical_features)

# ===========================
# 3) Train/Test split
# ===========================
# 80/20 hold-out split with a fixed seed; stratify=y requests a split that is
# stratified on the target labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

# ===========================
# 4) Preprocessing (OneHot + scaling)
# ===========================
# Categorical columns are one-hot encoded (categories not seen during fit are
# ignored at transform time); numeric columns are scaled without centering
# (with_mean=False).
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(with_mean=False), numeric_features),
    ]
)

# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the test split, yielding numeric matrices for the models.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

print(f"\nEncoded X_train shape: {X_train_enc.shape}")

# ===========================
# 5) Helper function for evaluating models
# ===========================
def eval_model(stage, name, model, X_tr, y_tr, X_te, y_te, results_list):
    """Fit ``model`` on the training data, score it on the test data and log the result.

    Parameters
    ----------
    stage : label of the experiment stage, used in the printout and the record.
    name : label of the model, used in the printout and the record.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place by this call.
    X_tr, y_tr : training features and labels passed to ``model.fit``.
    X_te, y_te : test features passed to ``model.predict`` and the labels the
        predictions are scored against.
    results_list : list that receives one dict with the keys ``"Stage"``,
        ``"Model"``, ``"Accuracy"`` and ``"F1_macro"``.

    Returns
    -------
    None. The metrics are printed (with a classification report) and appended
    to ``results_list``.
    """
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)

    # Compute both summary metrics once and keep them in the record that is
    # both printed below and stored for the summary table.
    record = {
        "Stage": stage,
        "Model": name,
        "Accuracy": accuracy_score(y_te, predictions),
        "F1_macro": f1_score(y_te, predictions, average="macro", zero_division=0),
    }

    print(f"\n=== {stage} | {name} ===")
    print("Accuracy:", record["Accuracy"])
    print("F1 macro:", record["F1_macro"])
    print("\nClassification report:\n")
    print(classification_report(y_te, predictions, zero_division=0))

    results_list.append(record)

# Define the simple base models. The same two estimators are evaluated at
# every stage; each eval_model call fits them again on that stage's features.
# `multi_class` is intentionally not passed: the parameter is deprecated since
# scikit-learn 1.5, and with the default lbfgs solver a multi-class target is
# already fitted as a multinomial model.
logreg = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier(n_neighbors=7)
base_models = [("LogisticRegression", logreg), ("KNN (k=7)", knn)]

results = []

# Cap the requested sizes at the width of the encoded matrix so SelectKBest
# and TruncatedSVD are never asked for more columns than exist. The stage
# labels are built from the values actually used.
n_encoded_features = X_train_enc.shape[1]
k_best = min(30, n_encoded_features)
n_svd_components = min(20, n_encoded_features)

# ===========================
# 6) Stage 1: Original features (no FS / no DR)
# ===========================
stage1 = "Original features"

for model_name, model in base_models:
    eval_model(stage1, model_name, model, X_train_enc, y_train, X_test_enc, y_test, results)

# ===========================
# 7) Stage 2: Feature Selection (SelectKBest + mutual_info)
# ===========================
stage2 = f"After Feature Selection (SelectKBest, k={k_best})"

# The selector is fitted on the training split only and then applied to test.
selector = SelectKBest(score_func=mutual_info_classif, k=k_best)
X_train_fs = selector.fit_transform(X_train_enc, y_train)
X_test_fs = selector.transform(X_test_enc)

print("\nAfter FS shapes:", X_train_fs.shape, X_test_fs.shape)

for model_name, model in base_models:
    eval_model(stage2, model_name, model, X_train_fs, y_train, X_test_fs, y_test, results)

# ===========================
# 8) Stage 3: Dimension Reduction (TruncatedSVD)
# ===========================
stage3 = f"After Dimension Reduction (TruncatedSVD, n={n_svd_components})"

# The SVD is applied to the full encoded matrix (not the Stage 2 selection),
# again fitted on the training split only.
svd = TruncatedSVD(n_components=n_svd_components, random_state=42)
X_train_dr = svd.fit_transform(X_train_enc)
X_test_dr = svd.transform(X_test_enc)

print("\nAfter DR shapes:", X_train_dr.shape, X_test_dr.shape)

for model_name, model in base_models:
    eval_model(stage3, model_name, model, X_train_dr, y_train, X_test_dr, y_test, results)

# ===========================
# 9) Summary table of all experiments
# ===========================
results_df = pd.DataFrame(results)
print("\n\n===== Summary Table (Accuracy & F1) =====")
display(results_df)


Data shape: (13436, 6)

First 5 rows:


Unnamed: 0,ÖğrenciKimlik-no,Cinsiyet,Fakülte,Bölüm,Kitap-ID,LC_Kategori
0,509005326,Kadın,Turizm Fakültesi,Gastronomi ve Mutfak Sanatları,GY151,Genel Yapıtlar
1,421475486,Kadın,Mühendislik ve Doğa Bilimleri Fakültesi,Biyomühendislik,B3004,Bilim
2,660301107,Erkek,Sağlık Bilimleri Fakültesi,Odyoloji,Tip4081,Tıp
3,534575034,Kadın,Edebiyat Fakültesi,Sosyoloji Bölümü,DE2527,Dil ve Edebiyat
4,331718324,Kadın,Hukuk Fakültesi,Hukuk Bölümü,H9821,Hukuk



Missing values per column:
ÖğrenciKimlik-no    0
Cinsiyet            0
Fakülte             0
Bölüm               0
Kitap-ID            0
LC_Kategori         0
dtype: int64

Duplicated rows: 0

Target distribution (LC_Kategori):
LC_Kategori
Dil ve Edebiyat             3274
Tıp                         2703
Hukuk                       1016
Siyaset Bilimi              1001
Felsefe, Psikoloji, Din      965
Bilim                        957
Güzel Sanatlar               952
Eğitim                       913
Sosyal Bilimler              904
Müzik                         79
Askerlik                      74
Kaynakçalar                   72
Tarih                         71
Genel Yapıtlar                69
Denizcilik                    69
Tarihe Yardımcı Bilimler      67
Bilinmeyen                    66
Coğrafya                      64
Tarım                         63
Teknoloji                     57
Name: count, dtype: int64

Numeric features: []
Categorical features: ['Cinsiyet', 'Fakülte', 'Bölüm', 'Kitap-ID']

Unnamed: 0,Stage,Model,Accuracy,F1_macro
0,Original features,LogisticRegression,0.730655,0.345195
1,Original features,KNN (k=7),0.703497,0.334201
2,"After Feature Selection (SelectKBest, k=30)",LogisticRegression,0.730655,0.345195
3,"After Feature Selection (SelectKBest, k=30)",KNN (k=7),0.730655,0.345195
4,"After Dimension Reduction (TruncatedSVD, n=20)",LogisticRegression,0.730655,0.345195
5,"After Dimension Reduction (TruncatedSVD, n=20)",KNN (k=7),0.693452,0.33132
