In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, roc_auc_score

In [6]:
df = pd.read_csv("C:/Users/jwaaa/Downloads/KFUPM-Grads-2010-2018/KFUPM_grads_2010_2018.csv", encoding="utf-8")
df.head()
df

Unnamed: 0,degree,major,honour,id_year,graduated
0,بكالوريوس,الهندسة الميكانيكية التطبيقية,2,2014,2018
1,بكالوريوس,الهندسة المدنية التطبيقية,-1,2013,2018
2,بكالوريوس,العمارة,-1,2013,2018
3,بكالوريوس,الهندسة الكهربائية,2,2014,2018
4,بكالوريوس,الهندسة المعمارية,2,2014,2018
...,...,...,...,...,...
12617,بكالوريوس,المالية,3,2005,2010
12618,بكالوريوس,الهندسة المدنية,-1,2005,2010
12619,بكالوريوس,الهندسة الميكانيكية التطبيقية,-1,2005,2010
12620,ماجستير,الهندسة الكهربائية,-1,1998,2010


In [20]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12622 entries, 0 to 12621
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   degree     12622 non-null  object
 1   major      12622 non-null  object
 2   honour     12622 non-null  int64 
 3   id_year    12622 non-null  int64 
 4   graduated  12622 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 493.2+ KB


Unnamed: 0,honour,id_year,graduated
count,12622.0,12622.0,12622.0
mean,-0.036365,2008.862225,2014.070987
std,1.540523,3.740825,2.471298
min,-1.0,1973.0,2010.0
25%,-1.0,2007.0,2012.0
50%,-1.0,2009.0,2014.0
75%,2.0,2011.0,2016.0
max,3.0,2019.0,2018.0


In [13]:
# years to graduate
df["years_to_graduate"] = df["graduated"] - df["id_year"]
print(df["years_to_graduate"])

0         4
1         5
2         5
3         4
4         4
         ..
12617     5
12618     5
12619     5
12620    12
12621     5
Name: years_to_graduate, Length: 12622, dtype: int64


In [23]:
HONOR_LABELS = {
    -1: "No honor",
     1: "First honor",
     2: "Second honor",
     3: "Third honor"
}

GPA_RULES = [
    (3.75, "First honor"),
    (3.50, "Second honor"),
    (3.25, "Third honor"),
    (0.00, "No honor"),
]

avg_years_tbl = (
    df.groupby(["degree", "major"], as_index=False)["years_to_graduate"]
      .mean()
      .rename(columns={"years_to_graduate": "avg_years_to_graduate"})
)

honor_dist_tbl = (
    df.groupby(["degree", "major"])["honour"]
      .value_counts(normalize=True) 
        .rename("prop")
        .reset_index()
)
   
counts_tbl = (
    df.groupby(["degree", "major"], as_index=False)
      .size()
      .rename(columns={"size": "n_students"})
) 

summary_tbl = avg_years_tbl.merge(counts_tbl, on=["degree", "major"], how="left")


In [24]:
def gpa_to_expected_honor(gpa: float) -> str:
    for cutoff, label in GPA_RULES:
        if gpa >= cutoff:
            return label
    return "No honor"

def get_degree_options():
    return sorted(df["degree"].unique().tolist())

def get_major_options_for_degree(degree: str):
    return sorted(df.loc[df["degree"] == degree, "major"].unique().tolist())

def get_group_stats(degree: str, major: str):
    # avg years + sample size
    row = summary_tbl[(summary_tbl["degree"] == degree) & (summary_tbl["major"] == major)]
    if row.empty:
        return None

    avg_years = float(row.iloc[0]["avg_years_to_graduate"])
    n_students = int(row.iloc[0]["n_students"])

    # honor distribution
    dist = honor_dist_tbl[(honor_dist_tbl["degree"] == degree) & (honor_dist_tbl["major"] == major)]
    probs = {HONOR_LABELS[int(h)]: float(p) for h, p in zip(dist["honour"], dist["prob"])}

    # ensure all labels exist
    for k in HONOR_LABELS.values():
        probs.setdefault(k, 0.0)

    # sort by probability desc
    probs = dict(sorted(probs.items(), key=lambda x: x[1], reverse=True))

    return {
        "avg_years_to_graduate": avg_years,
        "n_students": n_students,
        "honor_probabilities": probs
    }


# ========= 5) Interactive CLI =========
def prompt_choice(prompt: str, options: list[str]) -> str:
    print("\n" + prompt)
    for i, opt in enumerate(options, start=1):
        print(f"{i}. {opt}")
    while True:
        try:
            idx = int(input("Choose a number: "))
            if 1 <= idx <= len(options):
                return options[idx - 1]
        except ValueError:
            pass
        print("Invalid choice. Try again.")

def run_app():
    print("=== Student Benchmark & Outcomes Explorer ===")

    degrees = get_degree_options()
    if not degrees:
        print("No degrees found in dataset.")
        return

    degree = prompt_choice("Select your degree:", degrees)

    majors = get_major_options_for_degree(degree)
    if not majors:
        print("No majors found for that degree.")
        return

    major = prompt_choice("Select your major:", majors)

    while True:
        try:
            gpa = float(input("\nEnter your GPA (e.g., 3.42): "))
            if 0.0 <= gpa <= 4.0:  # adjust scale if needed
                break
            print("GPA must be between 0.0 and 4.0 (adjust if your scale differs).")
        except ValueError:
            print("Please enter a valid number.")

    stats = get_group_stats(degree, major)
    if stats is None:
        print("\nNo matching records found for that degree+major.")
        return

    expected_honor_from_gpa = gpa_to_expected_honor(gpa)

    print("\n=== Results ===")
    print(f"Degree: {degree}")
    print(f"Major:  {major}")
    print(f"Your GPA: {gpa:.2f}")
    print(f"\nEstimated (rule-based) honor expectation from GPA: {expected_honor_from_gpa}")

    print(f"\nHistorical benchmark for {degree} / {major}:")
    print(f"- Sample size: {stats['n_students']}")
    print(f"- Average years to graduate: {stats['avg_years_to_graduate']:.2f}")

    print("\nHonor outcome distribution (historical):")
    for label, p in stats["honor_probabilities"].items():
        print(f"- {label}: {p:.3f}")

    top_label = next(iter(stats["honor_probabilities"]))
    print(f"\nMost common historical outcome in this group: {top_label}")

if __name__ == "__main__":
    run_app()

=== Student Benchmark & Outcomes Explorer ===

Select your degree:
1. بكالوريوس
2. دكتوراه
3. ماجستير

Select your major:
1. الادارة
2. التسويق
3. الجيوفيزياء
4. الجيولوجيا
5. العلوم الاكتوارية والرياضيات المالية
6. العلوم الرياضية
7. العمارة
8. الفيزياء
9. الكيمياء
10. الكيمياء الصناعية
11. المالية
12. المحاسبة
13. الهندسة الصناعية والنظم
14. الهندسة الكهربائية
15. الهندسة الكهربائية التطبيقية
16. الهندسة الكيميائية
17. الهندسة الكيميائية التطبيقية
18. الهندسة المدنية
19. الهندسة المدنية التطبيقية
20. الهندسة المعمارية
21. الهندسة الميكانيكية
22. الهندسة الميكانيكية التطبيقية
23. تخطيط المدن
24. علوم الحاسب الآلي
25. نظم المعلومات الإدارية
26. هندسة البترول
27. هندسة البرمجيات
28. هندسة الحاسب الآلي
29. هندسة الطيران والفضاء
30. هندسة الطيران والفضاء التطبيقية
31. هندسة نظم التحكم والقياس


KeyError: 'prob'

Log loss: 0.9893196805955978
              precision    recall  f1-score   support

          -1       0.91      0.64      0.75      1778
           1       0.13      0.67      0.21        93
           2       0.28      0.15      0.20       338
           3       0.16      0.31      0.21       316

    accuracy                           0.53      2525
   macro avg       0.37      0.44      0.34      2525
weighted avg       0.71      0.53      0.59      2525



{'most_likely': 'No honor',
 'prob_any_honour': 0.056,
 'full_distribution': {'No honor': 0.944,
  'First honor': 0.011,
  'Second honor': 0.022,
  'Third honor': 0.023}}