<a href="https://colab.research.google.com/github/HazemmoAlsady/AWN_Graduation_Project/blob/main/Hazem's%20edits/Final_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset is read from, and the
# trained models are written to, paths under this mount point.
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os
import joblib


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the cleaned dataset from Drive and keep only the rows whose need level
# is known, renumbering the index from 0.
df = (
    pd.read_excel('/content/drive/MyDrive/cleaned_awn_data.xlsx')
    .loc[lambda frame: frame["need_level"] != "Unknown"]
    .reset_index(drop=True)
)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Columns used as predictors of the need level.
need_features = [
    "family_size", "income_monthly", "monthly_expenses", "debts",
    "number_of_children", "age", "expense_to_income_ratio",
    "case_type", "housing_type", "health_status", "city", "gender",
]

X = df.loc[:, need_features]
y = df["need_level"]

# 80/20 hold-out split, stratified on the target so both parts keep the same
# class proportions; the fixed seed makes the split repeatable.
_need_split = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **_need_split)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Object-dtype columns are handled as categoricals, every other column as numeric.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

# Numeric branch: fill gaps with the column median.
need_numeric_imputer = SimpleImputer(strategy="median")

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are encoded as all zeros.
need_categorical_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

need_preprocessor = ColumnTransformer([
    ("num", need_numeric_imputer, num_cols),
    ("cat", need_categorical_steps, cat_cols),
])


In [None]:
# Random forest for the need level; class_weight="balanced" reweights classes
# inversely to their frequency in the training labels.
need_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

# Preprocessing and classifier chained so both are fitted and applied together.
need_level_model = Pipeline(steps=[
    ("preprocess", need_preprocessor),
    ("model", need_forest),
])


In [None]:
# Learn the label -> integer mapping from the training labels only, then
# encode the training target with it.
need_level_encoder = LabelEncoder().fit(y_train)
y_train_enc = need_level_encoder.transform(y_train)


In [None]:
# Fit the preprocessing + random-forest pipeline on the training split, using
# the integer-encoded need-level labels as the target.
need_level_model.fit(X_train, y_train_enc)


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt

# Predict on the held-out split, then decode the integer predictions back to
# the original label strings so they can be compared with y_test directly.
encoded_test_pred = need_level_model.predict(X_test)
y_pred = need_level_encoder.inverse_transform(encoded_test_pred)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Confusion matrix of true vs. predicted need levels.
need_cm_display = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, cmap="Blues"
)
need_cm_display.ax_.set_title("Need Level – Confusion Matrix")
plt.show()


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.base import clone

# Encode y_test with the encoder that was fitted on the training labels.
y_test_enc = need_level_encoder.transform(y_test)

# Number of need-level classes, taken from the fitted encoder instead of a
# hard-coded literal so it stays correct if the label set changes.
n_need_classes = len(need_level_encoder.classes_)

# Candidate classifiers compared on the same train/test split.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=15,
        class_weight="balanced",
        random_state=42
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softprob",
        num_class=n_need_classes,
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    )
}

results = []

for name, model in models.items():
    # Pipeline.fit fits its steps in place, and need_preprocessor is the very
    # object embedded in need_level_model. Cloning gives each comparison
    # pipeline its own unfitted copy so the final model's preprocessing is
    # never refitted as a side effect of this loop.
    pipe = Pipeline([
        ("preprocess", clone(need_preprocessor)),
        ("model", model)
    ])

    pipe.fit(X_train, y_train_enc)
    preds = pipe.predict(X_test)

    acc = accuracy_score(y_test_enc, preds)
    results.append((name, acc))

# One row per model with its hold-out accuracy (displayed as the cell output).
results_df = pd.DataFrame(results, columns=["Model", "Accuracy"])
results_df


In [None]:
# Feature names in the order the preprocessor emits them: the numeric columns
# first, then the one-hot columns produced by the fitted "cat" branch.
fitted_need_preprocess = need_level_model.named_steps["preprocess"]
fitted_onehot = fitted_need_preprocess.named_transformers_["cat"].named_steps["onehot"]
need_feature_names = list(num_cols) + list(fitted_onehot.get_feature_names_out(cat_cols))

rf = need_level_model.named_steps["model"]
importances = rf.feature_importances_

# Importance table, most important feature first (displayed as the cell output).
pd.DataFrame(
    {"feature": need_feature_names, "importance": importances}
).sort_values("importance", ascending=False)


In [None]:
# Data for the assistance-type task: an independent copy of df restricted to
# rows where both the label and the request text are known.
known_rows = df["assistance_type"].ne("Unknown") & df["request_text"].ne("Unknown")
df2 = df.loc[known_rows].copy().reset_index(drop=True)


In [None]:
import re

# Substrings removed from every request text. The original code calls these
# "leakage words" — presumably terms that give away the assistance type.
_LEAKAGE_WORDS = (
    "سلة", "غذائية", "طعام",
    "علاج", "أدوية", "عملية",
    "مدارس", "تعليم",
    "كرسي", "إعاقة",
    "مالية", "إيجار", "سكن",
)


def clean_text(text):
    """Strip the leakage words from a request text and normalise whitespace.

    Parameters
    ----------
    text : object
        Raw request text. Non-string values are converted with ``str``.
        ``None`` and float NaN (a missing cell) are treated as empty text so
        that the literal tokens "None"/"nan" never reach the vectorizer.

    Returns
    -------
    str
        The text with every occurrence of each leakage word removed (plain
        substring removal, applied word by word in list order), runs of
        whitespace collapsed to one space, and leading/trailing space trimmed.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)
    for word in _LEAKAGE_WORDS:
        # The words are plain literals, so str.replace matches exactly what the
        # regex substitution did, without any regex-metacharacter risk.
        text = text.replace(word, "")
    return re.sub(r"\s+", " ", text).strip()

# Store the cleaned text in a new column; the raw request_text column is kept as is.
df2["request_text_clean"] = df2["request_text"].apply(clean_text)


In [None]:
# Inputs of the assistance-type model, grouped by how they are preprocessed.
num_features = [
    "family_size",
    "income_monthly",
    "monthly_expenses",
    "debts",
    "number_of_children",
    "age",
    "expense_to_income_ratio",
]
cat_features = ["housing_type", "health_status", "city", "gender"]
text_feature = "request_text_clean"

# Feature matrix (numeric, categorical, then the cleaned text) and target.
# Note: X and y are rebound here, replacing the need-level versions.
assist_columns = [*num_features, *cat_features, text_feature]
X = df2[assist_columns]
y = df2["assistance_type"]


In [None]:
# 80/20 stratified hold-out split for the assistance-type task. This rebinds
# X_train / X_test / y_train / y_test, replacing the need-level split.
_assist_split = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **_assist_split)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Text branch: unigram TF-IDF over the cleaned request text, keeping at most
# 1500 terms and ignoring terms found in fewer than 5 documents.
request_tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1,1), min_df=5)

# Categorical branch: most-frequent imputation followed by one-hot encoding.
assist_categorical_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: median imputation.
assist_numeric_imputer = SimpleImputer(strategy="median")

# text_feature is passed as a bare column name (not a list) so the vectorizer
# receives a 1-D sequence of strings.
assist_preprocessor = ColumnTransformer([
    ("text", request_tfidf, text_feature),
    ("cat", assist_categorical_steps, cat_features),
    ("num", assist_numeric_imputer, num_features),
])


In [None]:
from xgboost import XGBClassifier

# Final assistance-type model: the shared text/categorical/numeric
# preprocessing followed by a gradient-boosted classifier. This is the object
# that later cells fit and save as `assistance_model`.
#
# num_class is intentionally not passed: the scikit-learn wrapper derives the
# number of classes from the labels at fit time, and assistance_encoder does
# not exist yet when this cell runs.
assistance_model = Pipeline([
    ("preprocess", assist_preprocessor),
    ("model", XGBClassifier(
        objective="multi:softprob",
        n_estimators=400,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ))
])

In [None]:
# Learn the assistance-type label -> integer mapping from the training labels
# and encode the training target. Rebinds y_train_enc (previously need-level).
assistance_encoder = LabelEncoder().fit(y_train)
y_train_enc = assistance_encoder.transform(y_train)


In [None]:
# Fit the assistance-type pipeline on the training split with integer-encoded labels.
# NOTE(review): `assistance_model` must already be bound by an earlier cell.
assistance_model.fit(X_train, y_train_enc)


In [None]:
from sklearn.metrics import accuracy_score


# Candidate classifiers for the assistance-type task.
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=300, max_depth=22, class_weight="balanced",
        random_state=42, n_jobs=-1,
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softprob",
        num_class=len(assistance_encoder.classes_),
        n_estimators=300, max_depth=6, learning_rate=0.1,
        random_state=42,
    ),
}

from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

# Cross-validation runs on the full X / y, so the labels are encoded with a
# separate encoder fitted on all of y.
encoder = LabelEncoder()
y_enc = encoder.fit_transform(y)

# Five shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    # Preprocessing is inside the pipeline, so it is fitted per fold.
    pipe = Pipeline([("preprocess", assist_preprocessor), ("model", model)])
    scores = cross_val_score(
        pipe, X, y_enc, cv=skf, scoring="accuracy", n_jobs=-1
    )
    results.append((name, scores.mean()))

# Mean CV accuracy per model (displayed as the cell output).
pd.DataFrame(results, columns=["Model", "CV Accuracy"])

In [None]:
# Folder on Drive that receives the trained artifacts (created if missing).
BASE_PATH = "/content/drive/MyDrive/graduation_project/final_model"
os.makedirs(BASE_PATH, exist_ok=True)

# Need-level pipeline and its label encoder.
need_model_path = f"{BASE_PATH}/need_level_model.joblib"
joblib.dump(need_level_model, need_model_path)
need_encoder_path = f"{BASE_PATH}/need_level_encoder.joblib"
joblib.dump(need_level_encoder, need_encoder_path)

# Assistance-type pipeline and its label encoder.
assistance_model_path = f"{BASE_PATH}/assistance_model.joblib"
joblib.dump(assistance_model, assistance_model_path)
assistance_encoder_path = f"{BASE_PATH}/assistance_encoder.joblib"
joblib.dump(assistance_encoder, assistance_encoder_path)


In [None]:
import pickle
import os

# Same output folder as the joblib export; created if it does not exist yet.
BASE_PATH = "/content/drive/MyDrive/graduation_project/final_model"
os.makedirs(BASE_PATH, exist_ok=True)

# The four objects below are the same ones written by the joblib export,
# saved again in plain pickle format under different file names.

# Need Level Pipeline (preprocessing + classifier)
with open(f"{BASE_PATH}/need_level_pipeline.pkl", "wb") as f:
    pickle.dump(need_level_model, f)

# Need Level Encoder (label <-> integer mapping)
with open(f"{BASE_PATH}/need_level_encoder.pkl", "wb") as f:
    pickle.dump(need_level_encoder, f)
# Assistance Type Pipeline (preprocessing + classifier)
with open(f"{BASE_PATH}/assistance_type_pipeline.pkl", "wb") as f:
    pickle.dump(assistance_model, f)

# Assistance Type Encoder (label <-> integer mapping)
with open(f"{BASE_PATH}/assistance_type_encoder.pkl", "wb") as f:
    pickle.dump(assistance_encoder, f)


In [None]:
from xgboost import XGBClassifier

# Structured-only assistance-type model (no request text).
#
# NOTE(review): this cell used `preprocessor`, and the cells after it use
# `X_struct`, but neither name is defined anywhere in the notebook, so a fresh
# kernel stops with NameError. Both are reconstructed here from the columns
# that the sample-prediction cell feeds to this model — confirm this is the
# intended feature set.
struct_num_features = [
    "income_monthly",
    "debts",
    "expense_to_income_ratio",
    "family_size",
    "number_of_children",
    "age",
]
struct_cat_features = ["health_status", "housing_type"]

# Same rows as `y` / `y_enc`, which were also taken from df2.
X_struct = df2[struct_num_features + struct_cat_features]

# Median imputation for numerics; most-frequent imputation + one-hot encoding
# for categoricals (unseen categories encode as all zeros).
preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), struct_num_features),
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]), struct_cat_features)
])

model = Pipeline([
    ("preprocess", preprocessor),
    ("clf", XGBClassifier(
        objective="multi:softprob",
        num_class=len(encoder.classes_),
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    ))
])

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
import numpy as np

# Five shuffled, stratified folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accuracy of the structured-only pipeline on each fold, run in parallel.
_struct_cv_options = dict(cv=skf, scoring="accuracy", n_jobs=-1)
scores = cross_val_score(model, X_struct, y_enc, **_struct_cv_options)

print("Fold Scores:", scores)
print("Mean Accuracy:", scores.mean())

In [None]:
# Refit the structured-only pipeline on every row that was used for
# cross-validation (no hold-out) before using it for predictions.
model.fit(X_struct, y_enc)


In [None]:
# A single hand-written applicant used to sanity-check the structured model.
sample_case = {
    "income_monthly": 2500,
    "debts": 10000,
    "expense_to_income_ratio": 4000/2500,
    "family_size": 5,
    "number_of_children": 3,
    "age": 40,
    "health_status": "مريض ضغط",
    "housing_type": "إيجار",
}
sample = pd.DataFrame([sample_case])

# Class probabilities for the one row, printed as percentages next to the
# class names held by the label encoder.
proba = model.predict_proba(sample)[0]

for label, prob in zip(encoder.classes_, proba):
    print(label, round(prob*100,2), "%")

In [None]:
!pip uninstall -y transformers accelerate sentence-transformers
!pip install transformers accelerate --no-cache-dir -q

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments
)

In [None]:
# Filter the data: keep rows whose assistance type and request text are known.
# This rebuilds df2 from df, so columns added to the earlier df2 are not carried over.
known_rows = df["assistance_type"].ne("Unknown") & df["request_text"].ne("Unknown")
df2 = df.loc[known_rows].copy().reset_index(drop=True)

# Encode labels: integer class ids stored in a new "label" column.
encoder = LabelEncoder()
df2["label"] = encoder.fit_transform(df2["assistance_type"])

# Split: 80/20, stratified on the encoded label, fixed seed.
train_df, test_df = train_test_split(
    df2, test_size=0.2, random_state=42, stratify=df2["label"]
)

In [None]:
# Hugging Face Hub id of the pretrained checkpoint used for the text branch.
model_name = "aubmindlab/bert-base-arabertv02"

# Tokenizer and bare encoder (no classification head) loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

In [17]:
# Structured inputs of the hybrid model. These rebind num_features and
# cat_features, replacing the lists used by the earlier scikit-learn models.
num_features = [
    "income_monthly", "debts", "expense_to_income_ratio",
    "family_size", "number_of_children", "age",
]
cat_features = ["health_status", "housing_type"]

# One-hot encode over the whole of df2 to fix the full set of dummy columns;
# each dataset split is later aligned to exactly these columns.
structured_all = pd.get_dummies(df2[[*num_features, *cat_features]])
structured_cols = structured_all.columns

In [23]:
class AssistanceDataset(torch.utils.data.Dataset):
    """Dataset yielding tokenized request text, structured features and label.

    Relies on the module-level ``tokenizer``, ``num_features``,
    ``cat_features`` and ``structured_cols``.
    """

    def __init__(self, df):
        """Cache texts, labels and the one-hot structured matrix of ``df``."""
        dummies = pd.get_dummies(df[num_features + cat_features])
        # Align to the global column set: dummy columns absent from this
        # split are added as zeros, and the column order is made identical.
        aligned = dummies.reindex(columns=structured_cols, fill_value=0)

        self.texts = df["request_text"].astype(str).tolist()
        self.labels = df["label"].tolist()
        self.structured = aligned.astype(np.float32).values

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors.

        The text is tokenized on access, truncated/padded to 64 tokens; the
        tokenizer's leading batch dimension of 1 is squeezed away.
        """
        encoding = tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=64,
            return_tensors="pt",
        )

        item = {}
        for key, val in encoding.items():
            item[key] = val.squeeze(0)
        item["structured"] = torch.tensor(self.structured[idx], dtype=torch.float)
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [24]:
class HybridModel(nn.Module):
    """Classifier that fuses a BERT text embedding with structured features.

    The encoder's pooled output is concatenated with a 128-unit ReLU
    projection of the structured vector; dropout (p=0.5) is applied to the
    concatenation and one linear layer produces the class logits.
    """

    def __init__(self, bert, structured_size, num_classes):
        """
        Args:
            bert: encoder module exposing ``config.hidden_size`` and returning
                an object with ``pooler_output``.
            structured_size: width of the structured feature vector.
            num_classes: number of output classes.
        """
        super().__init__()
        structured_hidden = 128
        self.bert = bert
        self.dropout = nn.Dropout(0.5)
        self.structured_layer = nn.Linear(structured_size, structured_hidden)
        fused_size = bert.config.hidden_size + structured_hidden
        self.classifier = nn.Linear(fused_size, num_classes)

    def forward(self, input_ids, attention_mask, structured, labels=None):
        """Return ``{"loss", "logits"}``; loss is cross-entropy, or None without labels."""
        text_repr = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).pooler_output
        tabular_repr = torch.relu(self.structured_layer(structured))

        # Join the two representations along the feature dimension.
        fused = torch.cat([text_repr, tabular_repr], dim=1)
        logits = self.classifier(self.dropout(fused))

        loss = nn.CrossEntropyLoss()(logits, labels) if labels is not None else None
        return {"loss": loss, "logits": logits}

In [25]:
# Wrap both splits as torch datasets (train first, then test).
train_dataset, test_dataset = AssistanceDataset(train_df), AssistanceDataset(test_df)

# Hybrid network sized from the one-hot column set and the label encoder.
# This rebinds `model`, which earlier held the scikit-learn pipeline.
model = HybridModel(
    bert=bert_model,
    structured_size=len(structured_cols),
    num_classes=len(encoder.classes_),
)

In [26]:
# Trainer settings: 2 epochs, batch size 16, evaluation and checkpointing once
# per epoch, best checkpoint reloaded at the end, no external experiment logging.
_trainer_settings = dict(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",  # the correct keyword name as of transformers 4.41.2
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    report_to="none",
)
training_args = TrainingArguments(**_trainer_settings)

In [27]:
# Fine-tune the hybrid model with the Hugging Face Trainer.
# NOTE(review): the test split doubles as the evaluation set, and
# training_args has load_best_model_at_end=True, so the checkpoint is chosen
# using the same data that the final report is computed on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Runs the training loop; the returned training summary is the cell output.
trainer.train()

  super().__init__(loader)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Run the fine-tuned model on the test split; the predicted class is the
# index of the largest logit in each row.
preds = trainer.predict(test_dataset)
y_pred = np.argmax(preds.predictions, axis=1)

# Per-class precision / recall / F1, labelled with the original class names.
hybrid_report = classification_report(
    test_df["label"], y_pred, target_names=encoder.classes_
)
print(hybrid_report)