<a href="https://colab.research.google.com/github/HazemmoAlsady/AWN_Graduation_Project/blob/main/Hazem's%20edits/Final_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; later cells read the dataset from,
# and write the trained artefacts to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os
import joblib


Mounted at /content/drive


In [2]:
# Read the cleaned dataset from the mounted Drive.
df = pd.read_excel('/content/drive/MyDrive/cleaned_awn_data.xlsx')

# Discard rows whose need level is the "Unknown" placeholder and renumber
# the remaining rows from zero.
has_known_need = df["need_level"] != "Unknown"
df = df.loc[has_known_need].reset_index(drop=True)


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier


In [4]:
# Input columns for the need-level classifier. The order is kept fixed
# because it determines the column order seen by the downstream transformer.
need_features = [
    "family_size", "income_monthly", "monthly_expenses", "debts",
    "number_of_children", "age", "expense_to_income_ratio",
    "case_type", "housing_type", "health_status", "city", "gender",
]

X = df.loc[:, need_features]
y = df.loc[:, "need_level"]

# 80/20 split, seeded for repeatability and stratified on the target so both
# partitions keep the same class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [5]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown="ignore" lets categories unseen during fit pass through
# as all-zero rows instead of raising at predict time.
need_categorical_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: median imputation only.
need_numeric_steps = SimpleImputer(strategy="median")

need_preprocessor = ColumnTransformer([
    ("num", need_numeric_steps, num_cols),
    ("cat", need_categorical_steps, cat_cols),
])


In [6]:
# Random forest for the need-level target; class_weight="balanced" reweights
# classes inversely to their frequency, and the seed keeps runs repeatable.
need_level_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    min_samples_split=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

# Preprocessing and classifier bundled so they are fitted and saved together.
need_level_model = Pipeline(steps=[
    ("preprocess", need_preprocessor),
    ("model", need_level_forest),
])


In [7]:
# Learn the need-level label vocabulary from the training targets, then map
# those same targets to integer codes.
need_level_encoder = LabelEncoder().fit(y_train)
y_train_enc = need_level_encoder.transform(y_train)


In [8]:
# Fit the preprocessing + forest pipeline on the training split.
need_level_model.fit(X=X_train, y=y_train_enc)


In [9]:
# Subset for the assistance-type model: keep rows where both the target and
# the free-text request are known, and renumber the index from zero.
known_assistance = df["assistance_type"] != "Unknown"
known_request = df["request_text"] != "Unknown"
df2 = df.loc[known_assistance & known_request].reset_index(drop=True)


In [10]:
import re

# Keywords stripped from the request text before vectorisation. The original
# code calls these "leakage words" -- presumably because they name the
# assistance category directly; confirm against the label set.
_LEAKAGE_WORDS = (
    "سلة", "غذائية", "طعام",
    "علاج", "أدوية", "عملية",
    "مدارس", "تعليم",
    "كرسي", "إعاقة",
    "مالية", "إيجار", "سكن",
)


def clean_text(text):
    """Strip the leakage keywords from ``text`` and normalise whitespace.

    Parameters
    ----------
    text : object
        Raw request text. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The text with every keyword occurrence removed, runs of whitespace
        collapsed to a single space, and leading/trailing whitespace
        trimmed. Missing input yields ``""``.
    """
    # Missing cells arrive as None or float NaN; str() would turn them into
    # the literal tokens "None"/"nan", which the vectoriser would then treat
    # as real words. `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)
    # Keywords are removed one after another, as plain substrings (they are
    # literal words, not regex patterns).
    for word in _LEAKAGE_WORDS:
        text = text.replace(word, "")
    return re.sub(r"\s+", " ", text).strip()

# Derive the cleaned text column element-wise from the raw request text.
df2["request_text_clean"] = df2["request_text"].map(clean_text)


In [11]:
# Column groups for the assistance-type model, one list per preprocessing branch.
num_features = [
    "family_size",
    "income_monthly",
    "monthly_expenses",
    "debts",
    "number_of_children",
    "age",
    "expense_to_income_ratio",
]

cat_features = [
    "case_type",
    "housing_type",
    "health_status",
    "city",
    "gender",
]

text_feature = "request_text_clean"

# NOTE: X and y are rebound here, replacing the need-level feature matrix
# and target defined earlier in the notebook.
assistance_columns = [*num_features, *cat_features, text_feature]
X = df2[assistance_columns]
y = df2["assistance_type"]


In [12]:
# Seeded, stratified 80/20 split for the assistance-type task. This rebinds
# the split variables previously holding the need-level partitions.
assistance_split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, **assistance_split_options
)


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Text branch: TF-IDF over unigrams and bigrams, capped at 6000 terms.
request_tfidf = TfidfVectorizer(max_features=6000, ngram_range=(1,2))

# Categorical branch: mode imputation followed by one-hot encoding that
# tolerates categories unseen during fit.
assist_categorical_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: median imputation only.
assist_numeric_steps = SimpleImputer(strategy="median")

# text_feature is passed as a bare column name (not a list) so the
# vectoriser receives a 1-D sequence of documents.
assist_preprocessor = ColumnTransformer([
    ("text", request_tfidf, text_feature),
    ("cat", assist_categorical_steps, cat_features),
    ("num", assist_numeric_steps, num_features),
])


In [14]:
# Random forest for the assistance-type target; same settings as the
# need-level forest except for the deeper max_depth of 22.
assistance_forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=22,
    min_samples_split=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

# Preprocessing and classifier bundled so they are fitted and saved together.
assistance_model = Pipeline(steps=[
    ("preprocess", assist_preprocessor),
    ("model", assistance_forest),
])


In [15]:
# Learn the assistance-type label vocabulary from the training targets, then
# map those same targets to integer codes (rebinding y_train_enc).
assistance_encoder = LabelEncoder().fit(y_train)
y_train_enc = assistance_encoder.transform(y_train)


In [16]:
# Fit the text + tabular pipeline on the assistance-type training split.
assistance_model.fit(X=X_train, y=y_train_enc)


In [17]:
# Output folder on the mounted Drive; created if missing, left alone if present.
BASE_PATH = "/content/drive/MyDrive/graduation_project/final_model"
os.makedirs(BASE_PATH, exist_ok=True)

# Destination file for each fitted artefact.
need_model_file = f"{BASE_PATH}/need_level_model.joblib"
need_encoder_file = f"{BASE_PATH}/need_level_encoder.joblib"
assistance_model_file = f"{BASE_PATH}/assistance_model.joblib"
assistance_encoder_file = f"{BASE_PATH}/assistance_encoder.joblib"

# Each pipeline is saved alongside the label encoder needed to decode its
# integer predictions.
joblib.dump(need_level_model, need_model_file)
joblib.dump(need_level_encoder, need_encoder_file)

joblib.dump(assistance_model, assistance_model_file)
joblib.dump(assistance_encoder, assistance_encoder_file)


['/content/drive/MyDrive/graduation_project/final_model/assistance_encoder.joblib']