**1. Authentication**

In [None]:
# Colab-specific module: this import only resolves inside a Google Colab runtime.
from google.colab import auth
# Run Colab's user-authentication flow for the current session.
# NOTE(review): presumably this is what lets the BigQuery client created later in
# the notebook act as the signed-in user - confirm against the Colab docs.
auth.authenticate_user()

**2. Load Data**

In [None]:
import pandas as pd
from google.cloud import bigquery

# BigQuery client bound to the same project the training table lives in.
project_id = 'oulad-analytics-project'
client = bigquery.Client(project=project_id)

# Pull every column of the master training table into a pandas DataFrame.
query = "SELECT * FROM `oulad-analytics-project.oulad_raw_data.master_training_data`"
query_job = client.query(query)
df = query_job.to_dataframe()

# Quick look at the first rows to confirm the load worked.
preview = df.head()
print(preview)

**3. Python Preprocessing & Training**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# 1. Handling Categorical Data (Encoding)
cat_cols = ['code_module', 'code_presentation', 'gender', 'region',
            'highest_education', 'imd_band', 'age_band', 'disability']

# Encode on a copy so the raw `df` keeps its original (string) values.
train_df = df.copy()

# Fit one LabelEncoder per column and keep every fitted encoder.
# Re-fitting a single shared encoder would throw away each column's mapping as
# soon as the next column is fitted, leaving no way to decode the integers or
# to encode new rows consistently.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # astype(str) also turns missing values into a literal string category.
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    label_encoders[col] = le
# `le` is left bound to the encoder of the last column, exactly as before.

# 2. Define Features (X) and Target (y)
# Dropping non-training columns like student ID and original text result
X = train_df.drop(columns=['id_student', 'is_successful', 'final_result',
                           'date_registration', 'date_unregistration'])
y = train_df['is_successful']

# 3. Split Data (70% train / 30% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 4. Train Model
print("Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# 5. Evaluate on the held-out test split
y_pred = rf_model.predict(X_test)
print("Model Evaluation:")
print(classification_report(y_test, y_pred))
# Rows are true classes, columns are predicted classes.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

**4. Generate Risk Score**

In [None]:
# 1. Predict Probabilities on the dataset
# predict_proba returns one column per entry of rf_model.classes_, in that order,
# so locate the positive label (1) instead of assuming it sits at position 1.
class_labels = list(rf_model.classes_)
if 1 not in class_labels:
    raise ValueError(
        f"rf_model has no positive class (1); classes seen during fit: {class_labels}"
    )
positive_col = class_labels.index(1)
all_probs = rf_model.predict_proba(X)[:, positive_col]  # Probability of class 1 (Success)
# NOTE(review): X contains the rows the model was fitted on (X_train is a subset
# of X), so the scores for those rows are in-sample and are likely to look more
# confident than the held-out evaluation suggests - consider out-of-fold scores.

# 2. Create a Results DataFrame
# Combine the ID with the prediction; identifiers come from the raw `df`, so the
# module/presentation codes keep their original (un-encoded) values.
results_df = df[['id_student', 'code_module', 'code_presentation']].copy()
results_df['success_probability'] = all_probs
# Bucket the success probability: <=0.5 High Risk, (0.5, 0.8] Medium, (0.8, 1.0] Low.
# include_lowest=True keeps a probability of exactly 0 inside the first bin.
results_df['predicted_risk_group'] = pd.cut(
    results_df['success_probability'],
    bins=[0, 0.5, 0.8, 1.0],
    labels=['High Risk', 'Medium Risk', 'Low Risk'],
    include_lowest=True
)

print(results_df.head())

**5. Export Predictions Back to BigQuery**

In [None]:
# Save results to a new table in BigQuery
table_id = 'oulad-analytics-project.oulad_raw_data.predicted_student_risk'

# DataFrame.to_gbq is deprecated (pandas >= 2.2), so load through the BigQuery
# client that the notebook already created for the query step.
# pd.cut produced a categorical column; send it as plain strings so the upload
# does not depend on how categorical dtypes are serialised.
upload_df = results_df.assign(
    predicted_risk_group=results_df['predicted_risk_group'].astype(str)
)

# WRITE_TRUNCATE overwrites any existing table contents (the previous
# if_exists='replace' behaviour); the table is created if it does not exist.
load_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)
load_job = client.load_table_from_dataframe(upload_df, table_id, job_config=load_config)
load_job.result()  # Wait for the load to finish; raises if the job failed.

print(f"Predictions uploaded to {table_id}")