In [0]:
# -------------------------------------------------------
# Cell 1 - Load Train and Test Data
# -------------------------------------------------------

# Fully qualified names of the two feature tables read by this notebook.
TRAIN_TABLE = "kusha_solutions.telecom_churn_ml.train_final_featured_transformed"
TEST_TABLE = "kusha_solutions.telecom_churn_ml.test_final_featured_transformed"

# NOTE(review): `spark` is not defined in this notebook; presumably the session
# object injected by the notebook runtime - confirm.
train_df, test_df = (spark.table(table_name) for table_name in (TRAIN_TABLE, TEST_TABLE))

print("‚úÖ Train and Test data loaded successfully.")

# Print both schemas (train first, then test) so they can be compared by eye.
for frame in (train_df, test_df):
    frame.printSchema()


In [0]:
# CELL 2 - Convert Spark train_df -> pandas (flatten features)
import pyspark.sql.functions as F
from pyspark.ml.functions import vector_to_array

# -----------------------------
# Convert train_df
# -----------------------------
# Turn the "features_scaled" vector column into a plain array column so that
# individual elements can be selected by position.
train_flat = train_df.withColumn("features_arr", vector_to_array("features_scaled"))

# The vector width is read off the first row. `n_features` and `feature_cols`
# are reused by the test-conversion cell to produce the same column layout.
first_train = train_flat.select("features_arr").head()
n_features = len(first_train["features_arr"])
feature_cols = [f"f{i}" for i in range(n_features)]

# One scalar column per array element (f0, f1, ...), plus the target as "label".
element_columns = [
    F.col("features_arr")[position].alias(column_name)
    for position, column_name in enumerate(feature_cols)
]
label_column = F.col("Churn_index").alias("label")
train_flat = train_flat.select(*element_columns, label_column)

# Collect the flattened frame to the driver as a pandas DataFrame.
pdf_train = train_flat.toPandas()
print("Train pandas shape:", pdf_train.shape)

# Split into feature matrix and target vector for scikit-learn.
X_train = pdf_train[feature_cols]
y_train = pdf_train["label"]


In [0]:
# CELL 3 - Convert Spark test_df -> pandas (same features)
# -----------------------------
# Convert test_df
# -----------------------------
# Relies on `F`, `vector_to_array`, `n_features` and `feature_cols` defined by
# the train-conversion cell, so the test frame gets the same f0..f{n-1} columns.
test_columns = [F.col("features_arr")[position].alias(f"f{position}") for position in range(n_features)]
test_columns.append(F.col("Churn_index").alias("label"))

# Vector -> array -> one scalar column per element, plus the target as "label".
test_flat = (
    test_df
    .withColumn("features_arr", vector_to_array("features_scaled"))
    .select(*test_columns)
)

# Collect the flattened frame to the driver as a pandas DataFrame.
pdf_test = test_flat.toPandas()

print("Test pandas shape:", pdf_test.shape)

# Split into feature matrix and target vector for scikit-learn.
X_test = pdf_test[feature_cols]
y_test = pdf_test["label"]


In [0]:
# -------------------------------------------------------
# CELL 4 - Train PURE PYTHON SKLEARN MODEL (FAST VERSION)
# -------------------------------------------------------

from sklearn.ensemble import RandomForestClassifier

# Random-forest hyperparameters, gathered in one place.
rf_params = {
    "n_estimators": 120,      # number of trees in the forest
    "max_depth": 10,          # cap on the depth of each tree
    "min_samples_split": 5,   # minimum samples a node needs before it may split
    "random_state": 42,       # fixed seed for reproducibility
    "n_jobs": -1,             # use all CPU cores
}
rf_model = RandomForestClassifier(**rf_params)

print("‚è≥ Training sklearn RandomForest (optimized)...")
rf_model.fit(X_train, y_train)
print("üéâ Pure Python sklearn RandomForest trained successfully!")


In [0]:
# CELL 5 - Evaluate sklearn model on EXACT SAME Spark test_df
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import pandas as pd

preds = rf_model.predict(X_test)

# average="weighted": per-class scores are averaged, weighted by class support.
acc  = accuracy_score(y_test, preds)
f1   = f1_score(y_test, preds, average="weighted")
prec = precision_score(y_test, preds, average="weighted")
rec  = recall_score(y_test, preds, average="weighted")

# Use every class seen in EITHER the ground truth or the predictions, and pass
# that list to confusion_matrix explicitly. Deriving the labels from y_test
# alone breaks when the model predicts a class that is absent from y_test: the
# matrix then has more rows/columns than labels and the DataFrame constructor
# raises. With identical label sets the result is the same as before.
class_labels = sorted(set(y_test.unique()).union(pd.unique(preds)))
cm   = confusion_matrix(y_test, preds, labels=class_labels)

print("üìä sklearn model performance (using Spark test_df)")
print("Accuracy :", acc)
print("F1 Score :", f1)
print("Precision:", prec)
print("Recall   :", rec)

# Rows are the true classes, columns are the predicted classes.
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
print("\nConfusion Matrix:")
print(df_cm)



In [0]:
# CELL 6 - Create new experiment + log & register sklearn model

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# -------------------------------------------------------
# 1) Select the experiment (MLflow creates it if it does not exist yet)
# -------------------------------------------------------
mlflow.set_experiment("/Shared/telecom_churn_pure_python_exp")

print("üìå New experiment set: /Shared/telecom_churn_pure_python_exp")


# -------------------------------------------------------
# 2) Create signature + input example
# -------------------------------------------------------
# A handful of training rows stored with the model as a sample input.
input_example = X_train.head(5)
# Input/output schema inferred from the training features and the model's
# predictions on them.
signature = infer_signature(X_train, rf_model.predict(X_train))

model_name = "Churn_PurePython_Model_Final"


# -------------------------------------------------------
# 3) Start MLflow run and log + register model
# -------------------------------------------------------
with mlflow.start_run(run_name="Churn_PurePython_Run_Final") as run:

    # Log the hyperparameters held by the fitted estimator itself. Hard-coded
    # values here previously drifted from the training cell (300 / 12 were
    # logged while the model was built with 120 / 10), which made the tracking
    # record wrong. Reading them from `rf_model` keeps the two in sync.
    mlflow.log_param("model_type", type(rf_model).__name__)
    mlflow.log_param("n_estimators", rf_model.n_estimators)
    mlflow.log_param("max_depth", rf_model.max_depth)
    mlflow.log_param("min_samples_split", rf_model.min_samples_split)

    # Log + register pure Python model
    mlflow.sklearn.log_model(
        sk_model=rf_model,
        artifact_path="model",
        input_example=input_example,
        signature=signature,
        registered_model_name=model_name
    )

    # Kept so later cells can build the "runs:/<run_id>/model" URI.
    run_id = run.info.run_id

print("üî• Model logged successfully!")
print("üÜî Run ID:", run_id)
print("üì¶ Model registered as:", model_name)
print("üëâ Use this model URI for loading: runs:/{}/model".format(run_id))


In [0]:
import pandas as pd

# Keep the first few rows of the training feature frame as a small sample input.
EXAMPLE_ROWS = 5
input_example = X_train.head(EXAMPLE_ROWS)


In [0]:
# Load the model from the run recorded in `run_id` by the logging cell. A
# pasted run ID goes stale as soon as the logging cell is re-executed, so a
# fresh "Run All" would load an old model (or fail if that run is gone).
loaded = mlflow.sklearn.load_model(f"runs:/{run_id}/model")


In [0]:
# CELL 10 - Load the logged sklearn model (pure Python) and test locally
import mlflow

# Build the URI from `run_id` (set by the logging cell) instead of a pasted
# run ID, so this cell always loads the model that the current session logged.
model_uri = f"runs:/{run_id}/model"
loaded = mlflow.sklearn.load_model(model_uri)
print("Loaded model:", loaded)

# Smoke test: score the stored sample rows with the reloaded model.
sample_preds = loaded.predict(input_example)
print("Sample predictions:", sample_preds)
