In [27]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from typing import List, Dict
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate

class LogisticRegressionModel:
    """Binary logistic regression fitted with batch gradient descent.

    Every user record is turned into a five-element feature row: an
    intercept term, the whole days elapsed since ``lastVisit``,
    ``timeSpent``, ``pagesViewed`` and ``interactionScore``. The four
    non-intercept columns are standardized with a ``StandardScaler`` that
    is fitted in :meth:`train` and reused at prediction time.

    NOTE(review): recency is measured against the clock at call time in
    both ``train`` and ``predict_churn_probability``, so the "days since
    last visit" feature shifts when predictions are made long after
    training.
    """

    # Largest |z| passed to np.exp. Beyond this the sigmoid is within
    # 1e-200 of 0 or 1, while np.exp would overflow float64 past ~709.
    _MAX_ABS_LOGIT = 500.0

    def __init__(self, learning_rate: float = 0.01, iterations: int = 1000):
        """Store the hyperparameters; ``weights`` stays ``None`` until ``train`` runs."""
        self.weights = None
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.scaler = StandardScaler()

    def sigmoid(self, z: float) -> float:
        """Return the logistic function ``1 / (1 + exp(-z))``.

        Accepts a scalar or a NumPy array (applied element-wise). The input
        is clamped first so that ``np.exp`` never overflows.
        """
        clamped = np.clip(z, -self._MAX_ABS_LOGIT, self._MAX_ABS_LOGIT)
        return 1 / (1 + np.exp(-clamped))

    def predict(self, features: np.ndarray) -> float:
        """Return the sigmoid of ``features . weights`` as a plain ``float``.

        ``features`` must already include the intercept term and be scaled
        the same way as the training matrix.
        """
        z = np.dot(features, self.weights)
        return float(self.sigmoid(z))

    @staticmethod
    def _user_features(user: Dict[str, float], now: datetime) -> List[float]:
        """Build the unscaled feature row (intercept first) for one user record."""
        days_since_visit = (now - parse_date(user["lastVisit"])).days
        return [
            1.0,  # Intercept
            days_since_visit,
            user["timeSpent"],
            user["pagesViewed"],
            user["interactionScore"],
        ]

    def train(self, dataset: List[Dict[str, float]], target: List[int]) -> None:
        """Fit the scaler and the weights on ``dataset`` / ``target``.

        Args:
            dataset: User records with the keys ``lastVisit`` (a date string
                understood by ``parse_date``), ``timeSpent``, ``pagesViewed``
                and ``interactionScore``.
            target: One label per record, in the same order.

        Raises:
            ValueError: If ``dataset`` is empty or its length differs from
                ``target``.
        """
        if len(dataset) == 0:
            raise ValueError("dataset must contain at least one record")
        if len(dataset) != len(target):
            raise ValueError(
                f"dataset has {len(dataset)} records but target has {len(target)} labels"
            )

        # One reference instant for the whole batch.
        now = datetime.now()

        # dtype=float is essential: an all-integer matrix would otherwise
        # truncate the standardized values on the in-place assignment below.
        X = np.array([self._user_features(user, now) for user in dataset], dtype=float)
        y = np.array(target, dtype=float)

        # Normalize feature columns (excluding intercept)
        X[:, 1:] = self.scaler.fit_transform(X[:, 1:])

        # Initialize weights
        self.weights = np.zeros(X.shape[1])

        # Gradient descent on the mean log-loss gradient
        for _ in range(self.iterations):
            predictions = self.sigmoid(np.dot(X, self.weights))
            errors = predictions - y
            gradient = np.dot(X.T, errors) / len(y)
            self.weights -= self.learning_rate * gradient

    def predict_churn_probability(self, user: Dict[str, float]) -> float:
        """Return the model's predicted probability for a single user record.

        The record needs the same keys as the records passed to ``train``;
        the scaler fitted there is applied before the weights.
        """
        # Float dtype so the scaled values are not truncated on assignment.
        features = np.array(self._user_features(user, datetime.now()), dtype=float)

        # Normalize features (excluding intercept)
        features[1:] = self.scaler.transform(features[1:].reshape(1, -1))[0]

        return self.predict(features)

    def segment_users_by_risk(
        self,
        users: List[Dict[str, float]],
        *,
        high_threshold: float = 0.7,
        medium_threshold: float = 0.4,
    ) -> Dict[str, List[Dict[str, float]]]:
        """Bucket ``users`` by predicted probability.

        Args:
            users: Records accepted by ``predict_churn_probability``.
            high_threshold: Probabilities strictly above this go to
                ``"highRisk"``.
            medium_threshold: Remaining probabilities strictly above this go
                to ``"mediumRisk"``; everything else goes to ``"lowRisk"``.

        Returns:
            A dict with the keys ``"highRisk"``, ``"mediumRisk"`` and
            ``"lowRisk"``, each holding the matching user records in input
            order.
        """
        high_risk, medium_risk, low_risk = [], [], []

        for user in users:
            churn_probability = self.predict_churn_probability(user)

            if churn_probability > high_threshold:
                high_risk.append(user)
            elif churn_probability > medium_threshold:
                medium_risk.append(user)
            else:
                low_risk.append(user)

        return {"highRisk": high_risk, "mediumRisk": medium_risk, "lowRisk": low_risk}

# Function to handle mixed date formats
def parse_date(date_str: str) -> datetime:
    """Convert a month-day-year string into a ``datetime``.

    Two layouts are tried in order: dash-separated (``%m-%d-%Y``) and
    slash-separated (``%m/%d/%Y``). The first one that parses wins.

    Raises:
        ValueError: If ``date_str`` matches neither layout.
    """
    for layout in ("%m-%d-%Y", "%m/%d/%Y"):
        try:
            parsed = datetime.strptime(date_str, layout)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            pass
        else:
            return parsed
    raise ValueError(f"Unknown date format: {date_str}")

# Main Execution
if __name__ == "__main__":
    # Load the mock user data, fit the model on it, and print one predicted
    # probability per user as a table.
    csv_path = "MOCK_DATA (1).csv"
    if not os.path.exists(csv_path):
        raise FileNotFoundError("MOCK_DATA (1).csv not found. Please ensure the file exists.")

    df = pd.read_csv(csv_path)

    # One dict per row, restricted to the columns the model reads plus the id.
    record_columns = ["user_id", "lastVisit", "timeSpent", "pagesViewed", "interactionScore"]
    dataset = df[record_columns].to_dict(orient="records")

    # Label is 1 when a user's interactionScore is above the median, else 0.
    # NOTE(review): interactionScore is also one of the input features, so the
    # label is a direct function of an input, and "above-median interaction"
    # is what gets reported as "churn" below. Confirm this is intended.
    threshold = df["interactionScore"].median()
    is_above_median = df["interactionScore"] > threshold
    target = is_above_median.astype(int).tolist()

    model = LogisticRegressionModel(learning_rate=0.01, iterations=1000)
    model.train(dataset, target)

    # Score every user that was just trained on, formatted to two decimals.
    user_churn_data = []
    for user in dataset:
        probability = model.predict_churn_probability(user)
        user_churn_data.append((user["user_id"], f"{probability:.2f}"))

    print(tabulate(user_churn_data, headers=["User ID", "Churn Probability"]))


  User ID    Churn Probability
---------  -------------------
        1                 0.05
        2                 0.88
        3                 0.25
        4                 0.4
        5                 0.26
        6                 0.71
        7                 0.27
        8                 0.07
        9                 0.3
       10                 0.17
       11                 0.89
       12                 0.03
       13                 0.96
       14                 0.87
       15                 0.87
       16                 0.2
       17                 0.23
       18                 0.07
       19                 0.77
       20                 0.12
       21                 0.03
       22                 0.75
       23                 0.42
       24                 0.07
       25                 0.17
       26                 0.96
       27                 0.14
       28                 0.04
       29                 0.91
       30                 0.05
       31  