# Churn Prediction (Dev Sample)
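This notebook loads events and users from Cosmos DB, aggregates per-user activity features over a `WINDOW_DAYS`-day window, labels a user as churned when they have no events in the following `WINDOW_DAYS` days, and trains a simple logistic regression model on the result.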

In [None]:
import os
from datetime import datetime, timedelta, timezone
import pandas as pd
from azure.cosmos import CosmosClient
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib

In [None]:
# Notebook configuration. The Cosmos settings are read from the environment
# so that no connection string is stored in the notebook itself.
WINDOW_DAYS = 7  # length, in days, of the feature window and of the label window
COSMOS_DB_NAME = os.environ.get('COSMOS_DB_NAME', 'fitnessGame')
COSMOS_CONNECTION = os.environ.get('COSMOS_CONNECTION', '')

In [None]:
# Connect to Cosmos DB and pull the full 'Events' and 'Users' containers
# into memory as DataFrames.
client = CosmosClient.from_connection_string(COSMOS_CONNECTION)
db = client.get_database_client(COSMOS_DB_NAME)

events_container = db.get_container_client('Events')
events = list(events_container.read_all_items())
users_container = db.get_container_client('Users')
users = list(users_container.read_all_items())

users_df = pd.DataFrame(users)
events_df = pd.DataFrame(events)
# Parse timestamps as timezone-aware UTC so they can be compared against
# the UTC window boundaries computed later in the notebook.
parsed_timestamps = pd.to_datetime(events_df['timestamp'], utc=True)
events_df['timestamp'] = parsed_timestamps

In [None]:
# Feature window: the WINDOW_DAYS days that end one full label period before
# `now`.
#
# The labelling cell marks a user as retained when they have an event in
# (window_end, window_end + WINDOW_DAYS]. If window_end were `now`, that label
# period would lie entirely in the future, no event could fall inside it, and
# every user would be labelled as churned (leaving the model a single class).
# Ending the feature window WINDOW_DAYS ago makes the label period end at
# `now`, so it is fully covered by the events already loaded.
now = datetime.now(timezone.utc)
window_end = now - timedelta(days=WINDOW_DAYS)
window_start = window_end - timedelta(days=WINDOW_DAYS)

# Both bounds are inclusive; the label period starts strictly after
# window_end, so an event is never counted as both a feature and a label.
in_window = (events_df['timestamp'] >= window_start) & (events_df['timestamp'] <= window_end)
window_df = events_df[in_window]

In [None]:
def current_streak(dates, end_date):
    """Count the consecutive active days that end on ``end_date``.

    Args:
        dates: Iterable of ``datetime.date`` values on which activity occurred.
        end_date: ``datetime`` whose calendar date is the last day of the
            streak being measured.

    Returns:
        The number of back-to-back days, walking backwards from
        ``end_date.date()``, that are all present in ``dates``. The result is
        0 when the end date itself is not present.
    """
    active = set(dates)
    one_day = timedelta(days=1)
    cursor = end_date.date()
    length = 0
    while cursor in active:
        length += 1
        cursor -= one_day
    return length

def agg_user(group):
    """Collapse one user's events into a single row of model features.

    ``group`` is the per-user sub-frame supplied by ``groupby('userId').apply``.
    It is read through its ``timestamp``, ``eventType`` and ``data`` columns.
    The streak is measured backwards from the module-level ``window_end``.

    Returns:
        A ``pd.Series`` with the keys ``active_days``, ``matches_played``,
        ``total_distance``, ``total_calories``, ``current_streak`` and
        ``friends_count``.
    """
    event_dates = group['timestamp'].dt.date
    day_count = event_dates.nunique()
    streak_len = current_streak(event_dates.unique().tolist(), window_end)

    kinds = group['eventType']
    match_count = int(kinds.isin(['MATCH_STARTED', 'MATCH_FINISHED']).sum())

    payloads = group['data']

    def payload_total(key):
        # Payloads that are not dicts contribute 0 to the total.
        return payloads.apply(
            lambda item: item.get(key, 0) if isinstance(item, dict) else 0
        ).sum()

    distance_sum = payload_total('distance')
    calorie_sum = payload_total('calories')
    friend_count = int(kinds.eq('FRIEND_ADDED').sum())

    return pd.Series({
        'active_days': day_count,
        'matches_played': match_count,
        'total_distance': distance_sum,
        'total_calories': calorie_sum,
        'current_streak': streak_len,
        'friends_count': friend_count,
    })

# Build one feature row per user from the events inside the feature window.
features = window_df.groupby('userId').apply(agg_user).reset_index()

# When user documents are available, prefer the friend count stored on the
# user document over the count derived from FRIEND_ADDED events.
if not users_df.empty:
    # rename() ignores labels that are not present, so one call covers frames
    # with or without a 'friendsCount' column.
    users_df = users_df.rename(columns={'id': 'userId', 'friendsCount': 'friends_count'})
    if 'friends_count' in users_df.columns:
        profile_counts = users_df[['userId', 'friends_count']]
        features = features.merge(profile_counts, how='left', on='userId')
        # Both frames carry 'friends_count', so the merge suffixes them:
        # _x is the event-derived count, _y is the value from the user document.
        from_events = features['friends_count_x']
        from_profile = features['friends_count_y']
        features['friends_count'] = from_profile.fillna(from_events).fillna(0)
        features = features.drop(columns=['friends_count_x', 'friends_count_y'])

In [None]:
# Label period: the WINDOW_DAYS days immediately after the feature window.
# A user with at least one event in (window_end, label_cutoff] is retained
# (churn = 0); every other user is labelled as churned (churn = 1).
# NOTE(review): the labels are only meaningful if the loaded events actually
# cover this period - confirm against how window_end is chosen.
label_cutoff = window_end + timedelta(days=WINDOW_DAYS)
label_mask = (events_df['timestamp'] > window_end) & (events_df['timestamp'] <= label_cutoff)
active_next = set(events_df.loc[label_mask, 'userId'].unique())
features['churn'] = features['userId'].apply(lambda uid: int(uid not in active_next))
features.head()

In [None]:
# Split features from the target, hold out 20% of users for evaluation,
# fit a logistic regression, and persist the fitted model to disk.
y = features['churn']
X = features.drop(columns=['userId', 'churn'])  # userId is an identifier, not a feature
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

train_accuracy = model.score(X_train, y_train)
print('Train accuracy:', train_accuracy)
test_accuracy = model.score(X_test, y_test)
print('Test accuracy:', test_accuracy)

joblib.dump(model, 'churn_model.joblib')