# CalmPulse Rating Forecast Notebook

Mission: Help the CalmPulse wellbeing mobile app team forecast App Store ratings for meditation and focus companions so we can prioritize UX investments that grow retention.

### Dataset overview
- **Source:** Apple Search API (iTunes Software entity) for meditation/focus keywords harvested in Oct 2024.
- **Rows:** 1,077 wellbeing/productivity iOS apps.
- **Target:** `avg_rating` (1–5) from the live store listing.
- **Features:** Pricing, engagement (rating_count, size_mb), monetization flags, localization breadth, update cadence.
- **Mission fit:** CalmPulse prioritizes meditation/focus companions with reliable ratings signals so product can invest in the stickiest feature sets.


In [None]:
import json
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Apply one shared look to every figure in the notebook: matplotlib's
# 'seaborn-v0_8-whitegrid' style sheet plus seaborn's 'viridis' palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')


def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that marks the project.

    A directory qualifies when it contains a ``.git`` entry or a ``summative``
    entry. The search begins at *start* itself and walks towards the
    filesystem root; *start* is returned unchanged when nothing qualifies.
    """
    markers = ('.git', 'summative')
    candidates = (start, *start.parents)
    return next(
        (
            candidate
            for candidate in candidates
            if any((candidate / marker).exists() for marker in markers)
        ),
        start,
    )


# Resolve every notebook location relative to the detected project root.
PROJECT_ROOT = find_project_root(Path.cwd())
NOTEBOOK_DIR = PROJECT_ROOT.joinpath('summative', 'linear_regression')
DATA_DIR = NOTEBOOK_DIR.joinpath('data')
MODELS_DIR = NOTEBOOK_DIR.joinpath('models')
REPORTS_DIR = NOTEBOOK_DIR.joinpath('reports')
DATA_PATH = DATA_DIR.joinpath('mindful_app_store.csv')

# Create the output folders up front so later cells can write into them.
for output_dir in (MODELS_DIR, REPORTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print('Using project root:', PROJECT_ROOT)
print('Using data from:', DATA_PATH)

# Load the raw CSV and preview its first rows.
raw_df = pd.read_csv(DATA_PATH)
print('Raw shape:', raw_df.shape)
raw_df.head()


In [None]:
# Summary statistics for every column, shown with one row per column.
raw_df.describe(include='all').T


In [None]:
# Figure 1: target distribution (left) and engagement vs rating (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
rating_ax, engagement_ax = axes

# Left panel: histogram of the target with its median marked.
median_rating = raw_df['avg_rating'].median()
sns.histplot(raw_df['avg_rating'], bins=30, ax=rating_ax, color='teal')
rating_ax.axvline(median_rating, color='orange', linestyle='--', label='Median')
rating_ax.set(
    title='Distribution of Average Ratings',
    xlabel='Average Rating (1-5)',
)
rating_ax.legend()

# Right panel: rating against log1p(rating_count), coloured by the IAP flag.
log_rating_count = np.log1p(raw_df['rating_count'])
sns.scatterplot(
    data=raw_df,
    x=log_rating_count,
    y='avg_rating',
    hue='has_iap',
    alpha=0.5,
    ax=engagement_ax,
)
engagement_ax.set(
    title='Engagement vs Rating (log scale)',
    xlabel='log(1 + Rating Count)',
    ylabel='Average Rating',
)
fig.tight_layout()
plt.show()

# Figure 2: pairwise correlations between the numeric columns and the target.
numeric_cols = ['price', 'rating_count', 'size_mb', 'language_count', 'age_days', 'update_recency_days']
corr = raw_df[[*numeric_cols, 'avg_rating']].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='Blues')
plt.title('Correlation Heatmap')
plt.show()


### Quick takeaways from the raw data
- Ratings are tightly clustered between 4.0–4.8 with a long left tail toward low ratings, so small RMSE deltas matter.
- Engagement (`rating_count`) has a weak positive relationship with rating once we log-scale it; monetization flag (`has_iap`) separates slightly higher performers.
- Update cadence (`update_recency_days`) and localization breadth (`language_count`) have the strongest numeric correlation with `avg_rating`, guiding which columns to treat as high-signal.


In [None]:
def parse_min_ios(value):
    """Convert a minimum-iOS-version value to a float.

    Args:
        value: ``None``, a number, or a version string such as ``'13.0'``,
            ``'13.0 or later'`` or ``'12.1.2'``.

    Returns:
        The version as a float. Multi-part versions are reduced to
        ``major.minor`` (``'12.1.2'`` -> ``12.1``) instead of being discarded.
        ``np.nan`` is returned for ``None``, NaN, empty strings and text whose
        first whitespace-separated token is not a version number.
    """
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return np.nan
    if isinstance(value, (int, float)):
        return float(value)

    # Only the first whitespace-separated token can carry the version.
    tokens = str(value).split()
    if not tokens:
        return np.nan
    text = tokens[0]

    try:
        return float(text)
    except ValueError:
        pass

    # 'major.minor.patch' is not a valid float literal; keep 'major.minor'.
    major_minor = '.'.join(text.split('.')[:2])
    try:
        return float(major_minor)
    except ValueError:
        return np.nan


def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw app listing frame and add the engineered columns.

    Steps, in order:
      1. Drop rows with a missing value in the target or any core column.
      2. Keep rows whose ``avg_rating`` lies in [1, 5] and whose
         ``rating_count`` is positive.
      3. Add ``min_ios_numeric`` (parsed from ``min_ios``, with gaps filled by
         the column median) and ``log_rating_count`` (``log1p`` of
         ``rating_count``).
      4. Keep the first row for each ``app_name``.

    The input frame is not modified; a new frame is returned.
    """
    required_columns = [
        'avg_rating', 'price', 'rating_count', 'size_mb', 'primary_genre',
        'content_rating', 'language_count', 'age_days', 'update_recency_days',
    ]
    complete = df.dropna(subset=required_columns)

    rating_in_range = complete['avg_rating'].between(1, 5)
    has_ratings = complete['rating_count'] > 0
    cleaned = complete.loc[rating_in_range & has_ratings].copy()

    # The median is taken over the filtered rows, before de-duplication.
    parsed_min_ios = cleaned['min_ios'].apply(parse_min_ios)
    cleaned['min_ios_numeric'] = parsed_min_ios.fillna(parsed_min_ios.median())
    cleaned['log_rating_count'] = np.log1p(cleaned['rating_count'])

    return cleaned.drop_duplicates(subset=['app_name'])

# Run the cleaning / feature step on the raw pull and preview the result.
clean_df = raw_df.pipe(engineer_features)
print('Engineered shape:', clean_df.shape)
clean_df.head()


### Feature engineering notes
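- Drop rows that are missing the target or any core predictor (price, rating_count, size_mb, primary_genre, content_rating, language_count, age_days, update_recency_days) to avoid training noise.
- Keep only rows whose target lies in [1, 5] (out-of-range rows are removed, not clipped) and require at least one rating to mimic App Store ranking rules.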
- Parse the `min_ios` string into a numeric feature and log-scale rating_count; both transformations line up with how we expect experience quality to shift.
- Deduplicate by `app_name` so heavy hitters do not leak across splits.


In [None]:
# Target vector and candidate feature frame (label and identifier removed).
target = clean_df['avg_rating']
feature_df = clean_df.drop(columns=['avg_rating', 'app_name'])

# Stratify on rating quartiles so both splits see a similar target spread;
# duplicates='drop' merges bins whose edges coincide.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    target,
    test_size=0.2,
    random_state=42,
    stratify=pd.qcut(target, q=4, duplicates='drop'),
)
print('Train/Test split:', X_train.shape, X_test.shape)

# Only the columns named below reach the models: ColumnTransformer drops every
# unlisted column by default. The engineered `log_rating_count` is listed in
# place of the raw `rating_count`; previously the log feature was computed but
# never selected, so the models trained on the heavy-tailed raw count.
numeric_features = [
    'price', 'log_rating_count', 'size_mb', 'language_count', 'has_iap', 'has_support_url',
    'is_game_center', 'age_days', 'update_recency_days', 'min_ios_numeric'
]
categorical_features = ['primary_genre', 'content_rating']

# Standardise numeric inputs; one-hot encode categoricals, encoding categories
# unseen during fit as all zeros (handle_unknown='ignore').
preprocessor = ColumnTransformer([
    ('num', Pipeline([('scaler', StandardScaler())]), numeric_features),
    ('cat', Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), categorical_features),
])


In [None]:
# Fit each baseline regressor behind the same preprocessing recipe and record
# train/test metrics for the comparison table.
results = []
trained_pipelines = {}
base_models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(max_depth=8, min_samples_leaf=5, random_state=42),
    'RandomForest': RandomForestRegressor(
        n_estimators=300,
        max_depth=12,
        min_samples_leaf=4,
        random_state=42,
        n_jobs=-1,
    ),
}

for name, model in base_models.items():
    # Pipeline fits its steps in place, so every pipeline gets its own unfitted
    # copy of the preprocessor. Sharing one instance would let a later refit of
    # any pipeline silently change the preprocessing of all the others.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', model),
    ])
    pipeline.fit(X_train, y_train)
    preds_train = pipeline.predict(X_train)
    preds_test = pipeline.predict(X_test)
    metrics = {
        'model': name,
        'train_rmse': np.sqrt(mean_squared_error(y_train, preds_train)),
        'test_rmse': np.sqrt(mean_squared_error(y_test, preds_test)),
        'test_mae': mean_absolute_error(y_test, preds_test),
        'test_r2': r2_score(y_test, preds_test),
    }
    results.append(metrics)
    trained_pipelines[name] = pipeline

# Display the comparison table, best test RMSE first.
results_df = pd.DataFrame(results)
results_df.sort_values('test_rmse')


In [None]:
# SGD regressor trained one epoch per fit() call so the loss can be recorded
# after every pass: max_iter=1 limits each call to a single epoch and
# warm_start=True makes the next call continue from the current coefficients.
sgd = Pipeline([
    # Own copy of the preprocessor: Pipeline fits its steps in place, so a
    # shared instance would be refitted here on behalf of every other pipeline.
    ('preprocessor', clone(preprocessor)),
    ('model', SGDRegressor(
        loss='squared_error',
        penalty=None,
        max_iter=1,
        learning_rate='constant',
        eta0=0.01,
        random_state=42,
        warm_start=True,
        tol=None,
    )),
])

train_losses = []
test_losses = []
EPOCHS = 80
for _ in range(EPOCHS):
    sgd.fit(X_train, y_train)
    train_losses.append(mean_squared_error(y_train, sgd.predict(X_train)))
    test_losses.append(mean_squared_error(y_test, sgd.predict(X_test)))

# Loss curves: train vs test MSE per epoch, saved alongside the other reports.
plt.figure(figsize=(8, 5))
plt.plot(train_losses, label='Train MSE')
plt.plot(test_losses, label='Test MSE')
plt.title('SGD Gradient Descent Loss Curves')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()
plt.tight_layout()
plt.savefig(REPORTS_DIR / 'loss_curve.png', dpi=200)
plt.show()

# Final-epoch metrics; the test predictions are computed once and reused.
sgd_test_preds = sgd.predict(X_test)
sgd_metrics = {
    'model': 'SGDRegressor',
    'train_rmse': np.sqrt(train_losses[-1]),
    'test_rmse': np.sqrt(test_losses[-1]),
    'test_mae': mean_absolute_error(y_test, sgd_test_preds),
    'test_r2': r2_score(y_test, sgd_test_preds),
}

# Re-running this cell must not stack duplicate SGD rows in the shared list,
# so any earlier SGD entry is removed before the fresh one is appended.
results[:] = [entry for entry in results if entry['model'] != 'SGDRegressor']
results.append(sgd_metrics)
trained_pipelines['SGDRegressor'] = sgd
pd.DataFrame(results)


### Model comparison
- **Random Forest** generalizes best with ~0.27 RMSE and ~0.20 MAE, handling the mild non-linearity between engagement and ratings.
- **Linear Regression** underfits (higher bias, negative R²) once categorical one-hot encoding explodes the feature space.
- **Decision Tree** overfits the training split despite depth limits, so we prefer the ensemble’s smoother residuals.
- **SGD (gradient descent)** converges slowly but provides the required loss-curve artifact for Task 1.


In [None]:
# Single-feature illustration: least-squares line of rating on log engagement.
x = clean_df['log_rating_count']
y = clean_df['avg_rating']
slope, intercept = np.polyfit(x, y, deg=1)

# Evaluate the fitted line on an even grid spanning the observed x range.
x_line = np.linspace(x.min(), x.max(), num=200)
y_line = intercept + slope * x_line

fit_fig, fit_ax = plt.subplots(figsize=(8, 5))
fit_ax.scatter(x, y, alpha=0.35, label='Actual Apps', color='teal')
fit_ax.plot(x_line, y_line, color='darkorange', linewidth=2.5, label='Linear Fit')
fit_ax.set(
    xlabel='log(1 + Rating Count)',
    ylabel='Average Rating',
    title='Scatter Plot with Regression Line',
)
fit_ax.legend()
fit_fig.tight_layout()
fit_fig.savefig(REPORTS_DIR / 'rating_line_fit.png', dpi=200)
plt.show()


In [None]:
# Persist the metrics table for every trained model as JSON.
metrics_path = REPORTS_DIR / 'model_metrics.json'
with metrics_path.open('w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

# NOTE(review): the winner is picked on test RMSE, so the reported test score
# of the selected model is optimistically biased; a validation split or
# cross-validation would give a cleaner estimate.
best_model = min(results, key=lambda m: m['test_rmse'])
joblib.dump(
    {
        'pipeline': trained_pipelines[best_model['model']],
        'metrics': best_model,
        'feature_columns': list(feature_df.columns),
        # Training-set statistic stored so inference code can impute an
        # unparseable `min_ios` with the value the model saw during training.
        'min_ios_median': float(clean_df['min_ios_numeric'].median()),
    },
    MODELS_DIR / 'best_model.joblib',
)
print('Best model:', best_model)
print('Artifacts saved to:', MODELS_DIR)


In [None]:
def predict_rating(payload: dict) -> float:
    """Predict the average rating for one app described by *payload*.

    The saved best-model artifact is loaded from ``MODELS_DIR``, the engineered
    columns (``min_ios_numeric``, ``log_rating_count``) are derived from the
    payload, and the row is aligned to the training column order before it is
    passed to the pipeline.

    Args:
        payload: Raw app attributes keyed by dataset column name. It must
            contain ``rating_count``. ``min_ios`` is optional. Any other
            training column absent from the payload is filled with 0.

    Returns:
        The predicted rating as a plain float.

    Raises:
        KeyError: If ``rating_count`` is missing from *payload*.
    """
    artifact = joblib.load(MODELS_DIR / 'best_model.joblib')
    pipeline = artifact['pipeline']

    min_ios_numeric = parse_min_ios(payload.get('min_ios'))
    if np.isnan(min_ios_numeric):
        # A one-row frame has no usable median of its own, so imputation must
        # use the training statistic. Prefer the value stored in the artifact
        # and fall back to the in-notebook training frame for older artifacts.
        train_median = artifact.get('min_ios_median')
        if train_median is None:
            train_median = clean_df['min_ios_numeric'].median()
        min_ios_numeric = float(train_median)

    row = pd.DataFrame([payload])
    row['min_ios_numeric'] = min_ios_numeric
    row['log_rating_count'] = np.log1p(row['rating_count'])

    # reindex orders the columns as in training, drops extras and zero-fills
    # any column the payload did not supply.
    row = row.reindex(columns=artifact['feature_columns'], fill_value=0)
    return float(pipeline.predict(row)[0])

# Example payload for a free Health & Fitness app. The keys are the raw
# dataset column names that predict_rating reads or forwards to the pipeline.
sample_app = {
    'price': 0.0,
    'rating_count': 3500,
    'size_mb': 110.0,
    'primary_genre': 'Health & Fitness',
    'content_rating': '4+',
    'language_count': 12,
    'has_iap': 1,
    'has_support_url': 1,
    'min_ios': '13.0',
    'is_game_center': 0,
    'age_days': 2200,
    'update_recency_days': 30,
}

# Smoke test: as the last expression of the cell, the prediction is displayed.
predict_rating(sample_app)
