In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import seaborn as sns
from scipy.stats import f_oneway

In [None]:
def run_device_regression(dataframe, keyword=None, target_column='ConversionRate', baseline_device=None):
    """Fit an OLS regression of a target metric on device dummy variables.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Device' column and ``target_column``; a 'Keyword'
        column is also required when ``keyword`` is given.
    keyword : optional
        When not None, only rows whose 'Keyword' equals this value are used.
    target_column : str, default 'ConversionRate'
        Column used as the dependent variable (cast to float).
    baseline_device : optional
        Device to use as the reference category. When it is present in the
        (filtered) data its dummy column is the one dropped; when it is given
        but absent a warning is printed. In that case, and when it is None,
        the dropped dummy is whichever category ``pd.get_dummies`` lists first.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(y, X).fit()``.

    Notes
    -----
    The caller's ``dataframe`` is never modified: all work happens on a copy.
    """
    # Always take a private copy. Without it, the categorical re-encoding below
    # would either write through to the caller's frame (keyword is None) or
    # assign into a filtered slice (SettingWithCopyWarning).
    if keyword is not None:
        subset = dataframe[dataframe['Keyword'] == keyword].copy()
    else:
        subset = dataframe.copy()

    # Reorder categories so the baseline device comes first and is therefore
    # the column removed by drop_first=True.
    if baseline_device is not None:
        devices_present = subset['Device'].unique()
        if baseline_device in devices_present:
            subset['Device'] = pd.Categorical(
                subset['Device'],
                categories=[baseline_device] + [d for d in devices_present if d != baseline_device],
                ordered=True
            )
        else:
            print(f"Warning: baseline device '{baseline_device}' not found in subset.")

    # One-hot encode devices, dropping the baseline, then add the intercept
    X = pd.get_dummies(subset['Device'], drop_first=True)
    X = sm.add_constant(X)
    X = X.astype(float)

    # Target variable
    y = subset[target_column].astype(float)

    # Run regression
    model = sm.OLS(y, X).fit()

    # Label the output with the same "is not None" test used for filtering, so
    # a falsy keyword (e.g. '') is not mislabeled as "All Keywords".
    title = f"Keyword: '{keyword}'" if keyword is not None else "All Keywords"
    print(f"\n--- Regression Summary for {title} ---\n")
    print(model.summary())

    return model

In [None]:
def run_device_diff_regression(dataframe, target_column, baseline_device=None):
    """Regress within-device changes of a metric on device dummy variables.

    Rows are ordered by 'Device' then 'Ad_Date'. Within each device the change
    is computed with ``diff(-1)``, i.e. the current row minus the *following*
    row, so the last row of every device has no value and is dropped. The
    resulting 'Target_Diff' is the response of an OLS model whose predictors
    are an intercept plus device dummies (first category dropped).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Device', 'Ad_Date' and ``target_column``.
    target_column : str
        Column whose row-to-row change is modeled.
    baseline_device : optional
        If truthy and present in the data, it becomes the first (dropped)
        category; if truthy but absent, a warning is printed.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(y, X).fit()``.
    """
    # sort_values returns a new frame, so the caller's data is not touched
    ordered_rows = dataframe.sort_values(by=['Device', 'Ad_Date'])

    # Current-minus-next change of the target inside each device group
    step_change = ordered_rows.groupby('Device', observed=True)[target_column].diff(-1)

    # Attach the change, discard rows without one, and detach from the parent
    subset = (
        ordered_rows
        .assign(Target_Diff=step_change)
        .dropna(subset=['Target_Diff'])
        .copy()
    )

    # Put the requested baseline first so drop_first removes its dummy
    if baseline_device:
        devices_seen = subset['Device'].unique()
        if baseline_device in devices_seen:
            remaining = [d for d in devices_seen if d != baseline_device]
            subset['Device'] = pd.Categorical(
                subset['Device'],
                categories=[baseline_device] + remaining,
                ordered=True
            )
        else:
            print(f"Warning: baseline device '{baseline_device}' not found in data.")

    # Design matrix: intercept + device dummies, all as floats
    design = sm.add_constant(pd.get_dummies(subset['Device'], drop_first=True)).astype(float)

    # Response: the within-device change
    response = subset['Target_Diff'].astype(float)

    model = sm.OLS(response, design).fit()

    print(f"\n--- Regression Summary on Differences in '{target_column}' by Device ---\n")
    print(model.summary())

    return model

In [None]:
def run_keyword_device_regression(dataframe, keyword_col='Keyword', device_col='Device', target_col='', baseline=None, alpha=0.002):
    """Regress a target on dummies for every keyword/device combination.

    A combined label ``"<keyword>_<device>"`` is built for each row, one-hot
    encoded, and used (with an intercept) as the design matrix of an OLS model.
    The coefficient rows whose p-value is at most ``alpha`` are printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data; it is not modified.
    keyword_col, device_col : str
        Columns combined into the keyword/device label.
    target_col : str
        Dependent-variable column (cast to float). The default '' is only a
        placeholder, so a real column name must be supplied.
    baseline : label or list of labels, optional
        Combined label(s) (e.g. ``"data analytics_mobile"``) to drop as the
        reference category. Passed straight to ``DataFrame.drop``, so an
        unknown label raises ``KeyError``.
    alpha : float, default 0.002
        Upper bound on 'P>|t|' for a coefficient row to be printed.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(y, X).fit()``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``dataframe``.
    """
    # Resolve the target first so a missing/placeholder name fails with a
    # message that says what to fix instead of a bare KeyError: ''.
    try:
        y = dataframe[target_col].astype(float)
    except KeyError as exc:
        raise KeyError(
            f"target_col '{target_col}' is not a column of the dataframe; "
            "pass target_col explicitly."
        ) from exc

    # Build the combined label as a standalone Series; nothing is written back
    # to the input frame, so no full copy of it is needed.
    combined = dataframe[keyword_col].astype(str) + '_' + dataframe[device_col].astype(str)

    # One-hot encode the combined label
    dummies = pd.get_dummies(combined).astype(float)

    if baseline is not None:
        # Drop the reference category to avoid the dummy-variable trap
        dummies = dummies.drop(columns=baseline)
    elif dummies.shape[1] > 1:
        # Every row has exactly one dummy set, so the dummies sum to the
        # intercept column. Only the filtered table is printed below, so say
        # this explicitly.
        print(
            "Warning: no baseline category was dropped; the intercept is collinear "
            "with the full set of dummies, so coefficients are not uniquely identified."
        )

    # Add an intercept/constant term to the regression design matrix
    X = sm.add_constant(dummies)

    # Fit an Ordinary Least Squares (OLS) regression model
    model = sm.OLS(y, X).fit()

    # Coefficient table (estimates, std errors, p-values, ...)
    sig_results = model.summary2().tables[1]

    # Keep only predictors whose p-value is at most alpha
    filtered = sig_results[sig_results['P>|t|'] <= alpha]
    print(filtered)

    return model

In [None]:
def run_subset_regression(dataframe, keyword, target_column, baseline=None):
    """Fit an OLS regression of a target on device dummies for one keyword.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'Keyword', 'Device' and ``target_column``.
    keyword :
        Only rows whose 'Keyword' equals this value are used.
    target_column : str
        Dependent-variable column (cast to float).
    baseline : optional
        Device whose dummy column is dropped as the reference category. If it
        is given but does not occur for this keyword, a warning is printed and
        no column is dropped. With ``baseline=None`` no column is dropped
        either, so the dummies and the intercept are collinear.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(y, X).fit()``, or ``None``
    when the keyword has fewer than 2 rows or fewer than 2 distinct devices.
    """
    # Filter the dataframe to include only rows for the specified keyword
    subset = dataframe[dataframe['Keyword'] == keyword]

    # A device effect needs at least 2 rows and at least 2 distinct devices
    if subset.shape[0] < 2 or subset['Device'].nunique() < 2:
        print(f"Not enough data or device variation for keyword: {keyword}")
        return None

    # One-hot encode the Device column (one dummy per device)
    dummies = pd.get_dummies(subset['Device'])

    # Drop the baseline device column; warn instead of silently ignoring a
    # baseline that is not among this keyword's devices.
    if baseline is not None:
        if baseline in dummies.columns:
            dummies = dummies.drop(columns=baseline)
        else:
            print(
                f"Warning: baseline device '{baseline}' not found for keyword "
                f"'{keyword}'; no device column was dropped."
            )

    # Add an intercept/constant column and ensure predictors are numeric
    X = sm.add_constant(dummies).astype(float)

    # Define the dependent variable (target) as numeric
    y = subset[target_column].astype(float)

    # Fit an Ordinary Least Squares (OLS) regression model
    model = sm.OLS(y, X).fit()

    # Print a header so each regression output is labeled by keyword
    print(f"\n--- Regression Summary for Keyword: '{keyword}' ---\n")
    print(model.summary())

    return model
