In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from scipy.spatial.distance import mahalanobis
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.diagnostic import linear_reset, het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tools.tools import add_constant

# Load data

In [None]:
# Read the three per-phase tables in a fixed order (research, design,
# improvement); each CSV sits next to the notebook.
research, design, improvement = (
    pd.read_csv(csv_path)
    for csv_path in ('./research.csv', './design.csv', './improvement.csv')
)

In [None]:
# Survey table. The analysis cells below loop over every column of this frame
# and use each one as a regression target, looking rows up by the row labels of
# the per-phase tables.
# NOTE(review): this presumes survey rows and log rows share the same index
# (same participants in the same order) — confirm against how the CSVs were exported.
survey = pd.read_csv('./survey.csv')

# Log analysis


In [None]:
def mahalanobis_distance(row, mean, inv_cov_matrix):
    """Return the Mahalanobis distance of one observation from a centre.

    Parameters
    ----------
    row : 1-D array-like
        The observation.
    mean : 1-D array-like
        The centre the distance is measured from; subtracted element-wise
        from ``row``.
    inv_cov_matrix : 2-D array
        Inverse covariance matrix used to weight the deviation.

    Returns
    -------
    The square root of ``(row - mean)^T @ inv_cov_matrix @ (row - mean)``.
    """
    deviation = row - mean
    squared_distance = deviation.T @ inv_cov_matrix @ deviation
    return np.sqrt(squared_distance)

## Research

In [None]:
# Research phase: regress every survey column on the research log features and
# print the fitted model only when all four OLS assumption checks pass.

ALPHA = 0.05                   # significance level for the diagnostic p-values
DW_LOWER, DW_UPPER = 1.5, 2.5  # rule-of-thumb "no autocorrelation" band around 2
OUTLIER_PERCENTILE = 97.5      # rows at or above this distance percentile are dropped
RESEARCH_PREDICTORS = ['research_time_sum', 'research_click_sum']

results_i = []   # not used in this cell; kept so any later cell reading it still works
results_h3 = []  # one fitted OLS result per survey column, in column order

# Outlier removal using Mahalanobis distance.
# Nothing here depends on the survey column, so it runs once rather than once
# per loop iteration.
data_mean = np.mean(research, axis=0)
cov_matrix = np.cov(research.values.T)
inv_cov_matrix = np.linalg.inv(cov_matrix)  # raises LinAlgError if the covariance is singular
mahal_distances = research.apply(
    mahalanobis_distance, axis=1, args=(data_mean, inv_cov_matrix)
)
threshold = np.percentile(mahal_distances, OUTLIER_PERCENTILE)
research_filtered = research[mahal_distances < threshold]

for survey_column in survey.columns:

    # Survey answers of the retained rows, without missing answers.
    survey_tmp = survey.loc[research_filtered.index, [survey_column]].dropna()
    # Keep exactly the same rows on the predictor side; otherwise a survey
    # column with missing answers gives OLS a target and a design matrix of
    # different lengths.
    research_complete = research_filtered.loc[survey_tmp.index]

    # Normalize Data (fit_transform refits the scaler on each call, so reusing
    # one scaler object for predictors and target is safe).
    min_max_scaler = MinMaxScaler()
    research_scaled = pd.DataFrame(
        min_max_scaler.fit_transform(research_complete),
        columns=research_complete.columns,
    )
    survey_tmp_scaled = min_max_scaler.fit_transform(survey_tmp)

    # Model Fitting: OLS with an intercept and HC3 robust standard errors.
    research_scaled = sm.add_constant(research_scaled[RESEARCH_PREDICTORS])
    model_filtered = sm.OLS(survey_tmp_scaled, research_scaled)
    results_robust = model_filtered.fit(cov_type='HC3')

    # Append Results
    results_h3.append(results_robust)

    # Statistical Tests
    residuals = results_robust.resid
    # Ramsey RESET on powers of the fitted values (linearity).
    linearity_p_value = linear_reset(results_robust, power=2, test_type='fitted').pvalue
    # D'Agostino-Pearson omnibus test on the residuals (normality).
    normality_p_value = stats.normaltest(residuals).pvalue
    # Breusch-Pagan; element 1 of the returned tuple is the LM-test p-value.
    homoscedasticity_p_value = het_breuschpagan(residuals, results_robust.model.exog)[1]
    # Durbin-Watson is a statistic in [0, 4], not a p-value: about 2 means no
    # first-order autocorrelation, so it is checked against a band around 2
    # instead of against the significance level.
    independence_statistic = durbin_watson(residuals)

    # Interpretation of Results
    assumptions_hold = (
        linearity_p_value > ALPHA
        and normality_p_value > ALPHA
        and homoscedasticity_p_value > ALPHA
        and DW_LOWER < independence_statistic < DW_UPPER
    )
    if assumptions_hold:
        print(f"Column: {survey_column}")
        print("\nInterpretation:")
        print("Linearity: Satisfied")
        print("Normality: Satisfied")
        print("Homoscedasticity: Satisfied")
        print("Independence: Satisfied")
        print("=" * 80)
        print(results_robust.summary())


## Design

In [None]:
# Design phase: regress every survey column on the design log features and
# print the fitted model only when all four OLS assumption checks pass.

ALPHA = 0.05                   # significance level for the diagnostic p-values
DW_LOWER, DW_UPPER = 1.5, 2.5  # rule-of-thumb "no autocorrelation" band around 2
OUTLIER_PERCENTILE = 97.5      # rows at or above this distance percentile are dropped
DESIGN_PREDICTORS = ['design_time_sum', 'design_click_sum']

results_i = []   # not used in this cell; kept so any later cell reading it still works
results_h3 = []  # one fitted OLS result per survey column, in column order

# Outlier removal using Mahalanobis distance.
# Nothing here depends on the survey column, so it runs once rather than once
# per loop iteration.
data_mean = np.mean(design, axis=0)
cov_matrix = np.cov(design.values.T)
inv_cov_matrix = np.linalg.inv(cov_matrix)  # raises LinAlgError if the covariance is singular
mahal_distances = design.apply(
    mahalanobis_distance, axis=1, args=(data_mean, inv_cov_matrix)
)
threshold = np.percentile(mahal_distances, OUTLIER_PERCENTILE)
design_filtered = design[mahal_distances < threshold]

for survey_column in survey.columns:

    # Survey answers of the retained rows, without missing answers.
    survey_tmp = survey.loc[design_filtered.index, [survey_column]].dropna()
    # Keep exactly the same rows on the predictor side; otherwise a survey
    # column with missing answers gives OLS a target and a design matrix of
    # different lengths.
    design_complete = design_filtered.loc[survey_tmp.index]

    # Normalize Data (fit_transform refits the scaler on each call, so reusing
    # one scaler object for predictors and target is safe).
    min_max_scaler = MinMaxScaler()
    design_scaled = pd.DataFrame(
        min_max_scaler.fit_transform(design_complete),
        columns=design_complete.columns,
    )
    survey_tmp_scaled = min_max_scaler.fit_transform(survey_tmp)

    # Model Fitting: OLS with an intercept and HC3 robust standard errors.
    design_scaled = sm.add_constant(design_scaled[DESIGN_PREDICTORS])
    model_filtered = sm.OLS(survey_tmp_scaled, design_scaled)
    results_robust = model_filtered.fit(cov_type='HC3')

    # Append Results
    results_h3.append(results_robust)

    # Statistical Tests
    residuals = results_robust.resid
    # Ramsey RESET on powers of the fitted values (linearity).
    linearity_p_value = linear_reset(results_robust, power=2, test_type='fitted').pvalue
    # D'Agostino-Pearson omnibus test on the residuals (normality).
    normality_p_value = stats.normaltest(residuals).pvalue
    # Breusch-Pagan; element 1 of the returned tuple is the LM-test p-value.
    homoscedasticity_p_value = het_breuschpagan(residuals, results_robust.model.exog)[1]
    # Durbin-Watson is a statistic in [0, 4], not a p-value: about 2 means no
    # first-order autocorrelation, so it is checked against a band around 2
    # instead of against the significance level.
    independence_statistic = durbin_watson(residuals)

    # Interpretation of Results
    assumptions_hold = (
        linearity_p_value > ALPHA
        and normality_p_value > ALPHA
        and homoscedasticity_p_value > ALPHA
        and DW_LOWER < independence_statistic < DW_UPPER
    )
    if assumptions_hold:
        print(f"Column: {survey_column}")
        print("\nInterpretation:")
        print("Linearity: Satisfied")
        print("Normality: Satisfied")
        print("Homoscedasticity: Satisfied")
        print("Independence: Satisfied")
        print("=" * 80)
        print(results_robust.summary())


## Improvement

In [None]:
# Improvement phase: regress every survey column on the improvement log
# features and print the fitted model only when all four OLS assumption checks pass.

ALPHA = 0.05                   # significance level for the diagnostic p-values
DW_LOWER, DW_UPPER = 1.5, 2.5  # rule-of-thumb "no autocorrelation" band around 2
OUTLIER_PERCENTILE = 97.5      # rows at or above this distance percentile are dropped
IMPROVEMENT_PREDICTORS = ['improvement_time_sum', 'improvement_click_sum']

results_i = []   # not used in this cell; kept so any later cell reading it still works
results_h3 = []  # one fitted OLS result per survey column, in column order

# Outlier removal using Mahalanobis distance.
# Nothing here depends on the survey column, so it runs once rather than once
# per loop iteration.
data_mean = np.mean(improvement, axis=0)
cov_matrix = np.cov(improvement.values.T)
inv_cov_matrix = np.linalg.inv(cov_matrix)  # raises LinAlgError if the covariance is singular
mahal_distances = improvement.apply(
    mahalanobis_distance, axis=1, args=(data_mean, inv_cov_matrix)
)
threshold = np.percentile(mahal_distances, OUTLIER_PERCENTILE)
improvement_filtered = improvement[mahal_distances < threshold]

for survey_column in survey.columns:

    # Survey answers of the retained rows, without missing answers.
    survey_tmp = survey.loc[improvement_filtered.index, [survey_column]].dropna()
    # Keep exactly the same rows on the predictor side; otherwise a survey
    # column with missing answers gives OLS a target and a design matrix of
    # different lengths.
    improvement_complete = improvement_filtered.loc[survey_tmp.index]

    # Normalize Data (fit_transform refits the scaler on each call, so reusing
    # one scaler object for predictors and target is safe).
    min_max_scaler = MinMaxScaler()
    improvement_scaled = pd.DataFrame(
        min_max_scaler.fit_transform(improvement_complete),
        columns=improvement_complete.columns,
    )
    survey_tmp_scaled = min_max_scaler.fit_transform(survey_tmp)

    # Model Fitting: OLS with an intercept and HC3 robust standard errors.
    improvement_scaled = sm.add_constant(improvement_scaled[IMPROVEMENT_PREDICTORS])
    model_filtered = sm.OLS(survey_tmp_scaled, improvement_scaled)
    results_robust = model_filtered.fit(cov_type='HC3')

    # Append Results
    results_h3.append(results_robust)

    # Statistical Tests
    residuals = results_robust.resid
    # Ramsey RESET on powers of the fitted values (linearity).
    linearity_p_value = linear_reset(results_robust, power=2, test_type='fitted').pvalue
    # D'Agostino-Pearson omnibus test on the residuals (normality).
    normality_p_value = stats.normaltest(residuals).pvalue
    # Breusch-Pagan; element 1 of the returned tuple is the LM-test p-value.
    homoscedasticity_p_value = het_breuschpagan(residuals, results_robust.model.exog)[1]
    # Durbin-Watson is a statistic in [0, 4], not a p-value: about 2 means no
    # first-order autocorrelation, so it is checked against a band around 2
    # instead of against the significance level.
    independence_statistic = durbin_watson(residuals)

    # Interpretation of Results
    assumptions_hold = (
        linearity_p_value > ALPHA
        and normality_p_value > ALPHA
        and homoscedasticity_p_value > ALPHA
        and DW_LOWER < independence_statistic < DW_UPPER
    )
    if assumptions_hold:
        print(f"Column: {survey_column}")
        print("\nInterpretation:")
        print("Linearity: Satisfied")
        print("Normality: Satisfied")
        print("Homoscedasticity: Satisfied")
        print("Independence: Satisfied")
        print("=" * 80)
        print(results_robust.summary())
