In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# Seed NumPy's global RNG so the synthetic data set is reproducible
np.random.seed(42)

# Size of the simulated population of groups
num_groups = 1000

# Labels run "Group 1" .. "Group <num_groups>"
group_names = [f"Group {position}" for position in range(1, num_groups + 1)]

# One type label per group, drawn with the listed weights ("C" is the most likely)
type_labels = ["A", "B", "C"]
type_weights = [0.05, 0.15, 0.80]
group_types = np.random.choice(type_labels, size=num_groups, p=type_weights)

# Member counts: normal draw truncated to int, then floored at 5000
member_draws = np.random.normal(loc=35000, scale=25000, size=num_groups)
total_members = np.maximum(member_draws.astype(int), 5000)

# Per-type success probability for the binomial post draw below
post_probabilities = {"A": 0.40, "B": 0.15, "C": 0.10}

# Posts per group: binomial with n = 20% of the members (truncated to int)
estimated_posts = []
for member_count, gtype in zip(total_members, group_types):
    trial_count = int(member_count * 0.20)
    estimated_posts.append(
        np.random.binomial(n=trial_count, p=post_probabilities[gtype])
    )

# Mean / std of the per-post comment draw, keyed by group type
comment_params = {
    "A": {"mean": 6, "std": 10},
    "B": {"mean": 5, "std": 7},
    "C": {"mean": 3, "std": 4},
}

# Comments per group: one normal draw per post, truncated to int, then summed.
# NOTE(review): these draws are not clipped at zero, so an individual post can
# contribute a negative count -- confirm whether that is intended.
estimated_comments = []
for gtype, post_count in zip(group_types, estimated_posts):
    spec = comment_params[gtype]
    per_post = np.random.normal(loc=spec["mean"], scale=spec["std"], size=post_count)
    estimated_comments.append(per_post.astype(int).sum())

# Mean / std of the reaction draws, keyed by group type
post_reaction_params = {
    "A": {"mean": 10, "std": 15},
    "B": {"mean": 5, "std": 12},
    "C": {"mean": 3, "std": 5},
}
comment_reaction_params = {
    "A": {"mean": 4, "std": 4},
    "B": {"mean": 2, "std": 3},
    "C": {"mean": 1, "std": 3},
}

# Reactions per group = reactions on posts + reactions on comments.
# The draw order (posts first, then comments, group by group) is kept fixed
# because it determines the values produced by the seeded RNG.
estimated_reactions = []
for gtype, post_count, comment_count in zip(group_types, estimated_posts, estimated_comments):
    post_spec = post_reaction_params[gtype]
    comment_spec = comment_reaction_params[gtype]

    # One draw per post, truncated to int (not clipped at zero)
    reactions_on_posts = np.random.normal(
        loc=post_spec["mean"], scale=post_spec["std"], size=post_count
    ).astype(int).sum()

    # One draw for each of 10% of the comments; negatives are clipped to zero
    reacted_comment_count = int(0.1 * comment_count)
    reactions_on_comments = np.random.normal(
        loc=comment_spec["mean"], scale=comment_spec["std"], size=reacted_comment_count
    ).clip(min=0).astype(int).sum()

    estimated_reactions.append(reactions_on_posts + reactions_on_comments)

# Engagement %: all activity (reactions + comments + posts) over members * 12.
# NOTE(review): the constant 12 is not explained here -- presumably months; confirm.
activity_per_group = (
    np.array(estimated_reactions) + np.array(estimated_comments) + np.array(estimated_posts)
)
engagement_percentage = (activity_per_group / (total_members * 12)) * 100

# One row per simulated group
frame_columns = {
    "Group Name": group_names,
    "Group Type": group_types,
    "Total Members": total_members,
    "Estimated Posts": estimated_posts,
    "Estimated Comments": estimated_comments,
    "Estimated Reactions": estimated_reactions,
    "Engagement %": engagement_percentage,
}

# Order the rows by group type (A, B, C) -- not by group name
df_facebook_groups = pd.DataFrame(frame_columns).sort_values(by="Group Type")

# Persist the table without the index column
df_facebook_groups.to_csv("facebook_groups_simulation.csv", index=False)

# Mean of the per-group engagement percentages, one row per group type
engagement_by_type = (
    df_facebook_groups.groupby("Group Type")["Engagement %"]
    .mean()
    .reset_index()
    .rename(columns={"Engagement %": "Average Engagement %"})
)

# Pooled engagement: total activity over total members * 12.
# This weights large groups more heavily than the per-type means above do.
reaction_sum = df_facebook_groups["Estimated Reactions"].sum()
comment_sum = df_facebook_groups["Estimated Comments"].sum()
post_sum = df_facebook_groups["Estimated Posts"].sum()
member_sum = df_facebook_groups["Total Members"].sum()
total_engagement = (reaction_sum + comment_sum + post_sum) / (member_sum * 12) * 100

# Report both views
print(engagement_by_type)
print(f"\nTotal Engagement %: {total_engagement:.2f}%")

#-------------------------------------------------------#

# Scenario 2, step 1: rescale the post counts from the first run.
# NOTE(review): rec is negative, so 1 - rec == 1.05 and every post count is
# raised by 5% (then truncated to int); if a reduction was intended, the sign
# needs flipping -- confirm.
rec = -0.05
post_factor = 1 - rec
estimated_posts = [int(post_count * post_factor) for post_count in estimated_posts]

# Step 2: scale the comment-reaction mean and std up by 16% for every type
inc = 0.16
baseline_comment_reactions = {"A": (4, 4), "B": (2, 3), "C": (1, 3)}
comment_reaction_params = {
    gtype: {"mean": base_mean * (1 + inc), "std": base_std * (1 + inc)}
    for gtype, (base_mean, base_std) in baseline_comment_reactions.items()
}

# Step 3: re-draw comments for the new post counts (one truncated normal draw
# per post, summed), then apply the same 16% uplift and truncate to int
estimated_comments = []
for gtype, post_count in zip(group_types, estimated_posts):
    spec = comment_params[gtype]
    drawn_total = np.random.normal(
        loc=spec["mean"], scale=spec["std"], size=post_count
    ).astype(int).sum()
    estimated_comments.append(int(drawn_total * (1 + inc)))

# Step 4: re-draw reactions with the updated counts and parameters.
# Draw order (posts, then comments, group by group) is kept fixed because it
# determines the values produced by the seeded RNG.
estimated_reactions = []
for gtype, post_count, comment_count in zip(group_types, estimated_posts, estimated_comments):
    post_spec = post_reaction_params[gtype]
    comment_spec = comment_reaction_params[gtype]

    # One draw per post, truncated to int (not clipped at zero)
    reactions_on_posts = np.random.normal(
        loc=post_spec["mean"], scale=post_spec["std"], size=post_count
    ).astype(int).sum()

    # One draw for each of 10% of the comments; negatives are clipped to zero
    reacted_comment_count = int(0.1 * comment_count)
    reactions_on_comments = np.random.normal(
        loc=comment_spec["mean"], scale=comment_spec["std"], size=reacted_comment_count
    ).clip(min=0).astype(int).sum()

    estimated_reactions.append(reactions_on_posts + reactions_on_comments)

# Engagement %: all activity (reactions + comments + posts) over members * 12
activity_per_group = (
    np.array(estimated_reactions) + np.array(estimated_comments) + np.array(estimated_posts)
)
engagement_percentage = (activity_per_group / (total_members * 12)) * 100

# One row per simulated group, now holding the scenario-2 figures
frame_columns = {
    "Group Name": group_names,
    "Group Type": group_types,
    "Total Members": total_members,
    "Estimated Posts": estimated_posts,
    "Estimated Comments": estimated_comments,
    "Estimated Reactions": estimated_reactions,
    "Engagement %": engagement_percentage,
}

# Order the rows by group type (A, B, C)
df_facebook_groups = pd.DataFrame(frame_columns).sort_values(by="Group Type")

# Persist to a separate file so the first run's CSV is left untouched
df_facebook_groups.to_csv("facebook_groups_simulation_2.csv", index=False)

# Mean of the per-group engagement percentages, one row per group type
engagement_by_type = (
    df_facebook_groups.groupby("Group Type")["Engagement %"]
    .mean()
    .reset_index()
    .rename(columns={"Engagement %": "Average Engagement %"})
)

# Pooled engagement: total activity over total members * 12
reaction_sum = df_facebook_groups["Estimated Reactions"].sum()
comment_sum = df_facebook_groups["Estimated Comments"].sum()
post_sum = df_facebook_groups["Estimated Posts"].sum()
member_sum = df_facebook_groups["Total Members"].sum()
total_engagement = (reaction_sum + comment_sum + post_sum) / (member_sum * 12) * 100

# Report both views
print(engagement_by_type)
print(f"\nTotal Engagement %: {total_engagement:.2f}%")

  Group Type  Average Engagement %
0          A             12.476469
1          B              2.896408
2          C              1.146694

Total Engagement %: 2.09%
  Group Type  Average Engagement %
0          A             14.339017
1          B              3.314835
2          C              1.309133

Total Engagement %: 2.39%


## Hypothesis Testing

In [3]:
def load_data(file_path_1, file_path_2):
    """Read two CSV files and return them as DataFrames.

    Parameters
    ----------
    file_path_1, file_path_2 :
        Locations of the CSV files, passed unchanged to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(df1, df2)`` in the same order as the arguments.
    """
    first_frame, second_frame = (
        pd.read_csv(path) for path in (file_path_1, file_path_2)
    )
    return first_frame, second_frame

def hypothesis_test(df1, df2, column="Engagement %", paired=False, key="Group Name"):
    """Run a two-sided t-test on one column of two DataFrames.

    By default this is Welch's independent-samples t-test
    (``equal_var=False``) on ``column``, which treats the rows of the two
    frames as unrelated samples.

    When both frames describe the same units (for example the same groups
    under two scenarios), pass ``paired=True``: rows are aligned on ``key``
    and a paired t-test is run on the matched values, so row order in the
    inputs does not matter.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that each contain ``column`` (and ``key`` when ``paired``).
    column : str, default "Engagement %"
        Name of the numeric column to compare.
    paired : bool, default False
        If True, align rows on ``key`` and use ``scipy.stats.ttest_rel``.
    key : str, default "Group Name"
        Identifier column used to match rows; only read when ``paired``.

    Returns
    -------
    tuple
        ``(t_stat, p_value)``.

    Raises
    ------
    KeyError
        If ``column`` (or ``key`` when ``paired``) is missing from a frame.
    ValueError
        If ``paired`` is True and fewer than two rows can be matched.
    pandas.errors.MergeError
        If ``paired`` is True and ``key`` is not unique within a frame.
    """
    if not paired:
        t_stat, p_value = stats.ttest_ind(df1[column], df2[column], equal_var=False)
        return t_stat, p_value

    # Inner join keeps only the units present in both frames; one_to_one
    # validation rejects duplicated keys that would create spurious pairs.
    matched = df1[[key, column]].merge(
        df2[[key, column]],
        on=key,
        suffixes=("_1", "_2"),
        validate="one_to_one",
    )
    if len(matched) < 2:
        raise ValueError(
            f"paired test needs at least two rows matched on {key!r}, got {len(matched)}"
        )

    t_stat, p_value = stats.ttest_rel(matched[f"{column}_1"], matched[f"{column}_2"])
    return t_stat, p_value

def interpret_results(t_stat, p_value, alpha=0.05):
    """Print the test statistic, the p-value and a one-line conclusion.

    Parameters
    ----------
    t_stat : float
        Test statistic, printed with four decimals.
    p_value : float
        P-value, printed with four decimals and compared against ``alpha``.
    alpha : float, default 0.05
        Significance level; the null hypothesis is rejected only when
        ``p_value`` is strictly below it.

    Returns
    -------
    None
        The function only prints.
    """
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")

    is_significant = p_value < alpha
    conclusion = (
        "Conclusion: Reject the null hypothesis. There is a significant difference in engagement between Model 1 and Model 2."
        if is_significant
        else "Conclusion: Fail to reject the null hypothesis. No significant difference in engagement between Model 1 and Model 2."
    )
    print(conclusion)

# Compare the two scenario files written by the simulation cell
file_path_1, file_path_2 = (
    "facebook_groups_simulation.csv",
    "facebook_groups_simulation_2.csv",
)

df1, df2 = load_data(file_path_1, file_path_2)

# NOTE(review): both files hold the same simulated groups, while the default
# hypothesis_test call treats them as independent samples -- confirm that an
# unpaired comparison is what is wanted here.
t_stat, p_value = hypothesis_test(df1, df2)
interpret_results(t_stat, p_value)


T-statistic: -2.3984
P-value: 0.0166
Conclusion: Reject the null hypothesis. There is a significant difference in engagement between Model 1 and Model 2.
