In [None]:
# QUESTION 1

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Define a function that simulates one dataset using fixed distribution parameters
def random_data(size=2500, rng=None):
    """Simulate one dataset of `size` rows with fixed distribution parameters.

    Parameters
    ----------
    size : int, default 2500
        Number of rows to simulate.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state
        (the ``np.random`` module functions) is used, so ``np.random.seed``
        still controls the result exactly as before. Pass
        ``np.random.default_rng(seed)`` for a reproducible, isolated stream.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:
        'age'      - integers drawn uniformly from 1..100, with replacement
        'success'  - geometric draws with p = 0.30
        'accident' - binomial draws with n = 10, p = 0.60
        'events'   - Poisson draws with lam = 4
        'time'     - exponential draws with scale = 1/7 (i.e. rate 7)
        'income'   - normal draws with mean 10 and standard deviation 2
    """
    # Fall back to the module-level functions so existing callers (and any
    # np.random.seed call made beforehand) behave exactly as they did.
    if rng is None:
        rng = np.random

    # 'age' is sampled at random with replacement from 1..100; it is NOT a
    # repeated 1..100 sequence, so the number of rows per age varies by run.
    age = rng.choice(np.arange(1, 101), size=size, replace=True)

    # Remaining variables use fixed parameters. The draw order is kept stable
    # so a given RNG state always produces the same frame.
    success = rng.geometric(p=0.30, size=size)
    accident = rng.binomial(n=10, p=0.60, size=size)
    events = rng.poisson(lam=4, size=size)
    time = rng.exponential(scale=1/7, size=size)
    income = rng.normal(loc=10, scale=2, size=size)

    # Assemble the simulated columns into a single frame
    data = pd.DataFrame({
        'age': age,
        'success': success,
        'accident': accident,
        'events': events,
        'time': time,
        'income': income
    })

    return data

In [8]:
# Function to split data by 'age' and compute mean for each age

def split_and_compute_mean(data):
    """Average every column within each distinct value of 'age'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains an 'age' column plus the columns to average.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct 'age' values; each remaining column holds
        the mean of that column over the rows sharing the age.
    """
    rows_by_age = data.groupby(by='age')
    return rows_by_age.mean()

In [12]:
# Function to replicate the entire procedure 100 times and compute overall mean
def replicate_procedure(replications=100):
    """Repeat simulate -> per-age mean `replications` times and average the results.

    Each replication calls random_data() with its default size and reduces
    the frame with split_and_compute_mean(). The per-replication tables are
    stacked and rows sharing the same index label (the age) are averaged, so
    the result is a mean of per-replication means for each age.

    Parameters
    ----------
    replications : int, default 100
        Number of independent datasets to simulate. Must be at least 1,
        since pd.concat rejects an empty list.

    Returns
    -------
    pandas.DataFrame
        One row per age, one column per simulated variable.
    """
    # One per-age mean table per simulated dataset
    per_run_means = [
        split_and_compute_mean(random_data())
        for _ in range(replications)
    ]

    # Stack the tables, then average the rows that share an age label
    stacked = pd.concat(per_run_means)
    return stacked.groupby(level=0).mean()


In [57]:
# Running the data generation and replication procedure
if __name__ == "__main__":
    # NOTE: the numbers printed here change on every run unless NumPy's
    # global random state is seeded (np.random.seed) before this cell.

    # Generate one dataset and print it
    data = random_data()
    print("Generated Dataset:")
    print(data)

    # Summarise the same frame that was just printed, so the statistics
    # describe the rows shown above rather than an unrelated second draw.
    print("\nGenerated Statistics:")
    print(data.describe())

    # Running the replication procedure
    overall_mean = replicate_procedure()
    print("\nOverall Mean after Replication:")
    print(overall_mean)


Generated Dataset:
      age  success  accident  events      time     income
0      71        2         6       3  0.013653  11.800730
1      77        5         8       4  0.196568  11.623282
2      27        3         6       7  0.008520  11.009765
3      87        2         5       2  0.039298   9.540502
4      82        3         5       3  0.246127   7.150790
...   ...      ...       ...     ...       ...        ...
2495   80        1         4       4  0.079880  11.110601
2496    2        4         7       7  0.122050  11.075364
2497    5       14         7       3  0.030829   8.345092
2498   57        1         7       5  0.399290   9.654622
2499    2        1         8       3  0.110846  13.103193

[2500 rows x 6 columns]

Generated Statistics:
               age      success     accident       events         time  \
count  2500.000000  2500.000000  2500.000000  2500.000000  2500.000000   
mean     50.566000     3.316800     5.953200     4.009600     0.144124   
std      28.751

In [None]:
# QUESTION 2

In [19]:
# Creating a function to generate dataset with variable parameters

# defining the function

def variable_data(size=2500, rng=None):
    """Simulate one dataset whose distribution parameters are themselves random.

    A single value of each distribution parameter is drawn per call (not per
    row), and all `size` rows of that column then share it.

    Parameters
    ----------
    size : int, default 2500
        Number of rows to simulate.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state
        (the ``np.random`` module functions) is used, so ``np.random.seed``
        still controls the result exactly as before. Pass
        ``np.random.default_rng(seed)`` for a reproducible, isolated stream.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:
        'age'      - integers drawn uniformly from 1..100, with replacement
        'success'  - geometric draws, p ~ Uniform(0.2, 0.3)
        'accident' - binomial draws with n = 10, p ~ Uniform(0.4, 0.7)
        'events'   - Poisson draws, lam ~ Uniform(3, 5)
        'time'     - exponential draws with scale = 1/rate, rate ~ Uniform(0.3, 0.7)
        'income'   - normal draws, mean ~ Uniform(10, 20), standard deviation 2
    """
    # Fall back to the module-level functions so existing callers (and any
    # np.random.seed call made beforehand) behave exactly as they did.
    if rng is None:
        rng = np.random

    # 'age' is sampled uniformly at random, with replacement, from 1..100
    age = rng.choice(np.arange(1, 101), size=size, replace=True)

    # For each remaining variable: draw its parameter first, then the sample.
    # The draw order is kept stable so a given RNG state always produces the
    # same frame.
    success_p = rng.uniform(0.2, 0.3)
    success = rng.geometric(p=success_p, size=size)

    accident_p = rng.uniform(0.4, 0.7)
    accident = rng.binomial(n=10, p=accident_p, size=size)

    events_lambda = rng.uniform(3, 5)  # Assuming a range for Poisson lambda
    events = rng.poisson(lam=events_lambda, size=size)

    # NumPy parameterises the exponential by scale, i.e. 1 / rate
    time_lambda = rng.uniform(0.3, 0.7)
    time = rng.exponential(scale=1/time_lambda, size=size)

    income_mean = rng.uniform(10, 20)
    income = rng.normal(loc=income_mean, scale=2, size=size)

    # Assemble the simulated columns into a single frame
    data = pd.DataFrame({
        'age': age,
        'success': success,
        'accident': accident,
        'events': events,
        'time': time,
        'income': income
    })

    return data


In [21]:
# Function to split data by 'age' and compute mean for each age
def split_and_compute_mean(data):
    """Return the per-age mean of every other column in `data`.

    This re-defines (and therefore shadows) the function of the same name
    from the Question 1 section; the logic is the same.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains an 'age' column plus the columns to average.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct 'age' values, one column per remaining
        input column, holding that column's mean for the age.
    """
    return data.groupby(by='age').mean()

In [25]:
# Function to replicate the entire procedure 100 times and compute overall mean
def replicate_procedure_variable(replications=100):
    """Repeat simulate -> per-age mean `replications` times using variable_data().

    Each replication calls variable_data() with its default size and reduces
    the frame with split_and_compute_mean(). The per-replication tables are
    stacked and rows sharing the same index label (the age) are averaged, so
    the result is a mean of per-replication means for each age.

    Parameters
    ----------
    replications : int, default 100
        Number of independent datasets to simulate. Must be at least 1,
        since pd.concat rejects an empty list.

    Returns
    -------
    pandas.DataFrame
        One row per age, one column per simulated variable.
    """
    # One per-age mean table per simulated dataset
    per_run_means = [
        split_and_compute_mean(variable_data())
        for _ in range(replications)
    ]

    # Stack the tables, then average the rows that share an age label
    stacked = pd.concat(per_run_means)
    return stacked.groupby(level=0).mean()

In [53]:
# Running the data generation and replication procedure with variable parameters
if __name__ == "__main__":
    # NOTE: the numbers printed here change on every run unless NumPy's
    # global random state is seeded (np.random.seed) before this cell.

    # Generate one dataset with variable parameters and print it
    data = variable_data()
    print("Generated Dataset with Variable Parameters:")
    print(data)

    # Summarise the same frame that was just printed, so the statistics
    # describe the rows shown above rather than an unrelated second draw
    # (which would also use different randomly chosen parameters).
    print("\nGenerated Dataset statistics:")
    print(data.describe())

    # Running the replication procedure with variable parameters
    overall_mean_variable = replicate_procedure_variable()
    print("\nOverall Mean after Replication with Variable Parameters:")
    print(overall_mean_variable)


Generated Dataset with Variable Parameters:
      age  success  accident  events      time     income
0      92        7         7       5  0.459593  18.675449
1      91        4         8       0  4.292134  20.451837
2      40        1         8       4  4.196458  19.210888
3      93        1         6       3  0.888880  19.088646
4      82       11         6       2  1.007455  20.898062
...   ...      ...       ...     ...       ...        ...
2495   39        3         7       3  3.233668  22.337687
2496   21        1         6       1  0.716899  18.629122
2497   25        8         9       8  0.510788  17.904729
2498   44        7         6       3  0.683715  18.823504
2499   26        6        10       4  2.784445  18.964468

[2500 rows x 6 columns]

Generated Dataset statistics:
               age      success     accident       events         time  \
count  2500.000000  2500.000000  2500.000000  2500.000000  2500.000000   
mean     51.048800     4.101200     6.952400     3.83520