In [None]:
# Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Upload the binary-coded matrix (interview data) into the notebook session.
# NOTE: google.colab is presumably only importable inside a Google Colab runtime,
# so this cell is expected to fail elsewhere - confirm before running locally.
from google.colab import files
# The return value is kept in `uploaded` but is not read again in this notebook;
# the workbook is later opened by its file name instead.
uploaded = files.upload()

In [None]:
# Load the binary-coded matrix. Expected layout (per the notebook's convention):
# interview IDs in the first column, one column per code with the code names in
# the header row. The file name is hard-coded, so the workbook must be available
# under exactly this name.
df = pd.read_excel('S3. Binary_coded_matrix_data.xlsx')
# Preview the first rows (DataFrame.head() shows 5 rows by default).
df.head()

In [None]:
# Column 0 identifies the interview; every remaining column is one code.
interview_ids = df.iloc[:, 0].values
total_interviews = df.shape[0]       # one row per interview ID extracted above
total_codes = len(df.columns) - 1    # all columns except the interview-ID column

# Function to calculate the saturation curve
def calculate_saturation_curve(interview_sequence, dataframe=None):
    """Return the cumulative percentage of codes discovered after each interview.

    Parameters
    ----------
    interview_sequence : iterable
        Interview IDs, in the order in which the interviews are processed.
    dataframe : pandas.DataFrame, optional
        Binary-coded matrix with the interview IDs in the first column and one
        column per code (a cell equal to 1 means the code is present).
        Defaults to the notebook-level ``df``.

    Returns
    -------
    list of float
        One percentage (0-100) per entry of ``interview_sequence``: the share
        of code columns seen at least once up to and including that interview.

    Raises
    ------
    ZeroDivisionError
        If the matrix has no code columns and the sequence is not empty.
    """
    if dataframe is None:
        dataframe = df  # fall back to the matrix loaded earlier in the notebook

    # These slices do not change between iterations, so take them once.
    id_column = dataframe.iloc[:, 0]
    code_present = dataframe.iloc[:, 1:].to_numpy() == 1
    n_codes = code_present.shape[1]

    discovered_codes = set()
    saturation_percentages = []

    for interview_id in interview_sequence:
        matches = (id_column == interview_id).to_numpy(dtype=bool, na_value=False)
        # Collapse the matching row(s) column-wise. Flattening several rows
        # would shift the positions of every row after the first and count the
        # same code more than once when an ID occurs on multiple rows.
        present_positions = np.flatnonzero(code_present[matches].any(axis=0))
        discovered_codes.update(present_positions.tolist())
        # Percentage of code columns discovered so far.
        saturation_percentages.append(len(discovered_codes) / n_codes * 100)

    return saturation_percentages


# Saturation curve for the interviews in their original order (the order of the
# rows in the data file): one cumulative percentage per interview.
saturation_curve = calculate_saturation_curve(interview_ids)

#Function for finding the number of interviews needed to reach a saturation threshold
def find_saturation_point(curve, threshold):
    """Return the 1-based position of the first value in ``curve`` that is
    greater than or equal to ``threshold``, or ``None`` if no value qualifies."""
    # Counting starts at 1 because interview counts are 1-based.
    qualifying_counts = (
        interview_count
        for interview_count, pct in enumerate(curve, start=1)
        if pct >= threshold
    )
    return next(qualifying_counts, None)

#Determine when 80% and 90% saturation is reached
saturation_80 = find_saturation_point(saturation_curve, 80)
saturation_90 = find_saturation_point(saturation_curve, 90)

# (threshold in %, interview count at which it is first reached or None, line colour)
threshold_markers = [
    (80, saturation_80, 'blue'),
    (90, saturation_90, 'orange'),
]

#Plot the saturation curve
fig, ax = plt.subplots(figsize=(12, 6))
x_values = np.arange(1, len(saturation_curve) + 1)   # 1-based interview counts
# NOTE(review): the curve is the cumulative percentage of codes discovered, so
# the legend label 'New Codes' may be misleading - confirm the intended wording.
ax.plot(x_values, saturation_curve, color='black', label='New Codes')

#Vertical lines for saturation thresholds if found
for threshold, reached_at, colour in threshold_markers:
    if reached_at is not None:
        ax.axvline(reached_at, color=colour, linestyle='--', label=f'{threshold}% Saturation')

#labels and legend
ax.set_xlabel('Number of Interviews')
ax.set_ylabel('Percentage of Codes Discovered')
ax.set_title('Code Saturation Curve')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

#Print summary
print("Summary Table:")
for threshold, reached_at, colour in threshold_markers:
    if reached_at is not None:
        print(f"Interviews needed to reach {threshold}% saturation: {reached_at}")
    else:
        # Say so explicitly instead of leaving the summary silently empty.
        print(f"{threshold}% saturation was not reached within {len(saturation_curve)} interviews")



In [None]:
#Function to calculate the number of new codes discovered per interview

def calculate_new_codes(dataframe, interview_sequence):
    """Count how many previously unseen codes each interview contributes.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Binary-coded matrix with the interview IDs in the first column and one
        column per code (a cell equal to 1 means the code is present).
    interview_sequence : iterable
        Interview IDs, in the order in which the interviews are processed.

    Returns
    -------
    list of int
        For each entry of ``interview_sequence``, the number of codes present
        in that interview that were not present in any earlier entry.
    """
    # These slices do not change between iterations, so take them once.
    id_column = dataframe.iloc[:, 0]
    code_present = dataframe.iloc[:, 1:].to_numpy() == 1

    discovered_codes = set()
    new_codes_per_interview = []

    for interview_id in interview_sequence:
        matches = (id_column == interview_id).to_numpy(dtype=bool, na_value=False)
        # Collapse the matching row(s) column-wise. Flattening several rows
        # would shift the positions of every row after the first and count the
        # same code more than once when an ID occurs on multiple rows.
        current_codes = set(np.flatnonzero(code_present[matches].any(axis=0)).tolist())

        # Only codes not seen in any earlier interview count as new.
        new_codes_per_interview.append(len(current_codes - discovered_codes))

        discovered_codes |= current_codes

    return new_codes_per_interview

#New codes per interview, with the interviews in their actual order
actual_order = interview_ids
actual_new_codes = calculate_new_codes(df, actual_order)

#Fix NumPy's global random state so the shuffled order below is reproducible
np.random.seed(122)

#New codes per interview, with the interviews in a shuffled order
random_order = np.random.permutation(interview_ids)
random_new_codes = calculate_new_codes(df, random_order)

#Bar geometry: one slot per interview (0-based), two bars side by side per slot
x_positions = np.arange(total_interviews)
bar_width = 0.35
offset = bar_width / 2

fig, ax = plt.subplots(figsize=(15, 7))

#Left bar of each pair: actual order; right bar: shuffled order
ax.bar(x_positions - offset, actual_new_codes, bar_width, alpha=0.7, label='Actual Order', color='blue')
ax.bar(x_positions + offset, random_new_codes, bar_width, alpha=0.7, label='Random Order', color='grey')

#Tick labels are the 1-based interview numbers
ax.set_xticks(x_positions)
ax.set_xticklabels(np.arange(1, total_interviews + 1))

ax.set_xlabel('Interview Number')
ax.set_ylabel('Number of New Codes')
ax.set_title('New Codes Discovered per Interview')
ax.legend()
ax.grid(axis='y')

fig.tight_layout()
plt.show()

#Number of code columns that contain a 1 in at least one interview
total_unique_codes = df.iloc[:, 1:].eq(1).any(axis=0).sum()
print(f"Total number of unique codes: {total_unique_codes}")



In [None]:
#Define the number of iterations
num_iterations = 10000

#Extract interview IDs and determine dimensions (uncomment if sequential saturation curve has not been run)
#interview_ids = df.iloc[:, 0].values
#total_interviews = len(interview_ids)
#total_codes = df.shape[1] - 1

#Lists to store the number of interviews needed to reach 80% and 90% saturation
saturation_80_list = []
saturation_90_list = []

#Build the interview -> code-positions lookup once. Filtering the DataFrame
#inside the bootstrap loop would repeat the same work for every sampled interview.
#Rows sharing an interview ID are merged, so each code is counted once per interview.
code_present = df.iloc[:, 1:].to_numpy() == 1
codes_by_interview = {}
for row_id, row in zip(df.iloc[:, 0].values, code_present):
    codes_by_interview.setdefault(row_id, set()).update(np.flatnonzero(row).tolist())

#Run bootstrap iterations
#NOTE: np.random.choice draws from NumPy's global random state, which is seeded in
#the earlier cell that shuffles the interview order. The draws are therefore only
#repeatable when the notebook is run top to bottom; re-running this cell alone
#continues the random stream and gives different samples.
for _ in range(num_iterations):
    sampled_ids = np.random.choice(interview_ids, total_interviews, replace=True)   #sample interviews with replacement
    discovered_codes = set()
    reached_80 = False
    reached_90 = False

    #Track saturation as interviews accumulate
    for i, interview_id in enumerate(sampled_ids):
        discovered_codes.update(codes_by_interview.get(interview_id, ()))
        saturation_pct = len(discovered_codes) / total_codes * 100

        #Record the first interview where 80% saturation is reached
        if not reached_80 and saturation_pct >= 80:
            saturation_80_list.append(i + 1)
            reached_80 = True

        #Record the first interview where 90% saturation is reached
        if saturation_pct >= 90:
            saturation_90_list.append(i + 1)
            reached_90 = True
            break   #reaching 90% implies 80% is already recorded, so nothing is left to track

#A sample that never reaches a threshold adds nothing to the lists above, so any
#statistic computed from them covers only the samples that did reach it.
#Report how many samples were left out instead of dropping them silently.
for level, reached_counts in ((80, saturation_80_list), (90, saturation_90_list)):
    never_reached = num_iterations - len(reached_counts)
    if never_reached:
        print(f"Note: {never_reached} of {num_iterations} bootstrap samples never reached "
              f"{level}% saturation and are not included in the {level}% results.")

#Function to plot curves
def plot_ecdf(data, label):
    """Plot the empirical cumulative distribution of ``data`` with pyplot.

    The x values are the sorted observations; the y value of the k-th smallest
    observation is k / len(data) expressed as a percentage. ``label`` is passed
    on as the line's legend label.
    """
    ordered = np.sort(data)
    n_points = len(ordered)
    cumulative_pct = np.arange(1, n_points + 1) / n_points * 100
    plt.plot(ordered, cumulative_pct, label=label, linewidth=2)

#ECDF figure for the 80% and 90% saturation thresholds
fig, ax = plt.subplots(figsize=(10, 6))

#plot_ecdf draws through pyplot, i.e. onto the axes created just above
plot_ecdf(saturation_80_list, '80% Saturation')
plot_ecdf(saturation_90_list, '90% Saturation')

ax.set_xlabel("Number of Interviews")
ax.set_ylabel("Percentage of Bootstrap Samples")
ax.set_title("ECDF of Interviews Needed to Reach Code Saturation")
ax.grid(True)
ax.legend()
ax.set_ylim(0, 100)   #the y axis is a percentage
fig.tight_layout()
plt.show()

#summary statistics
def print_bounds(data, level):
    """Print the median and the 5th/95th percentiles of ``data``.

    Parameters
    ----------
    data : sequence of numbers
        Observations to summarise (here: interviews needed to reach a
        saturation threshold, one value per bootstrap sample that reached it).
    level : int or str
        Saturation level shown in the heading, e.g. ``80``.

    Notes
    -----
    The statistics are converted with ``int()``, which truncates toward zero,
    so fractional medians and percentiles are reported rounded down.
    An empty ``data`` prints a short notice instead of statistics.
    """
    data = np.asarray(data)
    print(f"{level}% Saturation:")

    if data.size == 0:
        # Nothing to summarise: the median of an empty array is NaN and
        # int(NaN) raises, so report the situation rather than crash.
        print("  No samples reached this saturation level\n")
        return

    median = int(np.median(data))
    p5 = int(np.percentile(data, 5))
    p95 = int(np.percentile(data, 95))
    print(f"  Median = {median}")
    print(f"  5th Percentile = {p5}")
    print(f"  95th Percentile = {p95}\n")

#Print the summary for each saturation level in turn (80% first, then 90%)
for level, interview_counts in ((80, saturation_80_list), (90, saturation_90_list)):
    print_bounds(interview_counts, level)

