In [1]:
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [None]:
df = pd.read_csv("mistral_instruction_induction_no_shots.csv", na_values=np.nan)
print("Are there NaN values: ", df.isna().any().sum() is True)
df.head()

Are there NaN values:  False


Unnamed: 0,task,pnum,iteration,score
0,word_in_context,0,1,0.54
1,word_in_context,0,2,0.52
2,word_in_context,0,3,0.48
3,word_in_context,0,4,0.55
4,word_in_context,0,5,0.54


In [40]:
# Base case (pnum=0) statistics
base_case_stats = df[df['pnum'] == 0].groupby('task')['score'].agg(['mean', 'std'])

# Compute the same statistics for each other pnum (pnum > 0)
modified_stats = df[df['pnum'] > 0].groupby(['task', 'pnum'])['score'].agg(['mean', 'std'])

# Display the statistics for comparison
print("pnum==0")
print(base_case_stats)

print("pnum>0")
print(modified_stats)

pnum==0
                             mean       std
task                                       
active_to_passive        0.310000  0.018708
antonyms                 0.402000  0.043243
cause_and_effect         0.520000  0.126491
common_concept           0.009804  0.011172
diff                     0.678000  0.014832
first_word_letter        0.982000  0.008367
informal_to_formal       0.331919  0.033609
larger_animal            0.604000  0.021909
letters_list             0.056000  0.008944
negation                 0.316000  0.008944
num_to_verbal            0.636000  0.008944
orthography_starts_with  0.066000  0.023022
rhymes                   0.082000  0.021679
second_word_letter       0.222000  0.020494
sentence_similarity      0.304000  0.043932
sentiment                0.872000  0.021679
singular_to_plural       0.738000  0.025884
sum                      0.658000  0.029496
synonyms                 0.084000  0.016733
taxonomy_animal          0.054000  0.018166
translation_en-de       

In [41]:
# Calculate the differences between base case (pnum=0) and modified prompts (pnum > 0) for each task
# Merge base case with modified stats to compute differences
base_scores = df[df['pnum'] == 0][['task', 'iteration', 'score']].rename(columns={'score': 'base_score'})

# Merge with original dataframe (pnum > 0)
diff_df = pd.merge(df[df['pnum'] > 0], base_scores, on=['task', 'iteration'], how='inner')

# Calculate the difference between modified and base scores
diff_df['score_diff'] = diff_df['score'] - diff_df['base_score']

# Display the first few rows to check the differences
diff_df[['task', 'pnum', 'iteration', 'score', 'base_score', 'score_diff']].head(10)


Unnamed: 0,task,pnum,iteration,score,base_score,score_diff
0,word_in_context,1,1,0.52,0.54,-0.02
1,word_in_context,2,1,0.5,0.54,-0.04
2,word_in_context,3,1,0.51,0.54,-0.03
3,word_in_context,4,1,0.46,0.54,-0.08
4,word_in_context,5,1,0.55,0.54,0.01
5,word_in_context,6,1,0.58,0.54,0.04
6,word_in_context,7,1,0.49,0.54,-0.05
7,word_in_context,8,1,0.43,0.54,-0.11
8,word_in_context,9,1,0.47,0.54,-0.07
9,word_in_context,10,1,0.49,0.54,-0.05


In [61]:
from scipy.stats import ttest_rel
from statsmodels.stats.power import TTestPower

# Cohen's d for independent samples
def cohen_d(x, y):
    # Mean difference between the two groups
    mean_diff = np.mean(x) - np.mean(y)
    
    # Pooled standard deviation
    n_x, n_y = len(x), len(y)
    pooled_std = np.sqrt(((n_x - 1) * np.std(x, ddof=1) ** 2 + (n_y - 1) * np.std(y, ddof=1) ** 2) / (n_x + n_y - 2))
    
    # Cohen's d
    return mean_diff / pooled_std

# Statistical power calculation
def statistical_power(base_scores: list, modified_scores: list, alpha=0.05):
    # Calculate Cohen's d
    d = cohen_d(modified_scores, base_scores)

    # power analysis
    power_analysis = TTestPower()
    sample_size = len(base_scores) + len(modified_scores)  # Total sample size for both groups
    return power_analysis.power(effect_size=d, nobs=sample_size, alpha=alpha, alternative='two-sided')

    



# Initialize an empty list to store results
results = []
# threshold for significance
alpha = 0.05

# Group by task and pnum to perform the paired t-test
for (task, pnum), group in diff_df.groupby(['task', 'pnum']):
    base_scores = group['base_score']
    modified_scores = group['score']
    
    # paired t-test
    t_stat, p_value = ttest_rel(modified_scores, base_scores)

    # power analysis
    power = statistical_power(base_scores=base_scores, modified_scores=modified_scores)
    
    # Store the results
    results.append({'task': task, 'pnum': pnum, 't_stat': t_stat, 'p_value': p_value, 'power': power})

# Convert results into a dataframe for easier viewing
ttest_results_df = pd.DataFrame(results)

non_significant_df = ttest_results_df[ttest_results_df['p_value'] > alpha]
print(f"There are {len(non_significant_df)} non-significant pnum-tasks combinations on a total of {len(ttest_results_df)} ({100*len(non_significant_df)/len(ttest_results_df):.2f}%)")

# Display the results
ttest_results_df.sort_values(by='power', ascending=True).head(15)


There are 112 non-significant pnum-tasks combinations on a total of 240 (46.67%)


Unnamed: 0,task,pnum,t_stat,p_value,power
117,orthography_starts_with,8,2.745223e-16,1.0,0.05
74,larger_animal,5,0.0,1.0,0.05
143,sentence_similarity,4,-0.08347839,0.937482,0.053023
20,cause_and_effect,1,-0.09829464,0.926427,0.053762
130,second_word_letter,1,-0.1034175,0.922609,0.054188
197,taxonomy_animal,8,0.1727737,0.871219,0.058162
145,sentence_similarity,6,0.2857143,0.789282,0.059913
35,common_concept,6,0.2048393,0.847699,0.062501
225,translation_en-fr,6,-0.2242305,0.833566,0.063969
142,sentence_similarity,3,-0.2544567,0.811689,0.069207


In [62]:
from scipy.stats import ttest_ind
from pprint import pprint

# 1. Calculate mean and standard deviation for all pnum == 0 (base case)
base_scores = df[df['pnum'] == 0]['score']
base_mean = base_scores.mean()
base_std = base_scores.std()

# 2. Calculate mean and standard deviation for all pnum > 0 (modified cases)
modified_scores = df[df['pnum'] > 0]['score']
modified_mean = modified_scores.mean()
modified_std = modified_scores.std()

# 3. Perform independent two-sample t-test
t_stat, p_value = ttest_ind(base_scores, modified_scores, equal_var=False)  # Welch's t-test
power = statistical_power(base_scores=base_scores, modified_scores=modified_scores)

# 4. Check significance at alpha = 0.05
alpha = 0.05
significant = p_value < alpha

# 5. Output results
results = {
    'base_mean': base_mean,
    'base_std': base_std,
    'modified_mean': modified_mean,
    'modified_std': modified_std,
    't_stat': t_stat,
    'p_value': p_value,
    'significant': significant,
    'power': power
}

# Display the results
pprint(results)

{'base_mean': 0.3924051348737096,
 'base_std': 0.2728550474864339,
 'modified_mean': 0.33392197945239094,
 'modified_std': 0.2625328608703769,
 'p_value': 0.02623052485174041,
 'power': 1.0,
 'significant': True,
 't_stat': 2.246277411265254}


In [64]:
# 1. Calculate mean scores for each pnum > 0
pnum_means = df[df['pnum'] > 0].groupby('pnum')['score'].mean()

# 2. Identify the pnum with the highest mean score
best_pnum = pnum_means.idxmax()

# 3. Calculate mean and std for pnum == 0 (base case)
base_scores = df[df['pnum'] == 0]['score']
base_mean = base_scores.mean()
base_std = base_scores.std()

# 4. Calculate mean and std for the pnum with the highest mean score
best_pnum_scores = df[df['pnum'] == best_pnum]['score']
best_pnum_mean = best_pnum_scores.mean()
best_pnum_std = best_pnum_scores.std()

# 5. Perform independent two-sample t-test between pnum == 0 and the best pnum
t_stat, p_value = ttest_ind(base_scores, best_pnum_scores, equal_var=False)  # Welch's t-test
power = statistical_power(base_scores=base_scores, modified_scores=best_pnum_scores)

# 6. Check significance at alpha = 0.05
alpha = 0.05
significant = p_value < alpha

# 7. Output results
results = {
    'base_mean': base_mean,
    'base_std': base_std,
    'best_pnum': best_pnum,
    'best_pnum_mean': best_pnum_mean,
    'best_pnum_std': best_pnum_std,
    't_stat': t_stat,
    'p_value': p_value,
    'significant': significant,
    'power': power
}

# Display the results
pprint(results)


{'base_mean': 0.3924051348737096,
 'base_std': 0.2728550474864339,
 'best_pnum': 2,
 'best_pnum_mean': 0.36823392539631905,
 'best_pnum_std': 0.27792442756518954,
 'p_value': 0.4972655741214477,
 'power': 0.2728027424168394,
 'significant': False,
 't_stat': 0.6798417953260264}
