<a href="https://colab.research.google.com/github/MK316/workspace/blob/main/ASR02/ASR_stats01b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

Add Gender column:

[data](https://raw.githubusercontent.com/MK316/workspace/main/ASR02/results/data_combined_0520.csv)

In [None]:
# Combined results table, read straight from the project's GitHub repository
url = (
    "https://raw.githubusercontent.com/MK316/workspace/main/"
    "ASR02/results/data_combined_0520.csv"
)
df = pd.read_csv(url)

# Preview the first rows
df.head()

In [None]:
# Keep only the columns used by the WER analysis below
dfw = df.loc[:, ['Subjects', 'Gender', 'Conditions', 'WER', 'WER_lev']]


In [None]:
# Select every row of the subjects whose PRE-condition WER exceeds the threshold.
PRE_WER_THRESHOLD = 0.12


def _pre_wer_above(group, threshold=PRE_WER_THRESHOLD):
    """Return True when the group's first PRE-condition WER is above `threshold`.

    `group` holds the rows of a single subject. A subject without any PRE row
    is excluded (returns False) instead of raising IndexError.
    """
    pre_wer = group.loc[group['Conditions'] == 'PRE', 'WER']
    if pre_wer.empty:
        return False
    return bool(pre_wer.iloc[0] > threshold)


# .copy() makes dfsub an independent frame, so columns can be added to it later
# without pandas warning about writing into a slice of dfw.
dfsub = dfw.groupby('Subjects').filter(_pre_wer_above).copy()

# Print the selected data and its number of rows (rows, not unique subjects)
print(dfsub)
print(len(dfsub['Subjects']))


In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Linear mixed model: WER explained by Conditions and Gender, grouped by subject
model = smf.mixedlm(
    formula="WER ~ Conditions + Gender",
    data=dfsub,
    groups=dfsub["Subjects"],
)
result = model.fit()

# Report the fitted coefficients
print(result.summary())


boxplot by Gender and Conditions

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Boxplot of WER split by condition (x-axis) and gender (hue)
plt.figure(figsize=(8, 6))

# hue="Gender" provides the per-gender split announced in the title; the explicit
# order keeps PRE to the left of POST regardless of the row order in the data.
sns.boxplot(x="Conditions", y="WER", hue="Gender", data=dfsub, order=['PRE', 'POST'])

# Set labels and title
plt.xlabel("Conditions")
plt.ylabel("WER")
plt.ylim(0, 0.6)
plt.title("Boxplot of WER by Gender and Conditions")

# Show the plot
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Notched boxplot of WER for PRE vs POST, written to disk and then displayed
plt.figure(figsize=(7, 4))

# Left-to-right order of the conditions on the x-axis
condition_order = ['PRE', 'POST']

sns.boxplot(data=dfsub, x="Conditions", y="WER", order=condition_order, notch=True)

# Axis labels, y-range and title, applied to the current axes in one call
plt.gca().set(
    xlabel="Conditions",
    ylabel="WER",
    ylim=(0, 0.6),
    title="Boxplot of WER by Conditions",
)

# Save first, then show
plt.savefig("boxplot_WERsub01.png")
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Regression of WER on condition (PRE=0, POST=1), one line per gender.

# Encode the 'Conditions' column into numerical values. assign() rebinds dfsub to a
# new frame instead of writing a column into a frame that may be a slice, and
# astype(str) keeps the lookup working if 'Conditions' has a categorical dtype.
conditions_mapping = {'PRE': 0, 'POST': 1}
dfsub = dfsub.assign(
    Conditions_encoded=dfsub['Conditions'].astype(str).map(conditions_mapping)
)

# lmplot is a figure-level function: it creates its own figure, so a preceding
# plt.figure(figsize=...) only produces an extra empty figure. The size is set
# through height/aspect instead (height 6 in, width 6 * 8/6 = 8 in per facet).
sns.lmplot(x="Conditions_encoded", y="WER", hue="Gender", data=dfsub, height=6, aspect=8 / 6)

# Show the condition names instead of the 0/1 codes on the x-axis
plt.xticks([0, 1], ['PRE', 'POST'])

# Set labels and title
plt.xlabel("Conditions")
plt.ylabel("WER")
plt.title("Linear Regression Plot of WER by Conditions and Gender")

# Show the plot
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Regression of WER on condition, one line per subject.

# Encode 'Conditions' with a fixed order: PRE first, POST second, then any other
# label in order of appearance. Taking the codes from unique() alone makes the
# encoding depend on row order (POST could end up as 0).
present_labels = set(dfsub['Conditions'])
condition_labels = [label for label in ['PRE', 'POST'] if label in present_labels]
condition_labels += [label for label in dfsub['Conditions'].unique() if label not in condition_labels]
condition_mapping = {label: i for i, label in enumerate(condition_labels)}

# assign() rebinds dfsub to a new frame rather than writing into a possible slice
dfsub = dfsub.assign(Condition_Label=dfsub['Conditions'].map(condition_mapping))

# lmplot is figure-level and creates its own figure; a preceding plt.figure()
# would only leave an empty figure behind, so the size is given via height/aspect.
sns.lmplot(x='Condition_Label', y='WER', hue='Subjects', data=dfsub, height=6, aspect=8 / 6)

# Label the numeric codes with the condition names
plt.xticks(range(len(condition_labels)), condition_labels)

# Set labels and title
plt.xlabel('Conditions')
plt.ylabel('WER')
plt.title('Linear Regression of WER by Conditions (Grouped by Subjects)')

# Show the plot
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Regression of WER on condition (PRE=0, POST=1), one line per subject.

# Numeric codes with an explicit order: PRE -> 0, POST -> 1 (any other value -> -1).
# The codes come from a temporary Categorical so that dfsub['Conditions'] keeps its
# original dtype; overwriting it with a categorical would change what the earlier
# .map()-based encoding cells produce when they are re-run.
condition_codes = pd.Categorical(dfsub['Conditions'], categories=['PRE', 'POST'], ordered=True).codes
dfsub = dfsub.assign(Condition_Label=condition_codes)

# lmplot is figure-level and creates its own figure, so plt.figure(figsize=...)
# has no effect on it; height=4 with aspect=3 gives the intended 12x4 inch facet.
sns.lmplot(x='Condition_Label', y='WER', hue='Subjects', data=dfsub, height=4, aspect=3)

# Set x-axis tick labels
plt.xticks([0, 1], ['PRE', 'POST'])

# Set labels and title
plt.xlabel('Conditions')
plt.ylabel('WER')
plt.title('Linear Regression of WER by Conditions (Grouped by Subjects)')

# Show the plot
plt.show()


In [None]:
#@markdown + Individual plots for subjects (skip)
# Disabled cell: everything below is commented out, so running it does nothing.
# If re-enabled, it would draw one WER-by-Conditions boxplot per subject in dfw.
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Set the figure size
# plt.figure(figsize=(12, 10))

# # Loop over each subject
# for subject in dfw['Subjects'].unique():
#     # Filter the data for the current subject
#     subject_data = dfw[dfw['Subjects'] == subject]
    
#     # Create the box plot for the current subject
#     sns.boxplot(x='Conditions', y='WER', data=subject_data)
#     plt.title(f'Box Plot of WER by Conditions for Subject {subject}')
#     plt.xlabel('Conditions')
#     plt.ylabel('WER')
    
#     # Show the plot
#     plt.show()


In [None]:
#@markdown + Individual plots in one figure (skip)
import matplotlib.pyplot as plt

# One small WER-by-Conditions scatter plot per subject, arranged in a grid.

# The grid has a fixed number of columns and as many rows as the subjects need.
# A hard-coded 6x5 grid fails with IndexError for more than 30 subjects.
num_cols = 5
subjects = dfsub['Subjects'].unique()
num_rows = max(1, -(-len(subjects) // num_cols))  # ceiling division, at least one row

# 2.5 inches per row reproduces the 15x15 figure when six rows are needed.
# squeeze=False guarantees a 2-D array of axes even for a single row.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 2.5 * num_rows), squeeze=False)

# Flatten the axes array for easier indexing
axes = axes.flatten()

# Draw one panel per subject
for ax, subject in zip(axes, subjects):
    # Filter the data for the current subject
    subject_data = dfsub[dfsub['Subjects'] == subject]

    # Create the scatter plot for the current subject
    ax.scatter(x='Conditions', y='WER', data=subject_data)
    ax.set_title(f' {subject}')
    ax.set_ylim(0, 1)
    ax.set_xlabel('Conditions')
    ax.set_ylabel('WER')

# Remove any unused subplots (slicing also works when there are no subjects)
for unused_ax in axes[len(subjects):]:
    fig.delaxes(unused_ax)

# Adjust the spacing between subplots
fig.tight_layout()

# Show the plot
plt.show()


# Perception result analysis (re-try)

[EPA data 0520](https://raw.githubusercontent.com/MK316/workspace/main/ASR02/results/EPA_results_all0520_gender.csv) + WER, REPs

In [None]:
import pandas as pd
import numpy as np
# EPA results table (with gender), read from the project's GitHub repository
url = (
    "https://raw.githubusercontent.com/MK316/workspace/main/"
    "ASR02/results/EPA_results_all0520_gender.csv"
)

df2 = pd.read_csv(url)

# Preview the last rows
df2.tail()

Linear mixed model

In [None]:
# Proficiency
import statsmodels.formula.api as smf

# Mixed linear model of the 'Prof' rating on Cons, Gender and Reps, grouped by Stimuli
model = smf.mixedlm(
    formula="Prof ~ Cons + Gender + Reps",
    data=df2,
    groups=df2["Stimuli"],
)
result = model.fit()

# Report the fitted coefficients
print(result.summary())


In [None]:
# Accuracy
import statsmodels.formula.api as smf

# Mixed linear model of the 'Acc' rating on Cons, Gender and Reps, grouped by Stimuli
model = smf.mixedlm(
    formula="Acc ~ Cons + Gender + Reps",
    data=df2,
    groups=df2["Stimuli"],
)
result = model.fit()

# Report the fitted coefficients
print(result.summary())

In [None]:
# Intelligibility
import statsmodels.formula.api as smf

# Mixed linear model of the 'Intel' rating on Cons, Gender and Reps, grouped by Stimuli
model = smf.mixedlm(
    formula="Intel ~ Cons + Gender + Reps",
    data=df2,
    groups=df2["Stimuli"],
)
result = model.fit()

# Report the fitted coefficients
print(result.summary())

In [None]:
# Comprehensibility
import statsmodels.formula.api as smf

# Mixed linear model of the 'Comp' rating on Cons, Gender and Reps, grouped by Stimuli
model = smf.mixedlm("Comp ~ Cons + Gender + Reps", data=df2, groups=df2["Stimuli"])
result = model.fit()


# Report the fitted coefficients
print(result.summary())

# [1] EPA subdata Prof < 6

data: df2

In [None]:
# Build one subset of df2 per rating scale: all rows of the stimuli whose first
# PRE rating on that scale is below the limit.


def _pre_rating_below(group, rating, limit=6, require_post=True):
    """Decide whether one stimulus group passes the PRE-rating filter.

    Parameters
    ----------
    group : DataFrame
        Rows belonging to a single 'Stimuli' group.
    rating : str
        Name of the rating column to test ('Prof', 'Comp', 'Acc' or 'Intel').
    limit : number
        The group's first PRE rating must be strictly below this value.
    require_post : bool
        When True, the group must also contain at least one 'POST' row.

    Returns
    -------
    bool
        True when the group is kept. A group without any PRE row is dropped
        (False) instead of raising IndexError.
    """
    pre_ratings = group.loc[group['Cons'] == 'PRE', rating]
    if pre_ratings.empty:
        return False
    if require_post and 'POST' not in group['Cons'].values:
        return False
    return bool(pre_ratings.iloc[0] < limit)


# NOTE(review): unlike the other three subsets, the Prof subset has never required
# a POST row. That behaviour is kept here so the selected sample does not change;
# confirm whether the difference is intentional.
df2prof = df2.groupby('Stimuli').filter(lambda g: _pre_rating_below(g, 'Prof', require_post=False))
df2comp = df2.groupby('Stimuli').filter(lambda g: _pre_rating_below(g, 'Comp'))
df2acc = df2.groupby('Stimuli').filter(lambda g: _pre_rating_below(g, 'Acc'))
df2intel = df2.groupby('Stimuli').filter(lambda g: _pre_rating_below(g, 'Intel'))

# Print the number of rows in each subset
print(len(df2prof['Stimuli']))
print(len(df2comp['Stimuli']))
print(len(df2acc['Stimuli']))
print(len(df2intel['Stimuli']))

# Row counts per Gender x Raters combination in the Prof subset
summary = df2prof.groupby(['Gender', 'Raters']).size().reset_index(name='Count')
print(summary)

In [None]:
# Intelligibility: mixed linear model of 'Intel' on the df2intel subset
# NOTE(review): this cell was labelled "Proficiency" and sits in the "Prof < 6"
# section, but it fits 'Intel' on df2intel — confirm which model was intended.
import statsmodels.formula.api as smf

model_intel = smf.mixedlm("Intel ~ Cons + Gender + Reps", data=df2intel, groups=df2intel["Stimuli"])
result = model_intel.fit()


# Report the fitted coefficients
print(result.summary())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Notched boxplot of the 'Prof' ratings by condition, split by gender
plt.figure(figsize=(4, 4))

# Left-to-right order of the conditions on the x-axis
condition_order = ['PRE', 'POST']

sns.boxplot(data=df2prof, x="Cons", y="Prof", hue="Gender", order=condition_order, notch=True)

# Axis labels, y-range and title, applied to the current axes in one call
plt.gca().set(
    xlabel="Conditions",
    ylabel="Proficiency ratings",
    ylim=(0, 7.5),
    title="Proficiency ratings by Conditions",
)

# Save first, then show
plt.savefig("boxplot_Profsub.png")
plt.show()

In [None]:
# Average 'Prof' rating for each value of 'Cons' in the Prof subset
mean_prof_by_cons = df2prof.groupby('Cons')['Prof'].agg('mean')
mean_prof_by_cons

In [None]:
# Average 'Prof' rating for each Cons x Gender combination in the Prof subset
mean_prof_by_cons_gender = df2prof.groupby(['Cons', 'Gender'])['Prof'].agg('mean')
mean_prof_by_cons_gender

# [2] EPA data by comprehensibility

In [None]:
# Comprehensibility subset: all rows of the stimuli that have a POST row and whose
# first PRE 'Comp' rating is below 6.


def _comp_pre_below_6_with_post(group):
    """Return True when the stimulus group belongs in the comprehensibility subset.

    A group without any PRE row is dropped (False) instead of raising IndexError.
    """
    pre_comp = group.loc[group['Cons'] == 'PRE', 'Comp']
    if pre_comp.empty:
        return False
    return bool(pre_comp.iloc[0] < 6) and ('POST' in group['Cons'].values)


df2comp = df2.groupby('Stimuli').filter(_comp_pre_below_6_with_post)

# Print the number of selected rows
print(len(df2comp['Stimuli']))

# Row counts per Gender x Raters combination. This must be computed on df2comp,
# the subset analysed in this section, not on the proficiency subset df2prof.
summary = df2comp.groupby(['Gender', 'Raters']).size().reset_index(name='Count')
print(summary)

In [None]:
# Comprehensibility: mixed linear model of 'Comp' on the df2comp subset
import statsmodels.formula.api as smf

model = smf.mixedlm("Comp ~ Cons + Gender + Reps", data=df2comp, groups=df2comp["Stimuli"])
result = model.fit()


# Report the fitted coefficients
print(result.summary())

Intercept: The intercept coefficient is 4.789 with a standard error of 0.161. The z-score of 29.716 indicates a highly significant effect (p < 0.001). Because all predictors are categorical, the intercept is the expected value of the 'Comp' variable at the reference level of every predictor (POST condition, Female, repetition R1), namely 4.789.

Cons[T.PRE]: The coefficient for the 'Cons' variable (PRE vs POST) is -0.213 with a standard error of 0.082. The z-score of -2.589 indicates a significant effect (p = 0.010). This suggests that there is a significant difference in the 'Comp' variable between the PRE and POST conditions: 'Comp' is estimated to be 0.213 lower in the PRE condition than in the POST condition.

Gender[T.Male]: The coefficient for the 'Gender' variable (Male vs Female) is -0.504 with a standard error of 0.200. The z-score of -2.517 indicates a significant effect (p = 0.012). This suggests that there is a significant difference in the 'Comp' variable between males and females.

Reps[T.R2]: The coefficient for the 'Reps' variable (R2 vs R1) is 0.155 with a standard error of 0.101. The z-score of 1.536 indicates that the effect is not statistically significant (p = 0.125). This suggests that there is no significant difference in the 'Comp' variable between the R2 and R1 repetitions.

Reps[T.R3]: The coefficient for the 'Reps' variable (R3 vs R1) is 0.215 with a standard error of 0.101. The z-score of 2.130 indicates a significant effect (p = 0.033). This suggests that there is a significant difference in the 'Comp' variable between the R3 and R1 repetitions.

Overall, these results indicate that the 'Cons' variable (PRE vs POST) and the 'Gender' variable have significant effects on the 'Comp' variable. The 'Reps' variable shows a mixed pattern: R3 differs significantly from R1 (p = 0.033), whereas R2 does not (p = 0.125).

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Notched boxplot of the comprehensibility ('Comp') ratings by condition and gender
plt.figure(figsize=(7, 4))

# Define the order of the conditions
condition_order = ['PRE', 'POST']

# y must be the 'Comp' column: the axis label, title and output file all describe
# comprehensibility, and df2comp is the subset selected on 'Comp'.
sns.boxplot(x="Cons", y="Comp", data=df2comp, hue="Gender", order=condition_order, notch=True)

# Set labels and title
plt.xlabel("Conditions")
plt.ylabel("Comprehensibility ratings")
plt.ylim(0, 7.5)
plt.title("Comprehensibility ratings by Conditions")

# Save the plot as a PNG file
plt.savefig("boxplot_Compsub.png")

# Show the plot
plt.show()

In [None]:
# Mean comprehensibility ('Comp') rating per condition in the comprehensibility
# subset. A separate name is used so the proficiency means computed earlier in
# mean_prof_by_cons are not overwritten.
mean_comp_by_cons = df2comp.groupby('Cons')['Comp'].mean()
mean_comp_by_cons

In [None]:
# Mean comprehensibility ('Comp') rating per Cons x Gender combination in the
# comprehensibility subset. A separate name is used so the proficiency means in
# mean_prof_by_cons_gender are not overwritten.
mean_comp_by_cons_gender = df2comp.groupby(['Cons', 'Gender'])['Comp'].mean()
mean_comp_by_cons_gender

# Reaction time calculations

In [None]:
import pandas as pd

# EPA results table (with gender), read from the project's GitHub repository
url = (
    "https://raw.githubusercontent.com/MK316/workspace/main/"
    "ASR02/results/EPA_results_all0520_gender.csv"
)

df3 = pd.read_csv(url)

# Preview the last rows
df3.tail()

In [None]:
# Average RT1 value for each rater
t1 = df3.groupby('Raters')['RT1'].agg('mean')
t1

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notched boxplot of the RT1 values, one box per rater
plt.figure(figsize=(8, 4))
sns.boxplot(data=df3, x='Raters', y='RT1', notch=True)
plt.gca().set_title('Notched Boxplot of RT1 by Raters')
plt.show()
