<a href="https://colab.research.google.com/github/MK316/Workingpapers/blob/main/2025-insights/Recall_predata0803.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recall predata analysis
+ 2025. 7. 27

data: "C:\Users\user\iCloudDrive\01_workingpapers\00_2025-workingpapers\2025c-DL-Kim&Lee\analysis\predata-analysis\predata.csv"

| **Question #** | **Survey Question (Paraphrased)**                                                               | **Suggested Keyphrase**     |
| -------------- | ----------------------------------------------------------------------------------------------- | --------------------------- |
| **Q1\_DB**     | Have you experienced or learned basic coding?                                                   | **Coding Exposure**         |
| **Q2\_DC**     | How would you rate your current digital literacy?                                               | **Digital Competence**      |
| **Q3\_CB**     | How beneficial do you think coding is for English teachers?                                     | **Coding Benefits**         |
| **Q4\_CA**     | If coding helps you create tailored materials, how willing are you to learn coding?             | **Coding Acceptance**       |
| **Q5**         | List five keywords about coding for language educators—its benefits, challenges, and relevance. | **Coding-Related Keywords** |


In [None]:
# --- Step 1: libraries used by the analysis cells ---
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Step 2: open the Colab upload dialog and pick the survey file (CSV or Excel) ---
from google.colab import files
uploaded = files.upload()

# --- Step 3: read the survey responses into a DataFrame ---
# The file name below has to match the file you uploaded; edit it if yours differs.
df = pd.read_csv(filepath_or_buffer="predata2.csv")  # or .xlsx if applicable

# --- Step 4: list the column names, then preview the first rows ---
print("Columns:", df.columns.to_list())
df.head()

In [None]:
# 📌 Step 5: Convert Likert scale columns to numeric if not already
# errors='coerce' turns any non-numeric answer into NaN instead of raising.
likert_cols = ['Q1_DB', 'Q2_DC', 'Q3_CB', 'Q4_CA']
df[likert_cols] = df[likert_cols].apply(pd.to_numeric, errors='coerce')

# 📌 Step 6: Descriptive statistics by Group
# NOTE(review): this step groups by 'Group' while Step 7 groups by 'Level' --
# confirm that both columns exist in the data and that the difference is intended.
group_stats = df.groupby('Group')[likert_cols].agg(['mean', 'std', 'count']).round(2)
display(group_stats)

# 📌 Step 7: Keyword frequency analysis from Q5
from collections import Counter
from wordcloud import WordCloud

# Separator characters removed from both ends of every token, so that
# "useful," and "useful" are counted as the same keyword. Symbols such as
# '+' and '#' are deliberately NOT stripped (e.g. "c++", "c#").
TOKEN_EDGE_CHARS = ".,;:!?\"'()[]{}"

# Combine all Q5 responses per level and count the keywords
for group_name, group_data in df.groupby('Level'):
    print(f"\n📝 Keyword Summary for Group: {group_name}")
    text = " ".join(group_data['Q5'].dropna().astype(str))

    # str.split() already discards surrounding whitespace; only the separator
    # characters need trimming. Tokens that consist solely of separators
    # (e.g. a lone ",") become empty strings and are dropped.
    trimmed = (token.strip(TOKEN_EDGE_CHARS).lower() for token in text.split())
    words = [token for token in trimmed if token]

    freq = Counter(words)
    common = freq.most_common(20)
    for word, count in common:
        print(f"{word}: {count}")

    # WordCloud raises ValueError when it is given no words at all, which
    # happens when nobody in this level answered Q5. Skip the plot instead of
    # aborting the remaining levels.
    if not freq:
        print("(no Q5 keywords for this group; word cloud skipped)")
        continue

    # Optional: Generate word cloud
    wordcloud = WordCloud(width=600, height=300, background_color='white').generate_from_frequencies(freq)
    plt.figure(figsize=(8, 4))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'WordCloud for {group_name}')
    plt.show()


In [None]:
# Read 02_combined.csv into its own frame (df1), separate from the survey frame

df1 = pd.read_csv(filepath_or_buffer="02_combined.csv")  # or .xlsx if applicable

# Show which columns are available, then preview the first rows
print("Columns:", df1.columns.to_list())
df1.head()

# Text length

In [None]:
# Disabled alternative: character-length statistics for the essays.
# NOTE: as written, the assignment below would replace the 'Essay-E' text with
# its character count, so re-enabling it would destroy the essay text for any
# later cell that needs it; write to a separate column if this is revived.
# # Add a new column with text length
# df1['Essay-E'] = df1['Essay-E'].astype(str).apply(len)

# # Get descriptive statistics for text length
# length_stats = df1['Essay-E'].describe()

# print(length_stats)


In [None]:
# Add a new column with word count.
# Missing essays are replaced by an empty string *before* the string cast:
# casting NaN directly yields the literal text "nan", which would be counted
# as a one-word essay and distort the statistics below. An empty essay now
# contributes a word count of 0.
df1['Essay-E'] = df1['Essay-E'].fillna('').astype(str)

# Whitespace-delimited tokens per essay (same rule as Python's str.split()).
df1['word_count'] = df1['Essay-E'].str.split().str.len()

# Get descriptive statistics for word count
length_stats = df1['word_count'].describe()

print(length_stats)


## 1. Descriptive statistics

In [None]:
import pandas as pd

# STEP 1: Read the survey CSV
# Adjust the path below if the file lives somewhere else
df = pd.read_csv('/content/predata2.csv')

# STEP 2: Coerce age and the four survey items to numbers
# (values that cannot be parsed become NaN because of errors='coerce')
score_cols = ['Q1_DB', 'Q2_DC', 'Q3_CB', 'Q4_CA']
cols_to_convert = ['Age'] + score_cols
df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# STEP 3: Whole-sample summary
print("✅ Overall Summary Statistics:\n")
n_participants = df['SID'].nunique()
print(f"Number of Participants: {n_participants}")
age = df['Age']
print(f"Mean Age: {age.mean():.2f} (SD = {age.std():.2f})\n")

for item in score_cols:
    scores = df[item]
    print(f"{item} - Mean: {scores.mean():.2f}, SD: {scores.std():.2f}")

# STEP 4: Same summary, split by Level
print("\n✅ Summary by Level:\n")
agg_spec = {'SID': 'count', 'Age': ['mean', 'std']}
agg_spec.update({item: ['mean', 'std'] for item in score_cols})
grouped = df.groupby('Level').agg(agg_spec)

# Flatten the two-level column labels, e.g. ('Age', 'mean') -> 'Age_mean'
grouped.columns = ['_'.join(parts).strip() for parts in grouped.columns]
display(grouped.reset_index())


In [None]:
# Number of rows in the survey frame, taken from the SID column
# (counts every row, so repeated SIDs are included)
df['SID'].shape[0]

## Comparison by Level

In [None]:
# Normality of each group
import pandas as pd
from scipy.stats import shapiro

# Load your data
df = pd.read_csv('/content/predata2.csv')  # Replace with your file path

# Columns of interest
questions = ['Q2_DC', 'Q3_CB', 'Q4_CA']
df[questions] = df[questions].apply(pd.to_numeric, errors='coerce')

# Drop rows with missing Level or question scores
df = df.dropna(subset=['Level'] + questions)

# Grouping
levels = df['Level'].unique()

print("✅ Shapiro-Wilk Normality Test Results\n")

for q in questions:
    print(f"--- {q} ---")
    for level in levels:
        group_scores = df[df['Level'] == level][q]
        stat, p = shapiro(group_scores)
        print(f"Level: {level} | W = {stat:.3f}, p = {p:.4f}")
        if p < 0.05:
            print("  ❗ Not normally distributed")
        else:
            print("  ✅ Normally distributed")
    print()



## Group comparison using medians
✅ Use the Mann-Whitney U test (for comparing two independent groups)

In [None]:
from scipy.stats import mannwhitneyu

# Loop through each question
questions = ['Q2_DC', 'Q3_CB', 'Q4_CA']

# Distinct level labels in order of first appearance. Missing labels are
# dropped: NaN never equals itself, so a NaN "level" would select an empty
# group and make the test fail.
levels = df['Level'].dropna().unique()

# Choose the two levels to compare.
# The original code always used levels[1] and levels[2], which raises
# IndexError when only two levels exist (the case its own comment assumed).
# - exactly two levels: compare them;
# - three or more: keep the original choice (2nd vs 3rd label in order of
#   appearance). NOTE(review): confirm this is the intended pair; selecting
#   the levels by name would be more robust than by position.
if len(levels) < 2:
    raise ValueError(
        f"Mann-Whitney U needs two groups, but 'Level' has {len(levels)} distinct value(s): {list(levels)}"
    )
if len(levels) == 2:
    level1, level2 = levels[0], levels[1]
else:
    level1, level2 = levels[1], levels[2]

print("✅ Mann-Whitney U Test (Independent Samples)\n")

for q in questions:
    group1 = df.loc[df['Level'] == level1, q].dropna()
    group2 = df.loc[df['Level'] == level2, q].dropna()

    # Two-sided alternative: tests for a difference in either direction.
    stat, p = mannwhitneyu(group1, group2, alternative='two-sided')

    print(f"{q}:")
    print(f"  {level1} (n={len(group1)}), {level2} (n={len(group2)})")
    print(f"  U = {stat:.2f}, p = {p:.4f}")

    if p < 0.05:
        print("  ❗ Statistically significant difference between groups")
    else:
        print("  ✅ No significant difference between groups")
    print()


### Three-group comparison using medians
✅ Kruskal-Wallis H Test for Q2–Q4 by Level



In [None]:
from scipy.stats import kruskal

# Survey items compared across levels
questions = ['Q2_DC', 'Q3_CB', 'Q4_CA']

# One entry per distinct value of 'Level', in order of first appearance
group_labels = df['Level'].unique()

print("✅ Kruskal-Wallis H Test for Q2–Q4 by Level\n")

for q in questions:
    print(f"{q}:")

    # Non-missing scores for this item, one Series per level
    group_data = [df.loc[df['Level'] == label, q].dropna() for label in group_labels]

    # Omnibus rank-based test across all levels at once
    stat, p = kruskal(*group_data)

    print(f"  H = {stat:.2f}, p = {p:.4f}")

    verdict = (
        "  ❗ Statistically significant difference among groups"
        if p < 0.05
        else "  ✅ No significant difference among groups"
    )
    print(verdict)

    # Per-level sample size and mean, for context
    for idx, label in enumerate(group_labels):
        scores = group_data[idx]
        print(f"    {label}: n = {len(scores)}, mean = {scores.mean():.2f}")
    print()
