<a href="https://colab.research.google.com/github/MK316/Workingpapers/blob/main/2025-insights/Recall25_analysis01_0728.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data analysis (25. 07.28)

+ data: 02_combined.csv (Drive/Research/Recall25/)
+ Q1~Q6
+ Questionnaire constructs (pre, post): DC (Q1, Q4), PTB (Q2, Q5), E&M (Q3, Q6).
+ Digital Competence, Perceived Teaching Benefits, Engagement & Motivation

In [None]:
# Step 1: Import required libraries (nothing is installed in this cell)
import pandas as pd

# Step 2: Mount Google Drive at /content/drive (Colab only); the CSV loaded
# in the next cell is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Step 3: Load your CSV file (adjust path if necessary)
# Example path: '/content/drive/MyDrive/survey_data/02_combined.csv'

file_path = '/content/drive/MyDrive/Research/Recall25/02_combined.csv'  # <-- Change to your actual path

# Read the file once, keeping only the columns this analysis uses.
# (The previous version first read the whole file and then immediately
# overwrote that frame with this filtered read.)
cols_to_use = ['SID','Group', 'Level', 'Age', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6','Essay-E']
df = pd.read_csv(file_path, usecols=cols_to_use)

# Preview selected columns
df.head()


# Descriptive stats

In [None]:
# Step 4: Descriptive stats for Level and Age by Group
level_age_stats = df.groupby('Group')[['Level', 'Age']].describe()
print("Descriptive statistics for Level and Age by Group:")
display(level_age_stats)

In [None]:
# Step 5a: Descriptive stats for Q1~Q3 by Group
q_cols = [f'Q{i}' for i in (1, 2, 3)]
q_stats = df.groupby('Group')[q_cols].describe()
print("Descriptive statistics for Q1~Q3 by Group:")
display(q_stats)

In [None]:
# Step 5b: Descriptive stats for Q4~Q6 by Group
q_cols = [f'Q{i}' for i in (4, 5, 6)]
q_stats = df.groupby('Group')[q_cols].describe()
print("Descriptive statistics for Q4~Q6 by Group:")
display(q_stats)

In [None]:
# Step 5c: Descriptive stats for Q4~Q6 by Group, without the quartile columns
q_cols = ['Q4', 'Q5', 'Q6']
q_stats = df.groupby('Group')[q_cols].describe()

# describe() yields (question, statistic) column pairs; list the quartile
# pairs for every question and drop them in one call.
quantile_cols = [(question, pct) for question in q_cols for pct in ('25%', '50%', '75%')]
q_stats = q_stats.drop(columns=quantile_cols)

print("Descriptive statistics for Q4~Q6 by Group (quantiles hidden):")
display(q_stats)


## Plotting

In [None]:
#@markdown Boxplots A and B groups (works well)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Read only the grouping column and the six Likert items
file_path = '/content/drive/MyDrive/Research/Recall25/02_combined.csv'
cols_to_use = ['Group', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6']
df = pd.read_csv(file_path, usecols=cols_to_use)

# Step 2: Wide -> long, one row per (respondent, question)
df_long = df.melt(
    id_vars='Group',
    value_vars=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6'],
    var_name='Question',
    value_name='Response',
)

# Step 3: Tag every question with its construct ('Pair') and wave ('Time')
question_map = {
    'Q1': ('DC', 'Pre'), 'Q4': ('DC', 'Post'),
    'Q2': ('PTB', 'Pre'), 'Q5': ('PTB', 'Post'),
    'Q3': ('E&M', 'Pre'), 'Q6': ('E&M', 'Post')
}
pair_of = {question: pair for question, (pair, _) in question_map.items()}
time_of = {question: time for question, (_, time) in question_map.items()}
df_long['Pair'] = df_long['Question'].map(pair_of)
df_long['Time'] = df_long['Question'].map(time_of)

# Step 4: One boxplot figure per group, saved as Boxplot_<group>.png
for group in df_long['Group'].unique():
    subset = df_long.loc[df_long['Group'] == group]

    plt.figure(figsize=(10, 6))
    sns.boxplot(data=subset, x='Pair', y='Response', hue='Time', palette='pastel')

    plt.title(f'Pre vs Post Responses by Pair – {group} Group')
    plt.xlabel('Construct')
    plt.ylabel('Likert Scale (1~6)')
    plt.ylim(0, 7)  # leave head-room around the 1-6 scale
    plt.legend(title='Time')
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()

    # Save before show(): show() may clear the current figure
    plt.savefig(f"Boxplot_{group}.png", dpi=300)
    plt.show()


## Plot as categorical data

In [None]:
#@markdown Stacked bar plot (works but ugly)
import pandas as pd
import matplotlib.pyplot as plt

# Load the survey table
file_path = '/content/drive/MyDrive/Research/Recall25/02_combined.csv'
df = pd.read_csv(file_path)

# Each question column -> (construct, survey wave)
question_map = {
    'Q1': ('DC', 'Pre'), 'Q4': ('DC', 'Post'),
    'Q2': ('PTB', 'Pre'), 'Q5': ('PTB', 'Post'),
    'Q3': ('E&M', 'Pre'), 'Q6': ('E&M', 'Post')
}

# Long format: one tagged frame per question, stacked on top of each other
records = [
    df[['Group', col]]
    .rename(columns={col: 'Response'})
    .assign(Pair=pair, Time=time)
    for col, (pair, time) in question_map.items()
]
df_long = pd.concat(records)

# Count each response level per (Group, Pair, Time), then turn the counts
# into row-wise percentages
counts = df_long.groupby(['Group', 'Pair', 'Time', 'Response']).size().unstack(fill_value=0)
row_totals = counts.sum(axis=1)
counts_pct = counts.div(row_totals, axis=0) * 100

# Left-to-right bar order on the x-axis
desired_order = [('DC', 'Pre'), ('DC', 'Post'), ('PTB', 'Pre'), ('PTB', 'Post'), ('E&M', 'Pre'), ('E&M', 'Post')]

# One stacked-bar figure per group
for group in counts_pct.index.get_level_values(0).unique():
    fig, ax = plt.subplots(figsize=(10, 6))

    # Rows of this group, forced into the desired (Pair, Time) order
    group_data = counts_pct.loc[group].reindex(desired_order)
    bar_labels = [f"{pair} ({time})" for pair, time in group_data.index]

    # Running top of the stack for each bar
    bottom = pd.Series(0, index=group_data.index)
    for resp in range(1, 7):
        # A level nobody chose has no column; draw it as zero-height bars
        values = (
            group_data[resp]
            if resp in group_data.columns
            else pd.Series(0, index=group_data.index)
        )
        ax.bar(x=bar_labels, height=values, bottom=bottom, label=f'{resp}', width=0.6)
        bottom += values

    ax.set_title(f'Likert Response Distribution – {group} Group')
    ax.set_xlabel('Construct (Time)')
    ax.set_ylabel('Percentage (%)')
    ax.set_ylim(0, 100)
    ax.legend(title='Likert Scale', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=30)
    plt.grid(axis='y', linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()


## Slope graph

In [None]:
#@markdown Line plot: 3 constructs * 2 groups all
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
file_path = '/content/drive/MyDrive/Research/Recall25/02_combined.csv'
df = pd.read_csv(file_path)

# Construct and time mapping: question column -> (construct, survey wave)
mean_map = {
    'Q1': ('DC', 'Pre'), 'Q4': ('DC', 'Post'),
    'Q2': ('PTB', 'Pre'), 'Q5': ('PTB', 'Post'),
    'Q3': ('E&M', 'Pre'), 'Q6': ('E&M', 'Post')
}

# Reshape data: one (Group, Score) frame per question, tagged and stacked
slope_data = []
for q, (pair, time) in mean_map.items():
    temp = df[['Group', q]].copy()
    temp.columns = ['Group', 'Score']
    temp['Construct'] = pair
    temp['Time'] = time
    slope_data.append(temp)

df_slope = pd.concat(slope_data)

# Ensure Time is ordered: Pre → Post (otherwise the x-axis sorts alphabetically)
df_slope['Time'] = pd.Categorical(df_slope['Time'], categories=['Pre', 'Post'], ordered=True)
# Replace group codes for clarity
df_slope['Group'] = df_slope['Group'].replace({'A': 'UG', 'B': 'GRAD'})
# Compute means. 'Time' is categorical, so pass observed=True explicitly:
# relying on the implicit default is deprecated in recent pandas and would
# otherwise also emit rows for category combinations that never occur.
mean_df = (
    df_slope
    .groupby(['Group', 'Construct', 'Time'], observed=True)['Score']
    .mean()
    .reset_index()
)

# Plot: colour encodes the construct, line/marker style encodes the group
plt.figure(figsize=(10, 6))
sns.lineplot(data=mean_df, x='Time', y='Score', hue='Construct', style='Group',
             markers=True, dashes=False)

plt.title('Mean Likert Score Change (Pre to Post)')
plt.ylim(1, 6)
plt.ylabel('Mean Score (1–6)')
plt.xlabel('Time')
plt.grid(True, linestyle='--', alpha=0.5)
plt.legend(title='Construct / Group', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


## Split line plots by group

In [None]:
#@markdown Line plot works
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load your data
file_path = '/content/drive/MyDrive/Research/Recall25/02_combined.csv'
df = pd.read_csv(file_path)

# Prepare long-format data: question column -> (construct, survey wave)
question_map = {
    'Q1': ('DC', 'Pre'), 'Q4': ('DC', 'Post'),
    'Q2': ('PTB', 'Pre'), 'Q5': ('PTB', 'Post'),
    'Q3': ('E&M', 'Pre'), 'Q6': ('E&M', 'Post')
}

records = []
for col, (construct, time) in question_map.items():
    temp = df[['Group', col]].copy()
    temp.columns = ['Group', 'Score']
    temp['Construct'] = construct
    temp['Time'] = time
    records.append(temp)

# NOTE: df_long (columns Group, Score, Construct, Time) is reused by the
# error-bar cells further down.
df_long = pd.concat(records)

# Fix group names
df_long['Group'] = df_long['Group'].replace({'A': 'UG', 'B': 'GRAD'})

# Fix Time order explicitly (Pre before Post on the x-axis)
df_long['Time'] = pd.Categorical(df_long['Time'], categories=['Pre', 'Post'], ordered=True)

# Compute group means. observed=True is passed explicitly because 'Time' is
# categorical and the implicit default is deprecated in recent pandas.
df_mean = (
    df_long
    .groupby(['Group', 'Construct', 'Time'], observed=True)['Score']
    .mean()
    .reset_index()
)

# Create side-by-side plots, one panel per group, sharing the y-axis
fig, axes = plt.subplots(1, 2, figsize=(14, 7), sharey=True)

for i, group in enumerate(['UG', 'GRAD']):
    ax = axes[i]
    sns.lineplot(
        data=df_mean[df_mean['Group'] == group],
        x='Time', y='Score', hue='Construct',
        marker='o', linewidth=4, ax=ax  # You can adjust linewidth as needed
    )

    ax.set_title(f'{group} Group', fontsize=16)
    ax.set_ylim(1, 6)
    ax.set_ylabel('Mean Score (1–6)' if i == 0 else '', fontsize=14)
    ax.set_xlabel('Time', fontsize=14)
    ax.tick_params(axis='both', labelsize=12)
    ax.grid(True, linestyle='--', alpha=0.5)

    # Keep a single legend (right-hand panel); the title is already set here,
    # so no separate set_title() call is needed.
    legend = ax.legend(title='Construct', fontsize=10, loc='lower left', title_fontsize=18)
    if i == 0:
        legend.remove()

plt.suptitle('Mean Likert Score Change (Pre to Post)', fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # reserve the top 5% for the suptitle
plt.show()


## Error bar plot

In [None]:
# Sanity check: list the columns of df_long as left by the previous cell
# before plotting from it below.
print(df_long.columns)


In [None]:
#@markdown Plot (working but the one below is better)
import seaborn as sns
import matplotlib.pyplot as plt

# Expand construct abbreviations into full legend labels. Re-running is
# harmless: already-expanded labels match none of the keys.
# NOTE(review): the notebook header calls DC "Digital Competence" while this
# label says "Digital Confidence" — confirm which wording is intended.
df_long['Construct'] = df_long['Construct'].replace({
    'DC': 'Digital Confidence',
    'PTB': 'Perceived Teaching Benefits',
    'E&M': 'Engagement & Motivation'
})

# Confirm unique constructs to match styles
print(df_long['Construct'].unique())  # Should show exactly 3 labels

# Point plot of mean Score per Time with 95% confidence intervals,
# one column per Group and one line per Construct.
g = sns.catplot(
    data=df_long,
    x='Time',
    y='Score',
    hue='Construct',
    col='Group',
    kind='point',
    dodge=0.3,
    errorbar=('ci', 95),  # replaces the deprecated ci=95; same interval
    markers=['o', 's', 'D'],
    linestyles=['-.', '-', '--'],
    palette='Set1',
    height=4,
    aspect=1.5
)


g.set_titles("{col_name} Group")
g.set_axis_labels("Time", "Mean Score (1–6)")
g.set(ylim=(0, 6))
g.fig.suptitle("Construct-wise Response Change with 95% CI", fontsize=14)

# Adjust title/legend spacing
g.fig.subplots_adjust(top=0.85, right=0.75)

# Legend formatting (position is in figure coordinates)
g._legend.set_bbox_to_anchor((0.54, 0.28))
g._legend.set_loc('center left')
g._legend.set_title("Construct")
g._legend.set_frame_on(True)

plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Reload the survey table so this cell does not depend on earlier reshaping
file_path = '/content/drive/MyDrive/Research/Recall25/02_combined.csv'
df = pd.read_csv(file_path)

# Question column -> (construct, survey wave)
question_map = {
    'Q1': ('DC', 'Pre'), 'Q4': ('DC', 'Post'),
    'Q2': ('PTB', 'Pre'), 'Q5': ('PTB', 'Post'),
    'Q3': ('E&M', 'Pre'), 'Q6': ('E&M', 'Post')
}

# Long format: one tagged (Group, Response) frame per question, stacked
records = [
    df[['Group', col]]
    .rename(columns={col: 'Response'})
    .assign(Construct=construct, Time=time)
    for col, (construct, time) in question_map.items()
]
df_long = pd.concat(records)

# Human-readable labels for the legend and the panel titles
construct_labels = {
    'DC': 'Digital Confidence',
    'PTB': 'Perceived Teaching Benefits',
    'E&M': 'Engagement & Motivation'
}
group_labels = {
    'A': 'Course A: Undergraduate',
    'B': 'Course B: Graduate'
}
df_long['Construct'] = df_long['Construct'].replace(construct_labels)
df_long['Group'] = df_long['Group'].replace(group_labels)

# Point plot: mean Response per Time with 95% CI, one panel per Group
catplot_options = dict(
    x='Time',
    y='Response',
    hue='Construct',
    col='Group',
    kind='point',
    dodge=0.3,
    errorbar=('ci', 95),
    markers=['o', 's', 'D'],
    linestyles=['-.', '-', '--'],
    palette='Set1',
    height=4,
    aspect=1.6,
)
g = sns.catplot(data=df_long, **catplot_options)

# Widen the figure so the two panels and the legend fit
g.fig.set_size_inches(12, 5)

# Titles, axis labels, y-range
g.set_titles("{col_name} Group")
g.set_axis_labels("Time", "Mean Score (1–6)")
g.set(ylim=(0, 6))
g.fig.suptitle("Construct-wise Response Change with 95% CI", fontsize=14)

# Replace seaborn's side legend with a horizontal one under the panels
g._legend.remove()
handles, labels = g.axes[0, 0].get_legend_handles_labels()
legend = g.fig.legend(
    handles=handles,
    labels=labels,
    title='Construct',
    loc='lower center',
    bbox_to_anchor=(0.5, -0.05),  # slightly below the axes area
    ncol=3,
    frameon=True,
)

# Final layout: make room for title and legend, then save and show
g.fig.tight_layout()
g.fig.subplots_adjust(top=0.85, bottom=0.2)
g.fig.savefig("construct_response_plot.png", dpi=300, bbox_inches='tight')

plt.show()

---
# Thematic analysis

In [None]:
# Step 3: Load your CSV file (adjust path if necessary)
# Example path: '/content/drive/MyDrive/survey_data/02_combined.csv'

file_path = '/content/drive/MyDrive/Research/Recall25/02_combined.csv'  # <-- Change to your actual path

# Read the file once, keeping only the columns needed below (including the
# 'Essay-E' text column used by the thematic analysis). The earlier
# unfiltered read was overwritten immediately and has been dropped.
cols_to_use = ['SID','Group', 'Level', 'Age', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6','Essay-E']
df = pd.read_csv(file_path, usecols=cols_to_use)

# Preview selected columns
df.head()

## ✅ Step 2: Preprocess the text
We’ll use basic cleaning and tokenization with TfidfVectorizer.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: English stop words are removed and text is lower-cased
# inside the vectorizer; unigrams and bigrams are kept if they occur in at
# least 2 essays (min_df) and in at most 80% of them (max_df).
tfidf_params = dict(
    stop_words='english',
    lowercase=True,
    max_df=0.8,
    min_df=2,
    ngram_range=(1, 2),
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Overwrite the essay column in place so missing essays become empty strings
essays_filled = df['Essay-E'].fillna('')
df['Essay-E'] = essays_filled

# Fit on every essay; rows of tfidf_matrix follow the row order of df
tfidf_matrix = vectorizer.fit_transform(df['Essay-E'])

# Vocabulary, aligned with the columns of tfidf_matrix
terms = vectorizer.get_feature_names_out()


## ✅ Step 3: Separate TF-IDF matrices by group

In [None]:
import numpy as np

# Row positions of each group. tfidf_matrix is indexed by position, so use
# positions derived from a boolean mask rather than df.index labels: labels
# only coincide with positions while df keeps its default RangeIndex.
group_a_idx = np.asarray(df['Group'] == 'A').nonzero()[0]
group_b_idx = np.asarray(df['Group'] == 'B').nonzero()[0]

# Subset the TF-IDF matrix (rows = essays of that group)
tfidf_a = tfidf_matrix[group_a_idx]
tfidf_b = tfidf_matrix[group_b_idx]

# Compute mean TF-IDF scores per term for each group.
# .mean(axis=0) on a sparse matrix returns a 1 x n_terms matrix; flatten to 1-D.
mean_a = np.asarray(tfidf_a.mean(axis=0)).flatten()
mean_b = np.asarray(tfidf_b.mean(axis=0)).flatten()


## ✅ Step 4: Create a comparison DataFrame

This table helps you identify:

Terms more frequently used by Group A (diff > 0)

Terms more frequently used by Group B (diff < 0)

In [None]:
# One row per term: group means, signed difference (A - B) and its magnitude
comparison_df = pd.DataFrame({
    'term': terms,
    'mean_A': mean_a,
    'mean_B': mean_b,
    'diff': mean_a - mean_b
}).assign(abs_diff=lambda frame: frame['diff'].abs())

# Largest group differences first, regardless of direction
comparison_df_sorted = comparison_df.sort_values(by='abs_diff', ascending=False)


## ✅ Step 5: Visualize key differences (optional)

In [None]:
import matplotlib.pyplot as plt

# Keep the terms with the largest absolute A-vs-B difference
top_n = 15
top_diff = comparison_df_sorted.head(top_n)

# Blue = higher mean TF-IDF in Group A (diff > 0), orange = otherwise
bar_colors = ['blue' if delta > 0 else 'orange' for delta in top_diff['diff']]

plt.figure(figsize=(10, 6))
plt.barh(top_diff['term'], top_diff['diff'], color=bar_colors)
plt.axvline(0, color='gray', linewidth=1)
plt.title("Top Keyword Differences Between Group A and B")
plt.xlabel("TF-IDF Difference (A - B)")
plt.gca().invert_yaxis()  # largest difference at the top
plt.tight_layout()
plt.show()


# Thematic analysis - with actual data

In [None]:
import pandas as pd
import re

# Assume df is already loaded as:
# df = pd.read_csv('data03.csv')

# Ensure the text is string type. Fill missing essays first: astype(str) on a
# NaN would produce the literal text "nan", which would then be tokenized as
# a word. This keeps the cell correct even if no earlier cell ran fillna('').
df['Essay-E'] = df['Essay-E'].fillna('').astype(str)

# Step 1: Replace 'artificial intelligence' with 'AI' explicitly (before lemmatization)
df['Essay-E'] = df['Essay-E'].str.replace(r'\bartificial intelligence\b', 'AI', flags=re.IGNORECASE, regex=True)

# Step 2: Replace other multi-word expressions with underscores so each one
# is kept as a single token downstream
multiword_terms = {
    r'\bdigital literacy\b': 'digital_literacy',
    r'\benglish education\b': 'english_education'
}

for pattern, replacement in multiword_terms.items():
    df['Essay-E'] = df['Essay-E'].str.replace(pattern, replacement, flags=re.IGNORECASE, regex=True)

# ✅ Optional preview
df['Essay-E'].head()

# 3. Clean the text and lemmatize

In [None]:
import nltk
from nltk.corpus import wordnet, stopwords
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data used below (tokenizer, POS tagger, WordNet, stop words).
# Downloads are skipped by NLTK when the resource is already present.
for resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'stopwords',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Tokens kept verbatim (no lower-casing, no lemmatization)
important_tokens = {'AI'}

# Post-lemmatization corrections: lemma -> preferred form
manual_replace = {
    'apps': 'app',
    'cod': 'code',         # 'coding' is sometimes mis-lemmatized to 'cod'
    'coding': 'code'       # in case 'coding' comes through unchanged
}

def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet POS constant.

    The decision depends only on the tag's first letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty one) falls back to noun.
    """
    # Built at call time so `wordnet` is resolved when the function runs
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

def clean_and_lemmatize_pos(text):
    """Tokenize, POS-tag and lemmatize `text`; return the kept lemmas joined by spaces.

    Tokens listed in `important_tokens` are kept exactly as written. Every
    other token is lower-cased, dropped if it is a stop word or has two
    characters or fewer, lemmatized with the WordNet POS derived from its
    tag, and finally passed through the `manual_replace` corrections.
    """
    kept = []
    # Tagging runs on the original-case tokens; lower-casing happens afterwards
    for raw_token, tag in pos_tag(word_tokenize(text)):
        if raw_token in important_tokens:
            kept.append(raw_token)  # preserve casing, e.g. 'AI'
            continue

        lowered = raw_token.lower()
        if lowered in stop_words or len(lowered) <= 2:
            continue

        lemma = lemmatizer.lemmatize(lowered, get_wordnet_pos(tag))
        kept.append(manual_replace.get(lemma, lemma))

    return ' '.join(kept)

# Store the cleaned, lemmatized text of every essay in a new column
df['lemmatized'] = df['Essay-E'].map(clean_and_lemmatize_pos)


✅ Step 3: TF-IDF with Unigrams (on the lemmatized text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Unigram-only TF-IDF on the lemmatized essays; a term must occur in at
# least 2 essays (min_df) and in at most 90% of them (max_df).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words='english',
    min_df=2,
    max_df=0.9,
)

tfidf_matrix = vectorizer.fit_transform(df['lemmatized'])
feature_names = vectorizer.get_feature_names_out()

# Mean score of every term across all essays, as a flat 1-D array
tfidf_means = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

# View top 30: positions of the highest means, best first
top_n = 30
top_indices = tfidf_means.argsort()[::-1][:top_n]
for term, mean_score in zip(feature_names[top_indices], tfidf_means[top_indices]):
    print(f"{term:<30} {mean_score:.6f}")


# Get top N words

In [None]:
# Print the top_n terms by mean TF-IDF, highest first
top_n = 30
top_indices = tfidf_means.argsort()[::-1][:top_n]
for term, mean_score in zip(feature_names[top_indices], tfidf_means[top_indices]):
    print(f"{term:<30} {mean_score:.6f}")


# Get top N words

# 4. Tokenization, stop-word removal, and lemmatization

+ WordNetLemmatizer

## 3) Display keywords

In [None]:
# Create a DataFrame of top terms and their scores
import pandas as pd

# Top terms and their mean TF-IDF scores, in the order given by top_indices
top_keywords = pd.DataFrame({
    'term': feature_names[top_indices],
    'score': tfidf_means[top_indices],
})


In [None]:
import matplotlib.pyplot as plt

# Reverse the rows so the highest-scoring term ends up at the top of the chart
keywords_bottom_up = top_keywords.iloc[::-1]

plt.figure(figsize=(10, 6))
plt.barh(keywords_bottom_up['term'], keywords_bottom_up['score'])
plt.title(f'Top {top_n} Keywords by TF-IDF')
plt.xlabel('Average TF-IDF Score')
plt.tight_layout()
plt.show()


### Heatmap with top N keywords

In [None]:
import numpy as np
import pandas as pd

# Mean TF-IDF per term, paired with the vocabulary of the fitted vectorizer
feature_names = vectorizer.get_feature_names_out()
tfidf_means = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
tfidf_scores = pd.DataFrame({'term': feature_names, 'score': tfidf_means})

# Names of the top_n highest-scoring terms.
# Note: top_keywords is a plain list of terms from here on.
top_n = 30
ranked_scores = tfidf_scores.sort_values(by='score', ascending=False)
top_keywords = ranked_scores.head(top_n)['term'].tolist()

# Dense essays x terms table, restricted to the top keywords
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_top_df = tfidf_df[top_keywords]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap: rows are essays, columns are the selected top keywords
plt.figure(figsize=(12, 6))
sns.heatmap(tfidf_top_df, cmap='YlGnBu', linewidths=0.5)

plt.title("TF-IDF Heatmap of Top 30 Keywords Across Reflection Essays")
plt.ylabel("Essays")
plt.xlabel("Keywords")
plt.tight_layout()

# Write the file first, then display (.pdf or .svg also work as extensions)
plt.savefig("tfidf_heatmap.png", dpi=300)
plt.show()


### Heatmap with top N keywords

In [None]:
import numpy as np
import pandas as pd

# Mean TF-IDF per term, paired with the vocabulary of the fitted vectorizer
feature_names = vectorizer.get_feature_names_out()
tfidf_means = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
tfidf_scores = pd.DataFrame({'term': feature_names, 'score': tfidf_means})

# Names of the top_n highest-scoring terms.
# Note: top_keywords is a plain list of terms from here on.
top_n = 30
ranked_scores = tfidf_scores.sort_values(by='score', ascending=False)
top_keywords = ranked_scores.head(top_n)['term'].tolist()

# Dense essays x terms table, restricted to the top keywords
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_top_df = tfidf_df[top_keywords]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap: rows are essays, columns are the selected top keywords
plt.figure(figsize=(12, 6))
sns.heatmap(tfidf_top_df, cmap='YlGnBu', linewidths=0.5)

plt.title("TF-IDF Heatmap of Top 30 Keywords Across Reflection Essays")
plt.ylabel("Essays")
plt.xlabel("Keywords")
plt.tight_layout()

# Write the file first, then display (.pdf or .svg also work as extensions)
plt.savefig("tfidf_heatmap.png", dpi=300)
plt.show()


# Part II. Group similar keywords into potential themes

Identify and label recurring themes across the 33 reflection essays, using a semi-automated method based on high-TF-IDF terms.

### ✅ 1. Elbow Method
This helps you choose k (the number of clusters): look for the point where adding more clusters no longer reduces the inertia (within-cluster sum of squares) appreciably.

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Fit KMeans on the transposed matrix (terms x documents) for k = 2..10 and
# record the inertia of each fit
k_range = range(2, 11)
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(tfidf_matrix.T).inertia_
    for k in k_range
]

# Elbow curve: inertia against k
plt.figure(figsize=(6, 4))
plt.plot(k_range, inertia, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia (Within-Cluster Sum of Squares)')
plt.xticks(k_range)
plt.grid(True)
plt.tight_layout()
plt.show()


### ✅ 2. Silhouette Score
This evaluates how well-separated the clusters are. Higher score = better-defined clusters.

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the term clustering for k = 2..10
k_range = range(2, 11)
term_vectors = tfidf_matrix.T  # terms x documents

sil_scores = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(term_vectors)
    sil_scores.append(silhouette_score(term_vectors, labels))

# Silhouette curve: score against k
plt.figure(figsize=(6, 4))
plt.plot(k_range, sil_scores, marker='o')
plt.title('Silhouette Analysis for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.xticks(k_range)
plt.grid(True)
plt.tight_layout()
plt.show()


Number of themes = 3 or 4 based on the above k values

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Number of themes (clusters); adjust after inspecting the elbow/silhouette plots
num_themes = 4

# Cluster terms, not essays: transpose to (terms x documents)
term_matrix = tfidf_matrix.T

kmeans = KMeans(n_clusters=num_themes, random_state=42, n_init=10)
clusters = kmeans.fit_predict(term_matrix)  # one label per term, in feature_names order

# Term table with mean score and cluster label, ordered by cluster and then
# by descending score for easy inspection. The original row labels are kept,
# so they still equal each term's position in feature_names.
tfidf_scores = pd.DataFrame({
    'term': feature_names,
    'score': tfidf_means,
    'cluster': clusters
}).sort_values(by=['cluster', 'score'], ascending=[True, False])

# Ten highest-scoring terms of every cluster
for c in range(num_themes):
    in_cluster = tfidf_scores['cluster'] == c
    print(f"\nCluster {c}:")
    print(tfidf_scores.loc[in_cluster, ['term', 'score']].head(10))


k=4

Plot

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project every term vector onto its first two principal components
pca = PCA(n_components=2)
reduced = pca.fit_transform(term_matrix.toarray())
pc1, pc2 = reduced[:, 0], reduced[:, 1]

# Scatter of all terms, coloured by KMeans cluster
plt.figure(figsize=(10, 6))
scatter = plt.scatter(pc1, pc2, c=clusters, cmap='tab10', s=100)

# Label each point with its term
for term, x_pos, y_pos in zip(feature_names, pc1, pc2):
    plt.annotate(term, (x_pos, y_pos), fontsize=9)

plt.title("TF-IDF Term Clustering (PCA-reduced)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.tight_layout()
plt.savefig("tfidf_clustering_pca.png", dpi=300)
plt.show()


### Show only top keywords per cluster

In [None]:
# Show only top N keywords per cluster
top_n_per_cluster = 7

# Sort once by cluster and descending score, then keep the first N rows of
# each cluster. This avoids groupby().apply() operating on the grouping
# column, which is deprecated in recent pandas, and yields the same
# cluster-by-cluster, best-first ordering.
top_keywords = (
    tfidf_scores
    .sort_values(['cluster', 'score'], ascending=[True, False])
    .groupby('cluster')
    .head(top_n_per_cluster)
    .reset_index(drop=True)
)

# Filter PCA-reduced coordinates to just top keywords.
# Build the term -> column position lookup once instead of scanning the
# vocabulary list for every term.
term_position = {term: pos for pos, term in enumerate(feature_names)}
selected_indices = [term_position[term] for term in top_keywords['term']]
reduced_selected = reduced[selected_indices]
cluster_selected = clusters[selected_indices]
terms_selected = top_keywords['term'].tolist()

# Plot
plt.figure(figsize=(10, 6))
scatter = plt.scatter(reduced_selected[:, 0], reduced_selected[:, 1], c=cluster_selected, cmap='tab10', s=100)

# Annotate terms
for i, term in enumerate(terms_selected):
    plt.annotate(term, (reduced_selected[i, 0], reduced_selected[i, 1]), fontsize=10)

plt.title("TF-IDF Term Clustering (Top Terms Only)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.tight_layout()
plt.show()


## Convex hulls to the above PCA plot

In [None]:
from scipy.spatial import ConvexHull
import numpy as np
import matplotlib.pyplot as plt

# Group coordinates, terms, and clusters together (parallel arrays)
points = np.array(reduced_selected)
labels = np.array(cluster_selected)
terms = np.array(terms_selected)

# Plot setup
plt.figure(figsize=(10, 6))
colors = plt.cm.tab10(np.arange(10))  # one colour per cluster id (up to 10)

# Plot points per cluster with convex hulls
for cluster_id in np.unique(labels):
    mask = labels == cluster_id
    cluster_points = points[mask]

    # Plot points
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], s=100,
                color=colors[cluster_id], label=f"Theme {cluster_id + 1}", alpha=0.6)

    # Draw convex hull if enough points (a 2-D hull needs at least 3)
    if len(cluster_points) >= 3:
        hull = ConvexHull(cluster_points)
        for simplex in hull.simplices:
            plt.plot(cluster_points[simplex, 0], cluster_points[simplex, 1],
                     color=colors[cluster_id], linewidth=2)

    # Annotate terms. The semi-transparent white box keeps labels readable on
    # top of markers and hull lines; it is applied to every label here rather
    # than through a stray call after the loop, which only re-drew the last term.
    for (x, y), term in zip(cluster_points, terms[mask]):
        plt.text(x, y, term, fontsize=10, ha='center', va='center',
                 bbox=dict(facecolor='white', edgecolor='none', alpha=0.7))

plt.title("TF-IDF Term Clustering (Top Terms Only) with Convex Hulls")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend()
plt.tight_layout()

plt.savefig("term_clustering_convex.png", dpi=300)
plt.show()


#### Avoid overlapping text

In [None]:
%%capture
!pip install adjustText

In [None]:
from adjustText import adjust_text
from scipy.spatial import ConvexHull
import numpy as np
import matplotlib.pyplot as plt

# Parallel arrays: 2-D coordinates, cluster label and term of each keyword
points = np.array(reduced_selected)
labels = np.array(cluster_selected)
terms = np.array(terms_selected)

plt.figure(figsize=(10, 6))
colors = plt.cm.tab10(np.arange(10))
texts = []  # label artists, handed to adjust_text afterwards

for cluster_id in np.unique(labels):
    mask = labels == cluster_id
    cluster_points = points[mask]
    cluster_color = colors[cluster_id]

    # Markers of this cluster
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], s=100,
                color=cluster_color, label=f"Theme {cluster_id + 1}", alpha=0.6)

    # Hull outline; a 2-D hull needs at least three points
    if len(cluster_points) >= 3:
        hull = ConvexHull(cluster_points)
        for simplex in hull.simplices:
            plt.plot(cluster_points[simplex, 0], cluster_points[simplex, 1],
                     color=cluster_color, linewidth=2)

    # Place each label on its point for now and remember the artist
    for (x, y), term in zip(cluster_points, terms[mask]):
        texts.append(plt.text(x, y, term, fontsize=10, ha='center', va='center'))

# Let adjustText reposition the labels it finds overlapping
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

plt.title("TF-IDF Term Clustering (Top Terms Only) with Convex Hulls and Adjusted Labels")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend()
plt.tight_layout()
plt.savefig("term_clustering_convex_good.png", dpi=300)
plt.show()


---
# Group comparison (trial) before Step2 below

In [None]:
# Vocabulary of whichever vectorizer is currently fitted, and its size.
# NOTE(review): an earlier comment expected 751 terms here; that figure is not
# derivable from this notebook's code — confirm against the actual output.
terms = vectorizer.get_feature_names_out()
vocabulary_size = len(terms)
print(vocabulary_size)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Refit TF-IDF on the essay text in 'Essay-E' (not the lemmatized column).
# NOTE: this rebinds `vectorizer` and `tfidf_matrix`, replacing the unigram
# model fitted on df['lemmatized']; `feature_names` and `tfidf_means` are not
# updated here and keep describing that earlier model.
tfidf_params = dict(
    stop_words='english',
    lowercase=True,
    max_df=0.8,
    min_df=2,
    ngram_range=(1, 2),  # unigrams and bigrams
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Overwrite the essay column in place so missing essays become empty strings
essays_filled = df['Essay-E'].fillna('')
df['Essay-E'] = essays_filled

# Fit on every essay; rows of tfidf_matrix follow the row order of df
tfidf_matrix = vectorizer.fit_transform(df['Essay-E'])

# Vocabulary, aligned with the columns of tfidf_matrix
terms = vectorizer.get_feature_names_out()


In [None]:
# ✅ Step 3: Separate TF-IDF matrices by group
import numpy as np

# Row positions of each group. tfidf_matrix is indexed by position, so use
# positions derived from a boolean mask rather than df.index labels: labels
# only coincide with positions while df keeps its default RangeIndex.
group_a_idx = np.asarray(df['Group'] == 'A').nonzero()[0]
group_b_idx = np.asarray(df['Group'] == 'B').nonzero()[0]

# Subset the TF-IDF matrix (rows = essays of that group)
tfidf_a = tfidf_matrix[group_a_idx]
tfidf_b = tfidf_matrix[group_b_idx]

# Compute mean TF-IDF scores per term for each group.
# .mean(axis=0) on a sparse matrix returns a 1 x n_terms matrix; flatten to 1-D.
mean_a = np.asarray(tfidf_a.mean(axis=0)).flatten()
mean_b = np.asarray(tfidf_b.mean(axis=0)).flatten()


In [None]:
# Step 4: The vocabulary and both per-group mean vectors must be the same
# length before they are combined into one table; print the three lengths
# (terms, mean_a, mean_b) in that order.
for sequence in (terms, mean_a, mean_b):
    print(len(sequence))


In [None]:
# ✅ Step 4: Comparison table — one row per term with both group means, the
# signed difference (A - B) and its magnitude
comparison_df = pd.DataFrame({
    'term': terms,
    'mean_A': mean_a,
    'mean_B': mean_b,
    'diff': mean_a - mean_b
}).assign(abs_diff=lambda frame: frame['diff'].abs())

# Largest group differences first, regardless of direction
comparison_df_sorted = comparison_df.sort_values(by='abs_diff', ascending=False)


In [None]:
# ✅ Step 5: Visualize key differences (optional)
import matplotlib.pyplot as plt

# Keep the terms with the largest absolute A-vs-B difference
top_n = 30
top_diff = comparison_df_sorted.head(top_n)

# Blue = higher mean TF-IDF in Group A (diff > 0), orange = otherwise
bar_colors = ['blue' if delta > 0 else 'orange' for delta in top_diff['diff']]

plt.figure(figsize=(10, 6))
plt.barh(top_diff['term'], top_diff['diff'], color=bar_colors)
plt.axvline(0, color='gray', linewidth=1)
plt.title("Top Keyword Differences Between Group A and B")
plt.xlabel("TF-IDF Difference (A - B)")
plt.gca().invert_yaxis()  # largest difference at the top
plt.tight_layout()
plt.savefig("tfidf_group_comparison.png", dpi=300)
plt.show()


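# (Continued code) – belongs before the group comparison

These cells use `tfidf_matrix`, `feature_names`, and `tfidf_means` from the lemmatized TF-IDF step. The group-comparison cells above refit `tfidf_matrix`, so run this section before them (or re-run the lemmatized TF-IDF cell first).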

In [None]:
# ✅ Step 2: Extract top-N terms by mean TF-IDF

# feature_names / tfidf_means come from the TF-IDF fit on df['lemmatized'],
# while the group-comparison cells rebind tfidf_matrix to a different fit.
# If they are out of sync, top_indices would select unrelated columns (or
# fail with an IndexError), so stop with a clear message instead.
if tfidf_matrix.shape[1] != len(feature_names) or len(tfidf_means) != len(feature_names):
    raise RuntimeError(
        "tfidf_matrix does not match feature_names/tfidf_means "
        f"({tfidf_matrix.shape[1]} columns vs {len(feature_names)} terms). "
        "Re-run the TF-IDF cell on df['lemmatized'] before this step."
    )

top_n = 30
top_indices = tfidf_means.argsort()[::-1][:top_n]  # highest mean first

top_terms = feature_names[top_indices]
top_scores = tfidf_means[top_indices]

# Create new matrix with only those top terms (essays x top_n)
tfidf_top_matrix = tfidf_matrix[:, top_indices]



In [None]:
# 🎯 Step 3: Cluster the top terms (NOT full terms)

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

num_themes = 5

# Dense (terms x documents) array of the top terms only. Densifying once
# lets KMeans and PCA share the same input; PCA in older scikit-learn
# releases rejects sparse matrices, and the earlier PCA cell also uses
# .toarray().
top_term_vectors = tfidf_top_matrix.T.toarray()

# n_init=10 matches every other KMeans call in this notebook and keeps the
# result independent of the library's changing default.
kmeans = KMeans(n_clusters=num_themes, random_state=42, n_init=10)
clusters = kmeans.fit_predict(top_term_vectors)  # cluster terms

# PCA for visualization (2-D coordinates of each top term)
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(top_term_vectors)



In [None]:
# ✅ Step 4: Prepare plotting DataFrame

import pandas as pd

# One row per top term: mean score, cluster label and 2-D PCA coordinates
pca_x, pca_y = pca_result[:, 0], pca_result[:, 1]
cluster_df = pd.DataFrame(dict(
    term=top_terms,
    score=top_scores,
    cluster=clusters,
    x=pca_x,
    y=pca_y,
))



In [None]:
# 🖼️ Step 5: Scatter plot of the clustered keywords, with labels nudged to reduce overlap

import matplotlib.pyplot as plt
from adjustText import adjust_text

colors = ['red', 'blue', 'green', 'orange', 'purple']  # one entry per theme index

plt.figure(figsize=(10, 7))

# Draw each cluster as its own series so it gets a separate legend entry
for c in range(num_themes):
    in_theme = cluster_df['cluster'] == c
    plt.scatter(
        cluster_df.loc[in_theme, 'x'],
        cluster_df.loc[in_theme, 'y'],
        label=f"Theme {c+1}",
        s=60,
        alpha=0.7,
        color=colors[c],
    )

# Place every keyword at its point, then let adjust_text move the labels apart
texts = [
    plt.text(x_pos, y_pos, term, fontsize=10)
    for x_pos, y_pos, term in zip(cluster_df['x'], cluster_df['y'], cluster_df['term'])
]
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray'))

plt.title("Top 30 Keywords Clustered by TF-IDF Similarity (PCA + KMeans)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Scatter plot of the clustered keywords with a convex-hull outline around each theme.
# Relies on `plt`, `adjust_text`, `cluster_df` and `num_themes` from the cells above.
from scipy.spatial import ConvexHull, QhullError

plt.figure(figsize=(10, 7))
colors = ['red', 'blue', 'green', 'orange', 'purple']  # one entry per theme index

for c in range(num_themes):
    subset = cluster_df[cluster_df['cluster'] == c]
    plt.scatter(subset['x'], subset['y'], label=f"Theme {c+1}", color=colors[c], alpha=0.7)

    # A 2-D hull needs at least three points
    if len(subset) < 3:
        continue

    points = subset[['x', 'y']].to_numpy()
    try:
        hull = ConvexHull(points)
    except QhullError:
        # Qhull rejects geometrically degenerate input (e.g. duplicate or collinear
        # points). Keep the scatter for this theme and skip only its outline instead
        # of aborting the whole figure.
        continue

    # Each simplex is a pair of row positions in `points` forming one hull edge
    for simplex in hull.simplices:
        plt.plot(points[simplex, 0], points[simplex, 1], color=colors[c], linewidth=2)

# Label keywords, then let adjust_text move overlapping labels apart
texts = [
    plt.text(row['x'], row['y'], row['term'], fontsize=10)
    for _, row in cluster_df.iterrows()
]
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray'))

plt.title("Keyword Clusters with Convex Hulls")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Build `themes`: a dict mapping "Theme 1" .. "Theme <num_themes>" to the (up to) six
# highest-scoring terms of each cluster, then print one line per theme.
#
# NOTE(review): this reads `tfidf_scores`, which is defined outside this section and must
# provide 'term', 'score' and 'cluster' columns. Confirm that its 'cluster' column reflects
# the same clustering as the `cluster_df` used for the plots above.
themes = {}

for i in range(num_themes):
    terms_in_cluster = tfidf_scores[tfidf_scores['cluster'] == i]
    # Use a dedicated name: rebinding `top_terms` here would overwrite the array of top-N
    # term labels that the plotting-DataFrame cell reads, so re-running that cell would fail.
    theme_top = terms_in_cluster.sort_values(by='score', ascending=False).head(6)
    themes[f"Theme {i+1}"] = list(theme_top['term'])

# Print results
for name, keywords in themes.items():
    print(f"{name}: {', '.join(keywords)}")


# ✅ Next Steps After Theme Extraction

## Step 1: Interpret and Label Each Theme

In [None]:
# List every theme with its keywords so a descriptive label can be written next to it.
# Themes are numbered by their position in `themes`, starting at 1.
for idx, kw_list in enumerate(themes.values(), start=1):
    print(f"Theme {idx}: {', '.join(kw_list)}")
    # After printing, manually add:
    # → Suggested Label: e.g., "Digital Confidence and Skill Growth"

## Step 2: Visualize Theme-Keyword Relationships

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Link every theme node to each of its keywords
G = nx.Graph()
G.add_edges_from(
    (theme, kw)
    for theme, keywords in themes.items()
    for kw in keywords
)

# Draw the network; the fixed seed keeps the spring layout the same on every run
plt.figure(figsize=(10, 6))
pos = nx.spring_layout(G, seed=42)
nx.draw(
    G,
    pos,
    with_labels=True,
    node_color='lightblue',
    edge_color='gray',
    node_size=1500,
    font_size=10,
)
plt.title("Thematic Keyword Network")
plt.show()


---
GGM trials (7/27)

In [None]:
# STEP 1: Install required libraries
!pip install matplotlib networkx scikit-learn pandas --quiet

In [None]:
# STEP 2: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.covariance import GraphicalLassoCV
from sklearn.preprocessing import StandardScaler

# STEP 3: Upload your data (opens Colab's interactive file picker)
from google.colab import files
uploaded = files.upload()

# STEP 4: Load your CSV file
# Fail with a clear message instead of an IndexError when nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; upload the survey CSV and re-run this cell.")
# Only the first uploaded file is read; any further uploads are ignored.
# NOTE(review): this rebinds `df`, replacing the DataFrame loaded at the top of the notebook.
df = pd.read_csv(next(iter(uploaded)))

# STEP 5: Select relevant columns by name or index
# Replace with your actual column names if different
# NOTE(review): the notebook header names the constructs DC / PTB / E&M, whereas the
# groups here are called Teaching / Learning / Tech — confirm the intended labels.
teaching_cols = ['Q1', 'Q4']
learning_cols = ['Q2','Q5']
tech_cols = ['Q3','Q6']

selected_cols = teaching_cols + learning_cols + tech_cols
# Drop every row that has a missing value in any of the selected columns
data = df[selected_cols].dropna()

# STEP 6: Standardize the selected columns
scaler = StandardScaler()
X = scaler.fit_transform(data)

# STEP 7: Fit the Graphical Lasso model (regularization picked by cross-validation)
model = GraphicalLassoCV().fit(X)

# STEP 8: Turn the precision matrix P into partial correlations:
#   partial_corr[i, j] = -P[i, j] / sqrt(P[i, i] * P[j, j])
precision = model.precision_
diag_root = np.sqrt(np.diag(precision))
partial_corr = -precision / np.outer(diag_root, diag_root)
np.fill_diagonal(partial_corr, 0)  # a variable is never linked to itself

# Keep only links whose absolute partial correlation exceeds the threshold
threshold = 0.1
adjacency = (np.abs(partial_corr) > threshold).astype(int)

# Create graph
G = nx.Graph()
labels = selected_cols

# Register every column as a node tagged with its group (used later for coloring);
# anything that is neither a teaching nor a learning column is tagged 'Tech'
for label in labels:
    if label in teaching_cols:
        group_name = 'Teaching'
    elif label in learning_cols:
        group_name = 'Learning'
    else:
        group_name = 'Tech'
    G.add_node(label, group=group_name)

# Add one edge per retained pair, weighted by its partial correlation.
# Only the strict upper triangle is scanned, so each pair is visited once.
for i, j in np.argwhere(np.triu(adjacency, k=1)):
    G.add_edge(labels[i], labels[j], weight=partial_corr[i, j])

# STEP 9: Draw the network with nodes colored by group
group_colors = {
    'Teaching': '#66c2a5',
    'Learning': '#fc8d62',
    'Tech': '#8da0cb'
}
# Look up each node's color from its 'group' attribute, in node order
node_colors = [group_colors[attrs['group']] for _, attrs in G.nodes(data=True)]

fig, ax = plt.subplots(figsize=(10, 8))
pos = nx.spring_layout(G, seed=42)  # fixed seed -> same layout on every run

# Nodes, labels and edges are drawn as separate layers on the same axes
nx.draw_networkx_nodes(G, pos, ax=ax, node_color=node_colors, node_size=800, alpha=0.9)
nx.draw_networkx_labels(G, pos, ax=ax, font_size=10)
nx.draw_networkx_edges(G, pos, ax=ax, edge_color='gray', width=1.5, alpha=0.7)

ax.set_title("GGM Network of ChatGPT Survey Beliefs", fontsize=14)
ax.axis('off')
plt.show()
