<a href="https://colab.research.google.com/github/MK316/Workingpapers/blob/main/2025-insights/2025c_Kim%26Lee_paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Thematic analysis [1]: 2025. 7. 26

# 1. Data to upload

Notes:

words replaced: Artificial intelligence = AI, applications = apps

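+ data01.csv: 33 essays with SID, Level, Essay
+ data02.csv: 33 essays ('artificial intelligence' = AI)
+ data03.csv: the file actually loaded in Section 2 below; the code expects it to contain a `Reflection` column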

In [None]:
from google.colab import files
# Open the Colab upload dialog; the loading cell below reads 'data03.csv',
# so that is the file to pick here.
uploaded = files.upload()

# 2. Load and view the data

In [None]:
import pandas as pd

# Load data (later cells read the essay text from the 'Reflection' column)
df = pd.read_csv('data03.csv')
# Preview the first rows as the cell output
df.head()

In [None]:
import pandas as pd
import re

# Assume df is already loaded as:
# df = pd.read_csv('data03.csv')

# Ensure the text is string type.
# Missing reflections are turned into empty strings first: a bare astype(str)
# would convert NaN into the literal text 'nan', which would then be tokenized
# and counted as a word.
df['Reflection'] = df['Reflection'].fillna('').astype(str)

# Step 1: Replace 'artificial intelligence' with 'AI' explicitly (before lemmatization)
df['Reflection'] = df['Reflection'].str.replace(
    r'\bartificial intelligence\b', 'AI', flags=re.IGNORECASE, regex=True
)

# Step 2: Replace other multi-word expressions with underscore-joined forms
# so each phrase is written as one unbroken string.
multiword_terms = {
    r'\bdigital literacy\b': 'digital_literacy',
    r'\benglish education\b': 'english_education'
}

for pattern, replacement in multiword_terms.items():
    df['Reflection'] = df['Reflection'].str.replace(
        pattern, replacement, flags=re.IGNORECASE, regex=True
    )

# ✅ Optional preview
df['Reflection'].head()


# 3. Clean the text and lemmatize

In [None]:
import nltk
from nltk.corpus import wordnet, stopwords
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (run once).
# Recent NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# where older ones used 'punkt' and 'averaged_perceptron_tagger'. Both
# generations are requested so word_tokenize() / pos_tag() find their data
# whichever NLTK version the runtime ships.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'stopwords',
):
    nltk.download(resource)

# English stopword list as a set for fast membership tests
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preserve this token as-is
important_tokens = {'AI'}

# Post-lemmatization corrections, keyed by the lemma the lemmatizer returns
manual_replace = {
    'apps': 'app',
    'cod': 'code',         # Fix mislemmatized 'coding'
    'coding': 'code'       # Prevent future errors if it escapes lemmatizer
}

def get_wordnet_pos(tag):
    """Map a POS tag string to the WordNet POS constant used for lemmatizing.

    Only the first character of ``tag`` matters: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty string) falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

def clean_and_lemmatize_pos(text):
    """Tokenize, filter and lemmatize ``text``; return the result space-joined.

    Tokens listed in ``important_tokens`` are kept exactly as written. Every
    other token is lowercased, dropped if it is a stopword or has two or fewer
    characters, lemmatized with the POS derived from its tag, and finally
    passed through the ``manual_replace`` corrections.
    """
    kept = []
    for raw_token, tag in pos_tag(word_tokenize(text)):
        # Protected tokens bypass lowercasing, filtering and lemmatization.
        if raw_token in important_tokens:
            kept.append(raw_token)
            continue

        lowered = raw_token.lower()
        if lowered in stop_words or len(lowered) <= 2:
            continue

        base_form = lemmatizer.lemmatize(lowered, get_wordnet_pos(tag))
        kept.append(manual_replace.get(base_form, base_form))

    return ' '.join(kept)

# Apply to the DataFrame: store the cleaned, lemmatized text of each reflection
# in a new 'lemmatized' column (the TF-IDF cell below vectorizes this column).
df['lemmatized'] = df['Reflection'].apply(clean_and_lemmatize_pos)


✅ Step 3: TF-IDF with Unigrams

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = TfidfVectorizer(
    min_df=2,
    max_df=0.9,
    stop_words='english',
    ngram_range=(1, 1),  # unigram only
    # The lemmatization step already lowercases every token except the
    # protected ones (e.g. 'AI'). Leaving the vectorizer's own lowercasing on
    # would turn 'AI' into 'ai' and undo that deliberate casing.
    lowercase=False,
)

# Rows follow df['lemmatized']; columns are the vocabulary terms.
tfidf_matrix = vectorizer.fit_transform(df['lemmatized'])
feature_names = vectorizer.get_feature_names_out()
# Mean TF-IDF of each term across all rows, flattened to a 1-D array
tfidf_means = np.asarray(tfidf_matrix.mean(axis=0)).flatten()

# View top 30
top_n = 30
top_indices = tfidf_means.argsort()[::-1][:top_n]
for i in top_indices:
    print(f"{feature_names[i]:<30} {tfidf_means[i]:.6f}")


### Fix 'app-apps' 'student-students'

# Get top N words

In [None]:
# Rank every term by mean TF-IDF (highest first) and print the leading `top_n`.
top_n = 30
top_indices = tfidf_means.argsort()[::-1][:top_n]
for term, mean_score in zip(feature_names[top_indices], tfidf_means[top_indices]):
    print(f"{term:<30} {mean_score:.6f}")


## Recommended Preprocessing Order
Lowercasing

Removing non-alphabetic characters (optional, but useful)

Tokenization

Stopword removal

Lemmatization

Join back into text for TF-IDF

# 4. Tokenization, stopword removal, and lemmatization

+ WordNetLemmatizer

## 3) Display keywords

In [None]:
# Create a DataFrame of top terms and their scores
import pandas as pd

# Terms and mean scores picked out with the ranked `top_indices` in one step each
top_keywords = pd.DataFrame({
    'term': feature_names[top_indices],
    'score': tfidf_means[top_indices],
})


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the top keywords; both columns are reversed so the
# highest-scoring term ends up at the top of the chart.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_keywords['term'][::-1], top_keywords['score'][::-1])
ax.set_xlabel('Average TF-IDF Score')
ax.set_title(f'Top {top_n} Keywords by TF-IDF')
fig.tight_layout()
plt.show()


### Heatmap with top N keywords

In [None]:
import numpy as np
import pandas as pd

# Compute average TF-IDF scores
tfidf_means = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = pd.DataFrame({'term': feature_names, 'score': tfidf_means})

# Sort and select top 30.
# Stored under its own name: `top_keywords` is the DataFrame used by the
# bar-chart cells above, and overwriting it with a plain list here would break
# those cells if they are re-run.
top_n = 30
heatmap_terms = tfidf_scores.sort_values(by='score', ascending=False).head(top_n)['term'].tolist()

# Convert the sparse matrix to dense and wrap in a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Keep only the top 30 keywords (columns ordered by descending mean score)
tfidf_top_df = tfidf_df[heatmap_terms]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the per-essay TF-IDF scores for the selected keywords
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(tfidf_top_df, cmap='YlGnBu', linewidths=0.5, ax=ax)

ax.set_title("TF-IDF Heatmap of Top 30 Keywords Across Reflection Essays")
ax.set_xlabel("Keywords")
ax.set_ylabel("Essays")
fig.tight_layout()

# Write the image to disk before displaying it (.pdf or .svg also work)
fig.savefig("tfidf_heatmap.png", dpi=300)
plt.show()


### Heatmap with scores inside

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Set threshold for showing numbers: a cell is annotated only if its
# TF-IDF score is >= this value.
threshold = 0.2

# Create the label matrix: show score if >= threshold, else empty string.
# Built column by column with Series.map instead of DataFrame.applymap, which
# is deprecated in recent pandas; the resulting labels are the same.
annot_labels = tfidf_top_df.apply(
    lambda column: column.map(lambda x: f"{x:.2f}" if x >= threshold else "")
)

# Plot the heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(
    tfidf_top_df,
    cmap='YlGnBu',
    linewidths=0.5,
    annot=annot_labels,
    fmt="",          # labels are already formatted strings
    cbar=True
)

plt.title("TF-IDF Heatmap of Top 30 Keywords Across Reflection Essays")
plt.xlabel("Keywords")
plt.ylabel("Essays")
plt.tight_layout()
plt.show()


---

# Part II. Group similar keywords into potential themes

Identify and label recurring themes across the 33 reflection essays, using a semi-automated method based on high-TF-IDF terms.

### ✅ 1. Elbow Method
This helps you choose k (the number of clusters): look for the point where adding another cluster no longer reduces the inertia (within-cluster sum of squares) by much.

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Fit KMeans on the term vectors for every candidate k and record the inertia.
k_range = range(2, 11)
term_vectors = tfidf_matrix.T  # Transpose: terms x documents
inertia = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(term_vectors).inertia_
    for k in k_range
]

# Plot the elbow curve
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_range, inertia, marker='o')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia (Within-Cluster Sum of Squares)')
ax.set_title('Elbow Method for Optimal k')
ax.set_xticks(k_range)
ax.grid(True)
fig.tight_layout()
plt.show()


### ✅ 2. Silhouette Score
This evaluates how well-separated the clusters are. Higher score = better-defined clusters.

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the term clustering for every candidate k
k_range = range(2, 11)
term_vectors = tfidf_matrix.T
sil_scores = []

for n_clusters in k_range:
    assignments = KMeans(
        n_clusters=n_clusters, random_state=42, n_init=10
    ).fit_predict(term_vectors)
    sil_scores.append(silhouette_score(term_vectors, assignments))

# Plot silhouette scores
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(k_range, sil_scores, marker='o')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Silhouette Score')
ax.set_title('Silhouette Analysis for Optimal k')
ax.set_xticks(k_range)
ax.grid(True)
fig.tight_layout()
plt.show()


Number of themes = 3 or 4 based on the above k values

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Number of themes (adjust after inspecting the elbow / silhouette plots)
num_themes = 4

# Each row of the transposed matrix is one term described by its document scores
term_matrix = tfidf_matrix.T

# Cluster the terms
kmeans = KMeans(n_clusters=num_themes, random_state=42, n_init=10)
clusters = kmeans.fit_predict(term_matrix)

# One row per term with its mean score and cluster id, ordered by cluster and
# then by descending score for easy inspection
tfidf_scores = pd.DataFrame({
    'term': feature_names,
    'score': tfidf_means,
    'cluster': clusters
}).sort_values(by=['cluster', 'score'], ascending=[True, False])

# Display the ten highest-scoring terms of each cluster
for c in range(num_themes):
    in_cluster = tfidf_scores['cluster'] == c
    print(f"\nCluster {c}:")
    print(tfidf_scores.loc[in_cluster, ['term', 'score']].head(10))


k=6

Plot

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the (dense) term vectors onto two principal components for plotting
pca = PCA(n_components=2)
reduced = pca.fit_transform(term_matrix.toarray())

# Scatter of all terms, coloured by cluster id
plt.figure(figsize=(10, 6))
scatter = plt.scatter(reduced[:, 0], reduced[:, 1], c=clusters, cmap='tab10', s=100)

# Write each term next to its point
for term, (x_coord, y_coord) in zip(feature_names, reduced):
    plt.annotate(term, (x_coord, y_coord), fontsize=9)

plt.title("TF-IDF Term Clustering (PCA-reduced)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.tight_layout()
plt.show()


---
Before revision

### (Optional) PCA plot

### Show only top keywords per cluster

In [None]:
# Show only top N keywords per cluster
top_n_per_cluster = 7

# Sort once by cluster and descending score, then keep the first N rows of
# every cluster. This gives the per-cluster top terms without
# groupby(...).apply(...), which newer pandas warns about when the applied
# function also receives the grouping column.
top_keywords = (
    tfidf_scores
    .sort_values(['cluster', 'score'], ascending=[True, False])
    .groupby('cluster')
    .head(top_n_per_cluster)
    .reset_index(drop=True)
)

# Filter PCA-reduced coordinates to just top keywords.
# A term -> position lookup is built once instead of calling
# feature_names.tolist().index(term) for every term.
term_to_index = {term: idx for idx, term in enumerate(feature_names)}
selected_indices = [term_to_index[term] for term in top_keywords['term']]
reduced_selected = reduced[selected_indices]
cluster_selected = clusters[selected_indices]
terms_selected = top_keywords['term'].tolist()

# Plot
plt.figure(figsize=(10, 6))
scatter = plt.scatter(reduced_selected[:, 0], reduced_selected[:, 1], c=cluster_selected, cmap='tab10', s=100)

# Annotate terms
for i, term in enumerate(terms_selected):
    plt.annotate(term, (reduced_selected[i, 0], reduced_selected[i, 1]), fontsize=10)

plt.title("TF-IDF Term Clustering (Top Terms Only)")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.tight_layout()
plt.show()


## Convex hulls to the above PCA plot

In [None]:
from scipy.spatial import ConvexHull
import numpy as np
import matplotlib.pyplot as plt

# Group coordinates, terms, and clusters together
points = np.array(reduced_selected)
labels = np.array(cluster_selected)
terms = np.array(terms_selected)

# Plot setup
plt.figure(figsize=(10, 6))
colors = plt.cm.tab10(np.arange(10))

# Semi-transparent white box drawn behind every label so the text stays
# readable on top of points and hull lines. It is applied inside the loop; a
# single plt.text() call placed after the loop would only restyle the last
# term, because x, y and term would just hold the final loop values.
label_box = dict(facecolor='white', edgecolor='none', alpha=0.7)

# Plot points per cluster with convex hulls
for cluster_id in np.unique(labels):
    mask = labels == cluster_id
    cluster_points = points[mask]

    # Plot points
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], s=100,
                color=colors[cluster_id], label=f"Theme {cluster_id + 1}", alpha=0.6)

    # Draw convex hull if enough points
    if len(cluster_points) >= 3:
        hull = ConvexHull(cluster_points)
        for simplex in hull.simplices:
            plt.plot(cluster_points[simplex, 0], cluster_points[simplex, 1],
                     color=colors[cluster_id], linewidth=2)

    # Annotate terms
    for (x, y), term in zip(cluster_points, terms[mask]):
        plt.text(x, y, term, fontsize=10, ha='center', va='center', bbox=label_box)

plt.title("TF-IDF Term Clustering (Top Terms Only) with Convex Hulls")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend()
plt.tight_layout()

plt.savefig("term_clustering_convex.png", dpi=300)
plt.show()


#### Avoid overlapping text

In [None]:
%%capture
!pip install adjustText

In [None]:
from adjustText import adjust_text
from scipy.spatial import ConvexHull
import numpy as np
import matplotlib.pyplot as plt

# Coordinates, cluster ids and terms of the selected keywords as arrays
points = np.asarray(reduced_selected)
labels = np.asarray(cluster_selected)
terms = np.asarray(terms_selected)

# Figure, palette and the list of label artists handed to adjust_text
fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.tab10(np.arange(10))
texts = []

# One pass per cluster: points, hull outline, then labels
for cluster_id in np.unique(labels):
    mask = labels == cluster_id
    cluster_points = points[mask]
    cluster_color = colors[cluster_id]
    xs, ys = cluster_points[:, 0], cluster_points[:, 1]

    ax.scatter(xs, ys, s=100, color=cluster_color,
               label=f"Theme {cluster_id + 1}", alpha=0.6)

    # A hull is only drawn when the cluster has at least three points
    if len(cluster_points) >= 3:
        for simplex in ConvexHull(cluster_points).simplices:
            ax.plot(xs[simplex], ys[simplex], color=cluster_color, linewidth=2)

    # Keep the text artists so their positions can be adjusted afterwards
    texts.extend(
        ax.text(px, py, term, fontsize=10, ha='center', va='center')
        for (px, py), term in zip(cluster_points, terms[mask])
    )

# Adjust overlapping texts
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

ax.set_title("TF-IDF Term Clustering (Top Terms Only) with Convex Hulls and Adjusted Labels")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.legend()
fig.tight_layout()
fig.savefig("term_clustering_convex_good.png", dpi=300)
plt.show()


In [None]:
# ✅ Step 2: Extract top-N terms by mean TF-IDF
top_n = 30
ranked_by_mean = tfidf_means.argsort()[::-1]   # term positions, highest mean first
top_indices = ranked_by_mean[:top_n]

top_terms, top_scores = feature_names[top_indices], tfidf_means[top_indices]

# Keep only the columns of those top terms
tfidf_top_matrix = tfidf_matrix[:, top_indices]



In [None]:
# 🎯 Step 3: Cluster the top terms (NOT full terms)

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

num_themes = 5

# One row per top term, one column per document. Converted to a dense array:
# the earlier PCA cell also densifies its input, and PCA in older
# scikit-learn releases rejects sparse matrices.
top_term_vectors = tfidf_top_matrix.T.toarray()

# n_init=10 matches the other KMeans calls in this notebook and keeps the
# result independent of scikit-learn's changing default.
kmeans = KMeans(n_clusters=num_themes, random_state=42, n_init=10)
clusters = kmeans.fit_predict(top_term_vectors)  # cluster terms

# PCA for visualization
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(top_term_vectors)



In [None]:
# ✅ Step 4: Prepare plotting DataFrame

import pandas as pd

# One row per top term: mean score, cluster id and 2-D PCA coordinates
pca_x, pca_y = pca_result[:, 0], pca_result[:, 1]
cluster_df = pd.DataFrame(
    dict(term=top_terms, score=top_scores, cluster=clusters, x=pca_x, y=pca_y)
)



In [None]:
# 🖼️ Step 5: Plot with adjusted text (for overlap)

import matplotlib.pyplot as plt
from adjustText import adjust_text

fig, ax = plt.subplots(figsize=(10, 7))
colors = ['red', 'blue', 'green', 'orange', 'purple']

# One scatter series per theme so each gets its own colour and legend entry
for c in range(num_themes):
    subset = cluster_df.loc[cluster_df['cluster'] == c]
    ax.scatter(subset['x'], subset['y'], label=f"Theme {c+1}", s=60, alpha=0.7, color=colors[c])

# Add keyword labels, keeping the artists for adjust_text
texts = [
    ax.text(row.x, row.y, row.term, fontsize=10)
    for row in cluster_df.itertuples(index=False)
]

adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray'))

ax.set_title("Top 30 Keywords Clustered by TF-IDF Similarity (PCA + KMeans)")
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
from scipy.spatial import ConvexHull

plt.figure(figsize=(10, 7))
colors = ['red', 'blue', 'green', 'orange', 'purple']

for c in range(num_themes):
    subset = cluster_df[cluster_df['cluster'] == c]
    plt.scatter(subset['x'], subset['y'], label=f"Theme {c+1}", color=colors[c], alpha=0.7)

    # Draw convex hull (needs at least three points)
    if len(subset) >= 3:
        hull_points = subset[['x', 'y']].to_numpy()
        hull = ConvexHull(hull_points)
        for simplex in hull.simplices:
            # The colour is passed by keyword rather than as a positional
            # format string, so it cannot be misread as a line/marker spec.
            plt.plot(hull_points[simplex, 0], hull_points[simplex, 1],
                     color=colors[c], linewidth=2)

# Label keywords
texts = []
for _, row in cluster_df.iterrows():
    texts.append(plt.text(row['x'], row['y'], row['term'], fontsize=10))
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray'))

plt.title("Keyword Clusters with Convex Hulls")
# The axes are the two PCA components stored in cluster_df; labelled the same
# way as the scatter plot of the same data above.
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Map each theme name to its six highest-scoring keywords.
themes = {}

# Read the clusters from cluster_df, i.e. the top-term clustering that was run
# with the current `num_themes` and is shown in the plots above. `tfidf_scores`
# still carries the labels of the earlier 4-cluster run over the whole
# vocabulary, so with num_themes = 5 it would leave the last theme empty and
# disagree with those plots.
for i in range(num_themes):
    terms_in_cluster = cluster_df[cluster_df['cluster'] == i]
    # Named theme_terms so the `top_terms` array that cluster_df is built from
    # is not overwritten.
    theme_terms = terms_in_cluster.sort_values(by='score', ascending=False).head(6)
    themes[f"Theme {i+1}"] = list(theme_terms['term'])

# Print results
for name, keywords in themes.items():
    print(f"{name}: {', '.join(keywords)}")


# ✅ Next Steps After Theme Extraction

## Step 1: Interpret and Label Each Theme

In [None]:
# List every theme with its keywords, numbered from 1 in insertion order
for theme_number, keywords in enumerate(themes.values(), start=1):
    keyword_list = ', '.join(keywords)
    print(f"Theme {theme_number}: {keyword_list}")
    # After printing, manually add:
    # → Suggested Label: e.g., "Digital Confidence and Skill Growth"


## Step 2: Visualize Theme-Keyword Relationships

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Graph linking every theme node to each of its keyword nodes
G = nx.Graph()
G.add_edges_from(
    (theme, kw)
    for theme, keywords in themes.items()
    for kw in keywords
)

# Plot the network with a seeded spring layout so the picture is repeatable
plt.figure(figsize=(10, 6))
pos = nx.spring_layout(G, seed=42)
draw_options = dict(
    with_labels=True,
    node_color='lightblue',
    edge_color='gray',
    node_size=1500,
    font_size=10,
)
nx.draw(G, pos, **draw_options)
plt.title("Thematic Keyword Network")
plt.show()


---
GGM (Gaussian graphical model) trials (7/27)

In [None]:
# STEP 1: Install required libraries
!pip install matplotlib networkx scikit-learn pandas --quiet

In [None]:
# STEP 2: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.covariance import GraphicalLassoCV
from sklearn.preprocessing import StandardScaler

# STEP 3: Upload your data
from google.colab import files
uploaded = files.upload()

# STEP 4: Load your CSV file (the first uploaded file).
# The survey table gets its own name so the essay DataFrame `df` used by the
# text-analysis cells above is not overwritten; those cells would fail on a
# re-run if `df` suddenly held the survey data.
survey_df = pd.read_csv(list(uploaded.keys())[0])

# STEP 5: Select relevant columns by name or index
# Replace with your actual column names if different
teaching_cols = ['Q4', 'Q5','Q6','Q7']
learning_cols = ['Q8','Q9','Q10','Q11']
tech_cols = ['Q12','Q13','Q14']

# Rows with a missing answer in any selected column are dropped
selected_cols = teaching_cols + learning_cols + tech_cols
data = survey_df[selected_cols].dropna()

# STEP 6: Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(data)

# STEP 7: Estimate GGM using Graphical Lasso
model = GraphicalLassoCV().fit(X)

# STEP 8: Build graph from precision matrix
# partial_corr[i, j] = -precision[i, j] / sqrt(precision[i, i] * precision[j, j]);
# the diagonal is zeroed so a variable is never linked to itself.
precision = model.precision_
diag_sqrt = np.sqrt(np.diag(precision))
partial_corr = -precision / np.outer(diag_sqrt, diag_sqrt)
np.fill_diagonal(partial_corr, 0)

# Threshold to remove very weak edges: 1 where |partial correlation| exceeds it
threshold = 0.1
adjacency = (np.abs(partial_corr) > threshold).astype(int)

# Create graph
G = nx.Graph()
labels = selected_cols

# Group of every question; anything not listed as Teaching or Learning is Tech.
# Teaching is merged last so it takes precedence if a column were in both lists.
group_of = {
    **dict.fromkeys(learning_cols, 'Learning'),
    **dict.fromkeys(teaching_cols, 'Teaching'),
}

# Add nodes with color group
for label in labels:
    G.add_node(label, group=group_of.get(label, 'Tech'))

# Add edges with weight (partial correlation) for every unordered pair i < j
# that survived the threshold
index_pairs = (
    (i, j)
    for i in range(len(labels))
    for j in range(i + 1, len(labels))
)
for i, j in index_pairs:
    if adjacency[i, j]:
        G.add_edge(labels[i], labels[j], weight=partial_corr[i, j])

# STEP 9: Visualize with colored groups
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G, seed=42)

# Colors for groups
group_colors = {
    'Teaching': '#66c2a5',
    'Learning': '#fc8d62',
    'Tech': '#8da0cb'
}
# One colour per node, in the graph's own node order
node_colors = [group_colors[group] for _, group in G.nodes(data='group')]

# Draw nodes, labels and edges as separate layers
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=800, alpha=0.9)
nx.draw_networkx_labels(G, pos, font_size=10)
nx.draw_networkx_edges(G, pos, edge_color='gray', width=1.5, alpha=0.7)

plt.title("GGM Network of ChatGPT Survey Beliefs", fontsize=14)
plt.axis('off')
plt.show()


Centrality?

In [None]:
!pip install networkx
!pip install sklearn
!pip install pandas matplotlib seaborn numpy


In [None]:
import pandas as pd
from google.colab import files

# Upload your CSV file (must include Q4 to Q14 columns)
uploaded = files.upload()

# Load the first uploaded file.
# Kept under its own name so the essay DataFrame `df` from the text-analysis
# part of the notebook is not overwritten.
survey_df = pd.read_csv(next(iter(uploaded)))

# Filter only the relevant question columns (Q4 ... Q14) and drop rows with
# a missing answer in any of them
selected_cols = [f'Q{i}' for i in range(4, 15)]
data = survey_df[selected_cols].dropna()


In [None]:
from sklearn.covariance import GraphicalLassoCV
import numpy as np

# Fit Graphical Lasso to estimate a sparse inverse covariance (precision) matrix.
# NOTE(review): unlike the first GGM trial above, `data` is not standardized
# before fitting here; confirm whether that difference is intended.
model = GraphicalLassoCV().fit(data)

# Partial correlations from the precision matrix:
# -precision[i, j] / sqrt(precision[i, i] * precision[j, j]), diagonal set to 1
precision = model.precision_
d = np.sqrt(np.diag(precision))
partial_corr = -precision / (d[:, None] * d[None, :])
np.fill_diagonal(partial_corr, 1)


In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Build graph from partial correlations
threshold = 0.05  # to filter weak edges
G = nx.Graph()

# Question groups used for coloring
clusters = {
    "Teaching": [f'Q{i}' for i in range(4, 8)],
    "Learning": [f'Q{i}' for i in range(8, 12)],
    "Tech": [f'Q{i}' for i in range(12, 15)],
}

# Reverse lookup: question name -> group name
color_map = {
    question: group
    for group, members in clusters.items()
    for question in members
}

colors = {
    "Teaching": "#66c2a5",
    "Learning": "#fc8d62",
    "Tech": "#8da0cb"
}

# Add an edge for every pair i < j whose |partial correlation| exceeds the threshold
nodes = selected_cols
G.add_weighted_edges_from(
    (nodes[i], nodes[j], partial_corr[i, j])
    for i in range(len(nodes))
    for j in range(i + 1, len(nodes))
    if abs(partial_corr[i, j]) > threshold
)

# Add nodes with attributes (this also registers questions that have no edge)
G.add_nodes_from((node, {'group': color_map[node]}) for node in nodes)

# Draw the graph
plt.figure(figsize=(8, 6))
pos = nx.spring_layout(G, seed=42)
node_colors = [colors[color_map[node]] for node in G.nodes()]
draw_options = dict(
    with_labels=True,
    node_color=node_colors,
    edge_color='gray',
    node_size=1000,
    font_size=10,
)
nx.draw(G, pos, **draw_options)
plt.title("GGM Network of ChatGPT Survey Beliefs")
plt.show()


In [None]:
# Degree centrality of every node, printed from most to least central
centrality = nx.degree_centrality(G)
ranked_nodes = sorted(centrality.items(), key=lambda item: item[1], reverse=True)
for node, val in ranked_nodes:
    print(f"{node}: {val:.3f}")
