In [2]:
import pandas as pd
import random

# Define the learning styles and the five characteristic behaviors of each.
# The lists are disjoint: no behavior string appears under more than one style.
learning_styles = {
    "Visual": [
        "uses diagrams", "color-codes notes", "sketches processes", "prefers charts", "watches videos"
    ],
    "Verbal": [
        "reads aloud", "writes summaries", "uses mnemonics", "likes reading textbooks", "takes detailed notes"
    ],
    "Logical": [
        "solves puzzles", "enjoys math", "analyzes patterns", "asks 'why' questions", "uses step-by-step instructions"
    ],
    "Active": [
        "asks questions", "joins discussions", "teaches peers", "engages in role-play", "tries out examples"
    ],
    "Passive": [
        "listens quietly", "observes demonstrations", "reflects silently", "avoids participation", "reads quietly"
    ],
    "Multimodal": [
        "mixes diagrams and notes", "switches learning methods", "adapts easily", "uses videos and texts", "likes variety"
    ]
}

# Master behavior list: de-duplicated and sorted, so its order is the same on
# every run (the order of a bare list(set(...)) changes with string-hash randomisation).
all_behaviors = sorted({behavior for behaviors in learning_styles.values() for behavior in behaviors})

def generate_realistic_student_profiles(n=500, noise_rate=0.05, seed=None, styles=None):
    """Build a synthetic table of students, their behaviors and a style label.

    Every student is assigned a random primary style, two distinct behaviors
    drawn from that style and one behavior drawn from a different style; the
    three behaviors are then shuffled. With probability ``noise_rate`` the
    recorded label is swapped for a different style to simulate label noise.

    Parameters
    ----------
    n : int
        Number of students to generate; ids run from 1 to ``n``.
    noise_rate : float
        Probability that a student's label is replaced by another style.
    seed : int or None
        When given, a private ``random.Random(seed)`` drives all sampling so
        the result is reproducible. When ``None`` the module-level ``random``
        generator is used, exactly as before.
    styles : dict[str, list[str]] or None
        Mapping of style name to its behaviors. Defaults to the notebook's
        ``learning_styles`` dict.

    Returns
    -------
    pd.DataFrame
        Columns ``student_id``, ``behaviors`` (list of three strings) and
        ``learning_style``. The columns are present even when ``n`` is 0.

    Raises
    ------
    ValueError
        If fewer than two styles are available (a "different" style could not
        be picked), or if a chosen primary style has fewer than two behaviors
        (raised by ``sample``).
    """
    if styles is None:
        styles = learning_styles
    # The random module exposes the same choice/sample/shuffle/random API as a
    # Random instance, so either can serve as the source of randomness.
    rng = random if seed is None else random.Random(seed)

    style_keys = list(styles.keys())
    if len(style_keys) < 2:
        raise ValueError("at least two learning styles are required")

    data = []
    for i in range(1, n + 1):
        # Choose a primary style
        style = rng.choice(style_keys)

        # Pick 2 behaviors from that style
        primary_behaviors = rng.sample(styles[style], 2)

        # Pick 1 behavior from a different style to simulate mixed learning
        other_style = rng.choice([s for s in style_keys if s != style])
        mixed_behavior = rng.choice(styles[other_style])

        behaviors = primary_behaviors + [mixed_behavior]
        rng.shuffle(behaviors)

        # Simulate label noise: the behaviors stay, only the label changes
        if rng.random() < noise_rate:
            style = rng.choice([s for s in style_keys if s != style])

        data.append({
            "student_id": i,
            "behaviors": behaviors,
            "learning_style": style
        })
    # Explicit columns keep the schema stable even for an empty result.
    return pd.DataFrame(data, columns=["student_id", "behaviors", "learning_style"])

# Generate the data
# Seed the module-level generator first: the profiles are drawn with the
# `random` module, so without a fixed seed the dataset (and every metric
# computed from it) would differ on each Restart & Run All.
random.seed(42)
df = generate_realistic_student_profiles(n=500)
print(df.head())


   student_id                                          behaviors  \
0           1  [analyzes patterns, avoids participation, read...   
1           2  [engages in role-play, likes variety, teaches ...   
2           3  [sketches processes, takes detailed notes, col...   
3           4  [uses videos and texts, observes demonstration...   
4           5  [uses videos and texts, uses diagrams, watches...   

  learning_style  
0        Passive  
1         Active  
2     Multimodal  
3        Passive  
4         Visual  


Semantic Embedding

In [26]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model, then flatten each student's list
# of behaviors into a single space-separated string for it to encode.
model = SentenceTransformer("all-MiniLM-L6-v2")
df["behavior_text"] = df["behaviors"].map(" ".join)





In [28]:
from sklearn.model_selection import train_test_split

# Features are the encoded behavior texts; targets are the style labels.
X = model.encode(list(df["behavior_text"]))
y = df["learning_style"].values

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

Random Forest Classifier

In [29]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit a 100-tree random forest on the training embeddings (seeded for repeatability).
model_rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred = model_rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Active       0.72      0.87      0.79        15
     Logical       0.88      0.78      0.82        18
  Multimodal       0.85      0.65      0.73        17
     Passive       0.83      1.00      0.91        10
      Verbal       0.86      0.86      0.86        22
      Visual       0.95      1.00      0.97        18

    accuracy                           0.85       100
   macro avg       0.85      0.86      0.85       100
weighted avg       0.85      0.85      0.85       100



SGD Classifier

In [33]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

# Linear classifier trained by SGD with the logistic ("log_loss") objective,
# capped at 1000 iterations and seeded for repeatability.
sgd_model = SGDClassifier(random_state=42, max_iter=1000, loss="log_loss").fit(X_train, y_train)

# Per-class report on the same held-out rows used for the random forest.
print(classification_report(y_test, sgd_model.predict(X_test)))

              precision    recall  f1-score   support

      Active       0.88      0.93      0.90        15
     Logical       0.94      0.94      0.94        18
  Multimodal       0.94      0.88      0.91        17
     Passive       0.90      0.90      0.90        10
      Verbal       0.95      0.86      0.90        22
      Visual       0.90      1.00      0.95        18

    accuracy                           0.92       100
   macro avg       0.92      0.92      0.92       100
weighted avg       0.92      0.92      0.92       100



OpenAI Classification

In [None]:

# Split the raw frame with the same test_size / random_state used for the
# embedding features, so the LLM is scored on a comparable hold-out set.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

few_shot_examples = ""

# Styles that actually occur in the training split. A style with no training
# rows cannot supply an example (sampling from an empty frame raises).
styles = train_df['learning_style'].unique()
available_styles = set(styles)

for style in learning_styles:
    if style not in available_styles:
        continue

    # Pick one row with this learning style; the fixed random_state makes the
    # prompt (and therefore the LLM results) reproducible across runs.
    example_row = train_df[train_df['learning_style'] == style].sample(1, random_state=42).iloc[0]

    example_behaviors = ", ".join(example_row['behaviors'])
    example_style = example_row['learning_style']

    few_shot_examples += f"Behaviors: {example_behaviors}\nLearning style: {example_style}\n\n"

In [23]:
from openai import OpenAI
from dotenv import load_dotenv
import os 
# Load variables from a .env file (if any) into the process environment, then
# build the API client from the key found there; no key is hardcoded here.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
def classify_with_few_shot(behaviors, examples=None, llm_client=None):
    """Ask the chat model for a student's primary learning style.

    The prompt is the few-shot examples followed by an instruction that
    contains the new student's behaviors. The request goes to ``gpt-4o-mini``
    with ``temperature=0`` and at most 10 completion tokens.

    Parameters
    ----------
    behaviors : list[str] or str
        The student's behaviors. A list is rendered comma-separated, the same
        way the few-shot examples are written, instead of as a Python list
        repr; a string is used as-is.
    examples : str or None
        Few-shot block to prepend. Defaults to the notebook-level
        ``few_shot_examples``.
    llm_client : object or None
        Client exposing ``chat.completions.create``. Defaults to the
        notebook-level ``client``.

    Returns
    -------
    str
        One of the six style names when the reply contains one (matched
        case-insensitively, ignoring punctuation); otherwise the stripped raw
        reply, or ``""`` when the reply has no content.
    """
    valid_styles = ("Visual", "Verbal", "Logical", "Active", "Passive", "Multimodal")

    if examples is None:
        examples = few_shot_examples
    if llm_client is None:
        llm_client = client

    # Keep the query in the same "a, b, c" form as the few-shot examples.
    if isinstance(behaviors, str):
        behavior_text = behaviors
    else:
        behavior_text = ", ".join(str(item) for item in behaviors)

    prompt = (
        examples +
        f"Given the examples above and the new set of {behavior_text},  predict what is the student's primary learning style? Choose one from Visual, Verbal, Logical, Active, Passive, Multimodal.\nAnswer with a single word (using the vocabulary of learning styles provided)"
    )
    response = llm_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=10
    )

    # content can be None (e.g. a refusal); treat that as an empty answer.
    raw_answer = (response.choices[0].message.content or "").strip()

    # Map replies such as "visual." or "Style: Visual" onto the canonical label
    # so they do not show up as extra classes in the evaluation report.
    canonical = {name.lower(): name for name in valid_styles}
    letters_only = "".join(ch if ch.isalpha() else " " for ch in raw_answer)
    for word in letters_only.split():
        label = canonical.get(word.lower())
        if label is not None:
            return label
    return raw_answer

# One API request per held-out student; the answers are stored next to the true labels.
test_df['predicted_style'] = test_df['behaviors'].map(classify_with_few_shot)


In [25]:
# Score the LLM predictions against the true labels of the held-out rows.
# NOTE(review): this rebinds y_test / y_pred, which the embedding-model cells
# also assign; re-running those cells afterwards will overwrite these values.
y_test, y_pred = test_df["learning_style"], test_df["predicted_style"]
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Active       0.77      0.67      0.71        15
     Logical       0.59      0.94      0.72        18
  Multimodal       0.52      0.88      0.65        17
     Passive       0.80      0.40      0.53        10
      Verbal       0.73      0.36      0.48        22
      Visual       0.77      0.56      0.65        18

    accuracy                           0.64       100
   macro avg       0.69      0.64      0.63       100
weighted avg       0.69      0.64      0.62       100

