In [1]:
#!pip install --upgrade openai


In [2]:
import json
import random
import requests
from openai import OpenAI

In [3]:
class LLMClient:
    """Small text-generation client exposing one interface over two backends.

    Supported providers:
      * ``"openai"`` - chat completions through the ``OpenAI`` SDK client.
      * ``"ollama"`` - a single non-streaming POST to ``{base_url}/api/generate``.
    """

    def __init__(self, provider="openai", api_key=None, base_url=None, model="gpt-4o-mini", timeout=300):
        """Configure the client for one provider.

        Args:
            provider: ``"openai"`` or ``"ollama"``.
            api_key: API key handed to the OpenAI SDK client (unused for Ollama).
            base_url: Endpoint root. For OpenAI it is forwarded to the SDK
                client; for Ollama it falls back to ``http://localhost:11434``
                when empty.
            model: Model name sent with every request.
            timeout: Timeout for the Ollama HTTP request, passed straight to
                ``requests.post``. Pass ``None`` to wait indefinitely (the
                previous behaviour). Not applied to the OpenAI SDK client.

        Raises:
            ValueError: If ``provider`` is neither ``"openai"`` nor ``"ollama"``.
        """
        self.provider = provider
        self.api_key = api_key
        self.model = model
        self.base_url = base_url
        self.timeout = timeout

        if provider == "openai":
            # Forward base_url instead of dropping it, so OpenAI-compatible
            # endpoints can be targeted; None keeps the SDK's own default.
            self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

        elif provider == "ollama":
            if not self.base_url:
                self.base_url = "http://localhost:11434"

        else:
            raise ValueError("Unknown provider. Use 'openai' or 'ollama'.")

    def generate(self, prompt):
        """Unified text generation interface (no streaming for Ollama).

        Args:
            prompt: The user prompt, sent as a single message / prompt string.

        Returns:
            The generated text with surrounding whitespace stripped. For the
            OpenAI provider an empty string is returned when the message
            carries no text content.

        Raises:
            ValueError: If ``self.provider`` is not a supported value.
            Whatever ``response.raise_for_status()`` raises when the Ollama
            server answers with an HTTP error status.
        """
        # ---------- OpenAI ----------
        if self.provider == "openai":
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}]
            )
            # content may be None; guard so .strip() cannot fail on it.
            content = response.choices[0].message.content
            return (content or "").strip()

        # ---------- Ollama ----------
        if self.provider == "ollama":
            # Tolerate a trailing slash in base_url so the path never
            # becomes "//api/generate".
            url = f"{self.base_url.rstrip('/')}/api/generate"

            response = requests.post(
                url,
                json={
                    "model": self.model,
                    "prompt": prompt,
                    "stream": False   # ask for one JSON object, not a stream
                },
                timeout=self.timeout  # avoid blocking forever on a stuck server
            )
            # Surface HTTP failures directly instead of a KeyError below.
            response.raise_for_status()

            data = response.json()
            return data["response"].strip()

        # Reached only if `provider` was changed after construction; fail
        # loudly rather than returning None.
        raise ValueError("Unknown provider. Use 'openai' or 'ollama'.")



In [40]:
class DataGenerator:
    """Generates synthetic, aspect-labelled course reviews through an LLM.

    Each record pairs randomly drawn metadata (course, lecturer, student
    status, tone, and 2-5 aspect/sentiment labels) with a short review text
    produced by the supplied LLM client.
    """

    # Review aspects that can be labelled with a sentiment.
    ASPECTS = [
        "difficulty", "clarity", "workload", "lecturer_quality",
        "exam_fairness", "relevance", "interest", "support",
        "materials", "overall_experience"
    ]

    SENTIMENTS = ["positive", "negative", "neutral"]
    TONES = ["frustrated", "neutral", "enthusiastic", "formal"]
    STATUS = ["taking", "completed", "first_year", "advanced"]
    COURSES = [
        {
            "name": "Linear Algebra",
            "desc": "An introductory course covering matrices, vector spaces, eigenvalues and applications."
        },
        {
            "name": "Introduction to Programming",
            "desc": "Fundamentals of Python programming, algorithms, problem solving, and software design."
        },
        {
            "name": "Data Structures",
            "desc": "Covers trees, graphs, hash tables, heaps, and algorithmic efficiency."
        },
        {
            "name": "Operating Systems",
            "desc": "Concepts of processes, threads, scheduling, memory management, and file systems."
        },
        {
            "name": "Machine Learning",
            "desc": "Supervised and unsupervised learning, linear models, neural networks, clustering, evaluation metrics."
        },
        {
            "name": "Computer Networks",
            "desc": "OSI layers, TCP/IP, routing, switching, network protocols and security fundamentals."
        },
        {
            "name": "Databases",
            "desc": "Relational databases, SQL queries, normalization, transactions and indexing."
        }
    ]

    LECTURERS = [
        "Dr. Cohen", "Prof. Levi", "Dr. Mizrahi",
        "Dr. Kaplan", "Prof. Baruch", "Dr. Shapira",
        "Dr. Adler", "Prof. Klein"
    ]

    # Default randomness source: the shared `random` module, so code that
    # calls random.seed(...) globally keeps working. __init__ replaces it
    # with a private generator when a seed is given.
    _rng = random

    def __init__(self, llm_client: "LLMClient", seed=None):
        """Store the LLM client and optionally seed the generator.

        Args:
            llm_client: Object exposing ``generate(prompt) -> str``. The
                annotation is a string so this class can be defined even
                when ``LLMClient`` is not yet in scope.
            seed: Optional seed. When given, all random draws come from a
                private ``random.Random(seed)`` so runs are reproducible and
                the global ``random`` state is left untouched. When ``None``
                the module-level ``random`` functions are used.
        """
        self.llm = llm_client
        if seed is not None:
            self._rng = random.Random(seed)

    def get_random_course_and_lecturer(self):
        """Return ``(course_name, course_description, lecturer)`` drawn at random."""
        course = self._rng.choice(self.COURSES)
        lecturer = self._rng.choice(self.LECTURERS)
        return course["name"], course["desc"], lecturer

    def _build_prompt(self, course_name, course_desc, lecturer, aspect_labels, tone, status):
        """Prompt template - short, strict, stable.

        ``aspect_labels`` is embedded as indented JSON; every other argument
        is interpolated as-is.
        """
        return f"""
Write a VERY SHORT student review for a university course.

Course name: {course_name}
Description: {course_desc}
Lecturer: {lecturer}

Student status: {status}
Tone: {tone}

Aspects and sentiment:
{json.dumps(aspect_labels, indent=2)}

RULES:
- Write EXACTLY 2 sentences.
- Do NOT add headings, bullet points, sections, titles, or explanations.
- Output PLAIN TEXT ONLY.
- Review MUST reflect all aspect sentiments clearly.
"""

    def generate_single(self):
        """Generate one labelled review record.

        Returns:
            dict with keys ``course_name``, ``course_description``,
            ``lecturer``, ``student_status``, ``tone``, ``aspects`` (mapping
            of aspect -> sentiment) and ``review_text`` (stripped LLM output).
        """
        course_name, course_desc, lecturer = self.get_random_course_and_lecturer()

        # Choose how many aspects (2-5) first, then which ones, then give
        # each a sentiment. The order of draws is kept stable on purpose.
        aspect_count = self._rng.randint(2, 5)
        aspects = self._rng.sample(self.ASPECTS, aspect_count)
        aspect_labels = {a: self._rng.choice(self.SENTIMENTS) for a in aspects}

        tone = self._rng.choice(self.TONES)
        status = self._rng.choice(self.STATUS)

        prompt = self._build_prompt(course_name, course_desc, lecturer, aspect_labels, tone, status)
        review_text = self.llm.generate(prompt)

        return {
            "course_name": course_name,
            "course_description": course_desc,
            "lecturer": lecturer,
            "student_status": status,
            "tone": tone,
            "aspects": aspect_labels,
            "review_text": review_text.strip()
        }

    def generate_many(self, n):
        """Return a list of ``n`` records, each from ``generate_single``."""
        return [self.generate_single() for _ in range(n)]

    @staticmethod
    def save_jsonl(dataset, filename):
        """Write ``dataset`` to ``filename`` as UTF-8 JSON Lines (one object per line)."""
        with open(filename, "w", encoding="utf-8") as f:
            for item in dataset:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")


In [46]:
if __name__ == "__main__":

    # Backend selection: keep exactly one `llm = ...` assignment active.

    # Option A - hosted OpenAI model (disabled). If you enable it, prefer
    # loading the key from an environment variable over pasting it here.
    # llm = LLMClient(
    #     provider="openai",
    #     api_key="YOUR_OPENAI_KEY",
    #     model="gpt-4o-mini"
    # )

    # Option B - Ollama server at the address given below (active).
    llm = LLMClient(
        base_url="http://localhost:11434",
        model="llama3",
        provider="ollama",
    )
    generator = DataGenerator(llm_client=llm)

    # Smoke test: produce a single record and pretty-print it.
    example = generator.generate_single()
    print(json.dumps(example, ensure_ascii=False, indent=4))

    # Build a ten-record dataset and persist it as JSON Lines.
    dataset = generator.generate_many(10)
    DataGenerator.save_jsonl(dataset, filename="course_reviews.jsonl")
    print("\nSaved dataset to course_reviews.jsonl")

{
    "course_name": "Data Structures",
    "course_description": "Covers trees, graphs, hash tables, heaps, and algorithmic efficiency.",
    "lecturer": "Dr. Kaplan",
    "student_status": "completed",
    "tone": "frustrated",
    "aspects": {
        "lecturer_quality": "negative",
        "materials": "positive",
        "interest": "positive",
        "exam_fairness": "positive"
    },
    "review_text": "I completed the Data Structures course with Dr. Kaplan and while I found the materials engaging and interesting, I was frustrated with the lecturer's delivery. Despite this, I thought the exams were fair and the topics covered helped me understand algorithmic efficiency."
}

Saved dataset to course_reviews.jsonl


In [50]:
# Generate a fresh batch of five reviews. This rebinds `dataset`, so the
# ten-record batch assigned in the driver cell is no longer reachable by
# this name (its copy on disk in course_reviews.jsonl is unaffected).
dataset = generator.generate_many(5)
# Bare expression: the notebook renders the list as this cell's output.
dataset


[{'course_name': 'Operating Systems',
  'course_description': 'Concepts of processes, threads, scheduling, memory management, and file systems.',
  'lecturer': 'Dr. Mizrahi',
  'student_status': 'advanced',
  'tone': 'enthusiastic',
  'aspects': {'difficulty': 'neutral',
   'workload': 'negative',
   'exam_fairness': 'neutral',
   'lecturer_quality': 'negative'},
  'review_text': "I thoroughly enjoyed the Operating Systems course with Dr. Mizrahi, but found the workload overwhelming and struggled to keep up despite being an advanced student. Despite this, I appreciated the neutral difficulty level and lack of bias in exams, but was underwhelmed by the lecturer's quality."},
 {'course_name': 'Databases',
  'course_description': 'Relational databases, SQL queries, normalization, transactions and indexing.',
  'lecturer': 'Prof. Baruch',
  'student_status': 'advanced',
  'tone': 'frustrated',
  'aspects': {'workload': 'neutral', 'overall_experience': 'neutral'},
  'review_text': "I was ex