In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm





In [6]:
# Load the course catalog from a CSV given by relative path; the later cells
# pass this frame around as `df` / `catalog_df`.
df = pd.read_csv(filepath_or_buffer="dummy_courses_50.csv")

In [3]:
def recommend_courses(learning_goal, preferred_media, df, top_n=5):
    """
    Rank catalog courses against a free-text learning goal using TF-IDF.

    Each course is represented by one document: its name and description joined
    with ". ". The documents and the learning goal are vectorized with the same
    TF-IDF vocabulary (English stop words removed) and compared by cosine
    similarity. Matching is on exact vocabulary terms, so a goal that shares no
    term with the catalog scores 0.0 against every course.

    Parameters:
      learning_goal (str): Free-text description of what the user wants to learn.
      preferred_media (str): Media type compared for equality with the 'media' column.
      df (DataFrame): Course catalog with columns ['name', 'media', 'description'].
                      The caller's frame is not modified.
      top_n (int): Number of rows to return.

    Returns:
      DataFrame: The first top_n rows of a copy of df with two added columns,
                 'similarity_score' and 'media_match' (1 if media equals
                 preferred_media, else 0), sorted by similarity_score descending;
                 media_match only breaks ties between equal similarity scores.
    """
    # Work on a copy so the caller's catalog never gains the helper columns.
    ranked = df.copy()

    if ranked.empty:
        # Nothing to vectorize: an empty corpus cannot produce a vocabulary,
        # so return an empty result instead of failing inside the vectorizer.
        similarity_scores = pd.Series(dtype=float, index=ranked.index)
    else:
        # 1. Combine the course name and description for richer context.
        #    Missing values become empty strings; otherwise the concatenation
        #    yields NaN, which is not a valid document for the vectorizer.
        names = ranked['name'].fillna('').astype(str)
        descriptions = ranked['description'].fillna('').astype(str)
        corpus = names + ". " + descriptions

        # 2. Vectorize the combined text using TF-IDF.
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(corpus)

        # 3. Transform the learning goal into the same TF-IDF vector space.
        user_query_vector = vectorizer.transform([learning_goal])

        # 4. Compute cosine similarity between the user query and each course.
        similarity_scores = cosine_similarity(user_query_vector, tfidf_matrix).flatten()

    # 5. Attach the similarity score to each course.
    ranked['similarity_score'] = similarity_scores

    # 6. Binary indicator: 1 if the course's media matches preferred_media, else 0.
    ranked['media_match'] = (ranked['media'] == preferred_media).astype(int)

    # 7. Sort by similarity_score first; media_match decides only among equal scores.
    ranked = ranked.sort_values(by=['similarity_score', 'media_match'], ascending=False)

    return ranked.head(top_n)

In [4]:
# Demo of the TF-IDF recommender: ask for photography material, preferring books.
learning_goal = "i want to know how to photograph"
preferred_media = "book"
top_n = 7

recommended_courses = recommend_courses(
    learning_goal=learning_goal,
    preferred_media=preferred_media,
    df=df,
    top_n=top_n,
)

# Header line followed by the ranked frame, each on its own line.
print("Recommended Courses:", recommended_courses, sep="\n")

Recommended Courses:
                             name media  \
1     Advanced Python Programming  book   
3         Modern Database Systems  book   
5           SQL for Data Analysis  book   
7       Machine Learning Concepts  book   
9        Advanced Web Development  book   
11  Deep Learning with TensorFlow  book   
13    Statistical Analysis with R  book   

                                          description  similarity_score  \
1   Dive deep into advanced Python techniques and ...               0.0   
3   An in-depth look into modern database architec...               0.0   
5   Master SQL queries and data analysis technique...               0.0   
7   A comprehensive guide to machine learning algo...               0.0   
9   Master modern web frameworks and advanced tech...               0.0   
11  Build and deploy deep learning models using Te...               0.0   
13  Learn statistical analysis and data visualizat...               0.0   

    media_match  
1             1

In [5]:
def recommend_courses_semantic(learning_goal, preferred_media, df, top_n=5, model=None):
    """
    Rank catalog courses against a free-text learning goal using sentence embeddings.

    Each course is represented by its name and description joined with ". ".
    The course texts and the learning goal are encoded with a sentence-embedding
    model and compared by cosine similarity.

    Parameters:
      learning_goal (str): Free-text description of what the user wants to learn.
      preferred_media (str): Media type compared for equality with the 'media' column.
      df (DataFrame): Course catalog with columns ['name', 'media', 'description'].
                      The caller's frame is not modified.
      top_n (int): Number of rows to return.
      model: Optional encoder exposing encode(texts, convert_to_tensor=True),
             e.g. an already-loaded SentenceTransformer. When None, the
             'all-MiniLM-L6-v2' model is loaded for this call. Pass a shared
             instance to avoid reloading the model on every call.

    Returns:
      DataFrame: The first top_n rows of a copy of df with two added columns,
                 'similarity_score' and 'media_match' (1 if media equals
                 preferred_media, else 0), sorted by similarity_score descending;
                 media_match only breaks ties between equal similarity scores.
    """
    # Work on a copy so the caller's catalog never gains the helper columns.
    ranked = df.copy()

    if ranked.empty:
        # Nothing to encode; skip loading the model and return an empty result.
        similarity_scores = pd.Series(dtype=float, index=ranked.index)
    else:
        # Combine course name and description for a richer context. Missing
        # values become empty strings so the encoder only ever sees text.
        names = ranked['name'].fillna('').astype(str)
        descriptions = ranked['description'].fillna('').astype(str)
        corpus = (names + ". " + descriptions).tolist()

        # Load the default model only when the caller did not supply one.
        if model is None:
            model = SentenceTransformer('all-MiniLM-L6-v2')

        # Encode the corpus and the learning goal into embeddings.
        corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
        query_embedding = model.encode(learning_goal, convert_to_tensor=True)

        # cos_sim returns one row per query; there is a single query, so take row 0.
        cosine_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

        # Move to CPU and convert so the scores can be stored as a column.
        similarity_scores = cosine_scores.cpu().numpy()

    ranked['similarity_score'] = similarity_scores

    # Binary indicator for media match.
    ranked['media_match'] = (ranked['media'] == preferred_media).astype(int)

    # Sort by similarity score first; media match decides only among equal scores.
    ranked = ranked.sort_values(by=['similarity_score', 'media_match'], ascending=False)

    return ranked.head(top_n)

In [7]:
# Demo of the embedding-based recommender: a short free-text goal with video preferred.
learning_goal = "I want make "
preferred_media = "video"
top_n = 10

# Rank the catalog against the goal and keep the best `top_n` rows.
recommended_courses = recommend_courses_semantic(
    learning_goal=learning_goal,
    preferred_media=preferred_media,
    df=df,
    top_n=top_n,
)

# Header line followed by the ranked frame, each on its own line.
print("Recommended Courses:", recommended_courses, sep="\n")

Recommended Courses:
                                    name  media  \
41               Advanced Graphic Design   book   
39       Advanced Photography Techniques   book   
11         Deep Learning with TensorFlow   book   
40           Graphic Design Fundamentals  video   
25                  Advanced Game Design   book   
43  Advanced Virtual Reality Development   book   
37             Advanced Robotics Systems   book   
22                Mobile App Development  video   
45            Advanced Augmented Reality   book   
24               Game Development Basics  video   

                                          description  similarity_score  \
41  Master advanced design techniques and creative...          0.217704   
39  Learn advanced photography techniques and crea...          0.195822   
11  Build and deploy deep learning models using Te...          0.153688   
40  Understand the principles of graphic design an...          0.150691   
25  Master advanced game design principles

In [None]:
def recommend_improvement_single_student(performance_df, catalog_df, k_lowest, top_n, preferred_media_list,
                                         min_similarity=0.4, model=None):
    """
    Recommend courses for a single student to improve in their weak topics.

    For each of the K lowest-scored courses (weak topics) in the student's performance data,
    the course name and (if the course is found in the catalog) its description form a semantic
    query. Catalog courses are scored against that query by cosine similarity of sentence
    embeddings computed from each course's combined name and description. Courses the student
    has already taken and courses at or below the similarity threshold are dropped.

    Parameters:
      performance_df (DataFrame): Student performance data with columns:
                                    ['course_name', 'score'].
                                    (Assumed to contain data for one student.)
      catalog_df (DataFrame): Course catalog data with columns:
                              ['name', 'media', 'description']. Not modified.
      k_lowest (int): Number of lowest-scored courses (weak topics) to consider.
      top_n (int): Number of recommendations per weak topic.
      preferred_media_list (list): List of preferred media types (e.g., ["video", "interactive"]).
                                   A single string is treated as a one-element list; None as empty.
      min_similarity (float): Candidates must score strictly above this value. Defaults to 0.4.
      model: Optional encoder exposing encode(texts, convert_to_tensor=True), e.g. an
             already-loaded SentenceTransformer. When None, 'all-MiniLM-L6-v2' is loaded
             for this call.

    Returns:
      list: A flat list of recommended course names (without duplicates). Recommendations for the
            weakest topics come first. Within each topic, courses are ordered by similarity score
            (highest first); a preferred-media match only breaks ties between equal scores.
    """
    # Sort the student's performance by score (lowest first) and select the k lowest courses.
    weak_topics = performance_df.sort_values(by='score', ascending=True).head(k_lowest)

    if weak_topics.empty:
        print("No performance data available for the student.")
        return []

    # With no catalog there is nothing to recommend; skip loading the model.
    if catalog_df.empty:
        return []

    # Normalize the media preference so membership is an equality test per media type.
    if preferred_media_list is None:
        preferred_media = []
    elif isinstance(preferred_media_list, str):
        preferred_media = [preferred_media_list]
    else:
        preferred_media = list(preferred_media_list)

    # Load the default embedding model only when the caller did not supply one.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Pre-compute embeddings for all catalog courses from the combined name and description.
    # Missing values become empty strings so the encoder only ever sees text.
    names = catalog_df['name'].fillna('').astype(str)
    descriptions = catalog_df['description'].fillna('').astype(str)
    catalog_texts = (names + ". " + descriptions).tolist()
    catalog_embeddings = model.encode(catalog_texts, convert_to_tensor=True)

    # These do not depend on the weak topic, so compute them once outside the loop.
    taken_courses = set(performance_df['course_name'].tolist())
    not_taken = ~catalog_df['name'].isin(taken_courses)
    media_match = catalog_df['media'].isin(preferred_media).astype(int)

    recommended_courses = []

    # Process each weak topic in order (lowest score first).
    for weak_topic in weak_topics['course_name']:
        # Default query is just the name; enrich it with the catalog description when
        # the course exists in the catalog (first match wins).
        query_text = weak_topic
        matching = descriptions[catalog_df['name'] == weak_topic]
        if not matching.empty:
            query_text = weak_topic + ". " + matching.iloc[0]

        # Score every catalog course against the query.
        query_embedding = model.encode(query_text, convert_to_tensor=True)
        cosine_scores = util.cos_sim(query_embedding, catalog_embeddings)[0]
        similarity_scores = cosine_scores.cpu().numpy()

        # assign() builds a fresh frame, so no column is ever written onto a filtered slice.
        candidates = catalog_df.assign(similarity_score=similarity_scores, media_match=media_match)

        # Drop courses already taken and courses that are not similar enough.
        candidates = candidates[not_taken & (candidates['similarity_score'] > min_similarity)]

        # Sort candidates: first by similarity_score, then by media_match.
        ranked = candidates.sort_values(by=['similarity_score', 'media_match'], ascending=False)

        # Collect the top recommended course names for this weak topic.
        recommended_courses.extend(ranked['name'].head(top_n).tolist())

    # Remove duplicates while preserving first-seen order (dict keys keep insertion order).
    return list(dict.fromkeys(recommended_courses))

In [7]:
# --- Example Usage ---

# Hand-written performance record for one student: one row per course taken.
# Columns: course_name, score
performance_data = dict(
    course_name=["Python for Beginners", "Database Fundamentals", "Advanced Virtual Reality Development"],
    score=[55, 85, 90],
)
performance_df = pd.DataFrame(performance_data)

# Inline sample catalog kept for reference only: the triple-quoted block below is a
# bare string expression, so it is never assigned or used.
# Columns: name, media, description
"""
catalog_data = {
    "name": [
        "Python for Beginners", "Advanced Python Programming", "Database Fundamentals",
        "Modern Database Systems", "Python Data Science", "Basic Java Course", "Algorithms 101",
        "Introductory Photography", "Digital Photography Basics"
    ],
    "media": ["video", "book", "video", "book", "video", "video", "book", "video", "book"],
    "description": [
        "Learn the basics of Python programming with interactive video lessons.",
        "Dive deep into advanced Python techniques and best practices.",
        "Explore core concepts of relational databases, including SQL and data modeling.",
        "An in-depth look into modern database architectures and NoSQL solutions.",
        "Utilize Python for data analysis, visualization, and introductory machine learning.",
        "Learn the fundamentals of Java programming.",
        "Introduction to algorithms and problem solving.",
        "Learn the basics of photography and camera handling.",
        "Master digital photography techniques with comprehensive guides."
    ]
}
"""
# The catalog actually used is the frame loaded from CSV in an earlier cell.
catalog_df = df

# Look at the student's 2 lowest-scored courses and request up to 3 suggestions
# for each, with "video" and "interactive" as the preferred media types.
preferred_media_list = ["video", "interactive"]
recommended_course_list = recommend_improvement_single_student(
    performance_df=performance_df,
    catalog_df=catalog_df,
    k_lowest=2,
    top_n=3,
    preferred_media_list=preferred_media_list,
)

# Header line followed by the list of course names, each on its own line.
print("Final Recommended Courses:", recommended_course_list, sep="\n")


Final Recommended Courses:
['Data Structures in Python', 'Advanced Python Programming', 'Python Data Science', 'SQL for Data Analysis', 'Modern Database Systems', 'Machine Learning Concepts']
