# Download Multi-Platform Online Courses Dataset from Kaggle

In [40]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("everydaycodings/multi-platform-online-courses-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\flees\.cache\kagglehub\datasets\everydaycodings\multi-platform-online-courses-dataset\versions\2


## Functions to clean kaggle datasets

In [41]:
import pandas as pd
import numpy as np

target_columns = [
    'url', 'title', 'author', 'students', 'rating',
    'difficulty', 'skills', 'description', 'price', 'source'
]

coursera_rename_map = {
    'course': 'title',
    'reviewcount': 'students',
    'level': 'difficulty'
}

coursera_kaggle_df = pd.read_csv("multi-platform-online-courses-dataset/Coursera.csv")
print(f"Columns before preprocessing: \n{coursera_kaggle_df.columns}\nSize: {len(coursera_kaggle_df)}\n")

# rename columns using map
coursera_kaggle_df = coursera_kaggle_df.rename(columns=coursera_rename_map)
# delete non-target columns
coursera_kaggle_df = coursera_kaggle_df[[col for col in coursera_kaggle_df.columns if col in target_columns]]

for col in target_columns:
    if col not in coursera_kaggle_df.columns:
        coursera_kaggle_df[col] = np.nan

coursera_kaggle_df = coursera_kaggle_df[target_columns]

coursera_kaggle_df = coursera_kaggle_df[~coursera_kaggle_df['skills'].isna()]
coursera_kaggle_df = coursera_kaggle_df[coursera_kaggle_df['skills'].astype(str).str.strip() != ""]
coursera_kaggle_df = coursera_kaggle_df[coursera_kaggle_df['skills'].astype(str) != "[]"]

print(f"Columns after preprocessing: \n{coursera_kaggle_df.columns}\nSize: {len(coursera_kaggle_df)}")

Columns before preprocessing: 
Index(['partner', 'course', 'skills', 'rating', 'reviewcount', 'level',
       'certificatetype', 'duration', 'crediteligibility'],
      dtype='object')
Size: 1139

Columns after preprocessing: 
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')
Size: 1088


## Clean edx data set

In [42]:
edx_kaggle_df = pd.read_csv("multi-platform-online-courses-dataset/edx.csv")
print(f"Columns before preprocessing: \n{edx_kaggle_df.columns}\nSize: {len(edx_kaggle_df)}\n")

edx_rename_map = {
    'link': 'url',
    'institution': 'author',
    'level': 'difficulty',
    'associatedskills': 'skills'
}

# rename columns using map
edx_kaggle_df = edx_kaggle_df.rename(columns=edx_rename_map)

edx_kaggle_df = edx_kaggle_df[edx_kaggle_df['language'] == "English"]

valid_subjects = ["Data Analysis & Statistics", "Computer Science"]
edx_kaggle_df = edx_kaggle_df[edx_kaggle_df["subject"].isin(valid_subjects)]

# delete non-target columns
edx_kaggle_df = edx_kaggle_df[[col for col in edx_kaggle_df.columns if col in target_columns]]

# add missing target columns
for col in target_columns:
    if col not in edx_kaggle_df.columns:
        edx_kaggle_df[col] = np.nan


edx_kaggle_df["source"] = "edx"

edx_kaggle_df = edx_kaggle_df[target_columns]

edx_kaggle_df = edx_kaggle_df[~edx_kaggle_df['skills'].isna()]
edx_kaggle_df = edx_kaggle_df[edx_kaggle_df['skills'].astype(str).str.strip() != ""]
edx_kaggle_df = edx_kaggle_df[edx_kaggle_df['skills'].astype(str) != "[]"]


print(f"Columns after preprocessing: \n{edx_kaggle_df.columns}\nSize: {len(edx_kaggle_df)}")


Columns before preprocessing: 
Index(['title', 'link', 'institution', 'subject', 'level', 'prerequisites',
       'language', 'videotranscript', 'associatedprograms',
       'associatedskills'],
      dtype='object')
Size: 816

Columns after preprocessing: 
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')
Size: 210


## Move files into ml directory & check content

 We will not use skillshare.csv and Udemy.csv dataset due to lack of skill columns.



In [43]:
import os

path = "multi-platform-online-courses-dataset"
datasets = []
df = pd.DataFrame()

for filename in os.listdir(path):
    if filename.endswith(".json"):
        dataset = pd.read_json(os.path.join(path, filename))
        datasets.append(dataset)
        df = pd.concat([df, dataset], ignore_index=True)
        print(filename)
        print(f"{dataset.columns}\n\n")

df = pd.concat([df, edx_kaggle_df], ignore_index=True)

df.to_csv("courses.csv", index=False)

coursera_clear.json
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')


stepik_clear.json
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')




## Data normalization

#### Skills normalization
Google Cloud, Cloud Computing, Network Security, Management, Cloud Automation ----> ['Google Cloud', 'Cloud Computing', 'Network Security', 'Management', 'Cloud Automation']

In [44]:
def normalize_skills(skills):
    if isinstance(skills, str):
        return [skill for skill in skills.split(", ")]
    elif isinstance(skills, list):
        return skills

df["skills"] = df["skills"].apply(normalize_skills)
df.to_csv("courses1.csv", index=False)

## Generate pseudo-description for missing values

In [45]:
import requests

descriptions = []
df.to_csv("courses2.csv", index=False)

def generate_pseudo_description(course_line):
    skills_str = ", ".join(course_line["skills"])
    title = course_line.get("title", "Untitled course")
    print(f"\nTitle: {title}\nSkills: {skills_str}\n ")
    prompt = (
        f"Write a concise and engaging course description for an online educational course titled '{title}'. "
        f"The course covers the following skills: {skills_str}. "
        f"The description should be clear, informative, and suitable for potential learners. "
        f"Keep the tone professional yet accessible. The description should be around 3 to 7 sentences long. "
        f"Only return the course description. Do not include any additional explanations or formatting."
    )
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "llama3",
                "prompt": prompt,
                "stream": False
            }
        )
        response.raise_for_status()
        print(response.json()["response"])
        descriptions.append(response.json()["response"])
        return response.json()["response"].strip()

    except Exception as e:
        print(f"Error generating description: {e}")
        return f"This course will teach you: {skills_str}. It is titled '{title}'."


def build_pseudo_description(course_line):
    skills_str = str(course_line["skills"])
    return f"This course will teach you: {skills_str}. It is titled '{course_line['title']}'."

# df['description'] = df['description'].fillna(df.apply(generate_pseudo_description, axis=1))
# df['description'] = df['description'].fillna(df.apply(generate_pseudo_description, axis=1))

mask = df['description'].isna() | df['description'].astype(str).str.strip().eq("")

df.loc[mask, 'description'] = df.loc[mask].apply(generate_pseudo_description, axis=1)


Title: CS50's Introduction toComputer Science
Skills: Resource Management, JavaScript (Programming Language), Forensic Sciences, Cryptography, SQL (Programming Language), Finance, Algorithms, Computer Science, HyperText Markup Language (HTML), Data Structures, Python (Programming Language), C (Programming Language), Security Software, Cascading Style Sheets (CSS)
 
Discover the foundations of computer science with CS50's Introduction to Computer Science! This comprehensive online course introduces you to a wide range of essential skills, from programming languages like JavaScript and Python, to data structures and algorithms. Explore topics such as resource management, cryptography, and SQL, while also delving into the world of finance and security software. As you learn, you'll gain hands-on experience with HTML, CSS, and C, plus a deeper understanding of computer science concepts and their applications in modern society. By the end of this course, you'll be equipped to tackle comple

In [50]:
# final version of courses with generated descriptions
df.to_csv("courses_final.csv", index=False)

# Create multi-vector

1. Add vector representation columns for each course:
    ```
   {
        "title_vector":[ ],
        "desc_vector": [ ],
        "skills_vector": [ ]
    }
   ```


In [57]:
from sentence_transformers import SentenceTransformer

# dim = 786 for description
desc_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# dim = 384 for skills
skills_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
def vectorize(row):
    return {
        "title_vector": desc_model.encode(row["title"]),
        "desc_vector": desc_model.encode(row.get("description", "")),
        "skills_vector": skills_model.encode(", ".join(row["skills"]))
    }

# delete courses with NaN titles
df = df.dropna(subset=["title", "description", "skills"]).reset_index(drop=True)
df["vectors"] = df.apply(vectorize, axis=1)
print(df.head())

                                                 url  \
0  https://www.coursera.org/learn/python-for-appl...   
1  https://www.coursera.org/learn/python-crash-co...   
2    https://www.coursera.org/specializations/python   
3  https://www.coursera.org/learn/programming-in-...   
4  https://www.coursera.org/professional-certific...   

                                               title  \
0          Python for Data Science, AI & Development   
1                             Crash Course on Python   
2                Python for Everybody Specialization   
3                              Programming in Python   
4  Microsoft Python Development  Professional Cer...   

                       author   students  rating difficulty  \
0        Joseph Santarcangelo  1222909.0     4.6   Beginner   
1  Google Career Certificates  1256131.0     4.8   Beginner   
2   Charles Russell Severance  1797838.0     4.8   Beginner   
3        Taught by Meta Staff   124567.0     4.6   Beginner   
4          

# Prepare data for Qdrant


In [71]:
def prepare_for_qdrant(row):
    row_id = int(row.name) if not isinstance(row.name, (int, str)) else row.name
    return {
        "id": row_id,
        "vector": {
            "title": row["vectors"]["title_vector"].tolist(),
            "description": row["vectors"]["desc_vector"].tolist(),
            "skills": row["vectors"]["skills_vector"].tolist()
        },
        # target = ['url', 'title', 'author', 'students', 'rating',
        # 'difficulty', 'skills', 'description', 'price', 'source']
        "payload": {
            "title": row["title"],
            "skills": row["skills"],
            "difficulty": row["difficulty"],
            "rating": float(row["rating"])
        }
    }

qdrant_data = [prepare_for_qdrant(row) for _, row in df.iterrows()]
print(qdrant_data[0])

{'id': 0, 'vector': {'title': [-0.011696447618305683, 0.05402686074376106, -0.024414725601673126, -0.01614173874258995, 0.005684220232069492, 0.016481371596455574, 0.07596568763256073, 0.010039807297289371, 0.04810941219329834, 0.006956601981073618, 0.04361554607748985, -0.013290857896208763, -0.023639772087335587, 0.12737509608268738, -0.03501163050532341, -0.06902378797531128, 0.034705355763435364, -0.054788827896118164, 0.054665081202983856, -0.010644293390214443, -0.052766911685466766, 0.01943526603281498, -0.05350259318947792, 0.028162257745862007, -0.024924857541918755, -0.0012504400219768286, -0.00892032403498888, -0.01584160327911377, -0.024116016924381256, -0.03429693356156349, 0.046730030328035355, -0.006108345463871956, 0.03463362529873848, 0.07978440076112747, 1.4990155250416137e-06, -0.03946884721517563, 0.04439002275466919, 0.04356050863862038, -0.011823749169707298, -0.007325344253331423, 0.0647520050406456, -0.013550995849072933, 0.020485900342464447, -0.004938593599945

# Create Qdrant using Docker
```
docker run -d -p 6333:6333 -v C:\Users\flees\Desktop\KIZAK_dls\ml\qdrant:/qdrant/storage --name qd qdrant/qdrant
```

Then connect to db and store values

In [68]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, NamedVector

client = QdrantClient(host="localhost", port=6333)

# create new collection
collection_name = "courses"

# delete and create empty collection
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config={
        "title": VectorParams(size=768, distance=Distance.COSINE),
        "description": VectorParams(size=768, distance=Distance.COSINE),
        "skills": VectorParams(size=384, distance=Distance.COSINE)
    }
)

True

In [73]:
print("Пример записи:")
print({
    "id": qdrant_data[2]["id"],
    "vector_shapes": {k: len(v) for k, v in qdrant_data[0]["vector"].items()},
    "payload_keys": list(qdrant_data[0]["payload"].keys())
})

Пример записи:
{'id': 2, 'vector_shapes': {'title': 768, 'description': 768, 'skills': 384}, 'payload_keys': ['title', 'skills', 'difficulty', 'rating']}


In [74]:
success_count = 0
batch_size = 100
failed_ids = []

def upload_to_qdrant(clientQd, qdrant_data, collection_name="courses"):
    points = [
        PointStruct(
            id=item["id"],
            vector=item["vector"],
            payload=item["payload"]
        )
        for item in qdrant_data
    ]
    clientQd.upsert(
        collection_name=collection_name,
        points=points
    )

    print(f"{len(points)} lines in '{collection_name}'")

upload_to_qdrant(client, qdrant_data)

733 lines in 'courses'


In [88]:
def vectorization_for_search(query, skills):
    # dim = 786 for description
    d_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    # dim = 384 for skills
    s_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    return d_model.encode(query), d_model.encode(query), s_model.encode(", ".join(skills))

def search_courses(title_query=None, description_query=None, skills_query=None, filters=None):

    results = []

    # if title_query is not None:
    #     title_results = client.search(
    #         collection_name="courses",
    #         query_vector=("title", title_query),
    #         # query_filter=filters,
    #         limit=10,
    #         with_payload=True
    #     )
    #     results.extend(title_results)

    if description_query is not None:
        desc_results = client.search(
            collection_name="courses",
            query_vector=("description", description_query),
            # query_filter=filters,
            limit=10,
            with_payload=True
        )
        results.extend(desc_results)

    if skills_query is not None:
        skills_results = client.search(
            collection_name="courses",
            query_vector=("skills", skills_query),
            # query_filter=filters,
            limit=10,
            with_payload=True
        )
        results.extend(skills_results)

    return results

# user_req_skills = ["sql", "python", "docker"]
# user_query = "i want to be "
# res = search_courses(vectorization_for_search(user_query, user_req_skills))



In [96]:
from qdrant_client.models import NamedVector
from collections import defaultdict

# search for courses using weighted combination of title, description, and skills vectors
def search_courses_batch_weighted(title_vector, description_vector, skills_vector,
                                 weights={'title': 0.3, 'description': 0.2, 'skills': 0.5}):

     # 3 separate search queries for each vector type
    search_requests = [
        {
            "vector": NamedVector(name="title", vector=title_vector),
            "limit": 5,
            "with_payload": True
        },
        {
            "vector": NamedVector(name="description", vector=description_vector),
            "limit": 5,
            "with_payload": True
        },
        {
            "vector": NamedVector(name="skills", vector=skills_vector),
            "limit": 5,
            "with_payload": True
        }
    ]
    # send search request to Qdrant
    batch_results = client.query_batch_points(
        collection_name="courses",
        requests=search_requests
    )

    # calculate and weight scores from each result
    weighted_scores = defaultdict(lambda: {'score': 0, 'point': None})
    vector_names = ['title', 'description', 'skills']

    # check results and apply weights to each score
    for i, result in enumerate(batch_results):
        vector_name = vector_names[i]
        weight = weights[vector_name]

        for point in result.points:
            point_id = point.id
            weighted_scores[point_id]['score'] += point.score * weight
            weighted_scores[point_id]['point'] = point

    # sort by total combined score (higher is better)
    sorted_results = sorted(
        weighted_scores.values(),
        key=lambda x: x['score'],
        reverse=True
    )

    return sorted_results[:5]

user_req_skills = ["sql", "python", "docker"]
user_query = "i want to be "
res = search_courses(*vectorization_for_search(user_query, user_req_skills))

for i, result in enumerate(res, 1):
    print(f"{i}. Course ID: {result.id}")
    print(f" Payload: {result.payload}")
    print(f"Combined Score: {result.score}\n")

1. Course ID: 4
 Payload: {'title': 'Microsoft Python Development  Professional Certificate', 'skills': ['Plotly', 'Debugging', 'Test Driven Development (TDD)', 'Restful API', 'Docker (Software)', 'Scripting', 'Git (Version Control System)', 'Data Ethics', 'Agile Methodology', 'Data Manipulation', 'Flask (Web Framework)', 'Web Development'], 'difficulty': 'Beginner', 'rating': 4.4}
Combined Score: 0.28662294

2. Course ID: 61
 Payload: {'title': 'IBM AI Developer Professional Certificate', 'skills': ['Unit Testing', 'HTML and CSS', 'ChatGPT', 'Development Environment', 'Prompt Engineering', 'Software Architecture', 'Professional Development', 'Engineering Software', 'Artificial Intelligence', 'Python Programming', 'Software Design', 'Jupyter'], 'difficulty': 'Beginner', 'rating': 4.6}
Combined Score: 0.2750702

3. Course ID: 187
 Payload: {'title': 'Design Computing: 3D Modeling in Rhinoceros with Python/Rhinoscript', 'skills': ['Automation', 'Animations', 'Data Structures', 'Computer 

  desc_results = client.search(
  skills_results = client.search(
