# Download Multi-Platform Online Courses Dataset from Kaggle

In [40]:
import kagglehub

# Kaggle handle ("<owner>/<dataset-slug>") of the dataset to download.
KAGGLE_DATASET = "everydaycodings/multi-platform-online-courses-dataset"

# Download latest version and keep the directory that kagglehub reports.
path = kagglehub.dataset_download(KAGGLE_DATASET)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\flees\.cache\kagglehub\datasets\everydaycodings\multi-platform-online-courses-dataset\versions\2


## Clean the Coursera Kaggle dataset

In [41]:
import pandas as pd
import numpy as np

# Unified column schema that every course source is mapped onto.
target_columns = [
    'url', 'title', 'author', 'students', 'rating',
    'difficulty', 'skills', 'description', 'price', 'source'
]

# Coursera column name -> unified column name.
coursera_rename_map = {
    'course': 'title',
    'reviewcount': 'students',
    'level': 'difficulty'
}

coursera_raw_df = pd.read_csv("multi-platform-online-courses-dataset/Coursera.csv")
print(f"Columns before preprocessing: \n{coursera_raw_df.columns}\nSize: {len(coursera_raw_df)}\n")

# Rename to the unified names, then reindex on the target schema: this drops
# the non-target columns, adds the missing target columns filled with NaN and
# puts the columns in target order.
coursera_kaggle_df = (
    coursera_raw_df
    .rename(columns=coursera_rename_map)
    .reindex(columns=target_columns)
)

# Keep only rows whose skills are present, non-blank and not the literal "[]".
coursera_skills_text = coursera_kaggle_df['skills'].astype(str)
coursera_has_skills = (
    coursera_kaggle_df['skills'].notna()
    & (coursera_skills_text.str.strip() != "")
    & (coursera_skills_text != "[]")
)
coursera_kaggle_df = coursera_kaggle_df[coursera_has_skills]

print(f"Columns after preprocessing: \n{coursera_kaggle_df.columns}\nSize: {len(coursera_kaggle_df)}")

Columns before preprocessing: 
Index(['partner', 'course', 'skills', 'rating', 'reviewcount', 'level',
       'certificatetype', 'duration', 'crediteligibility'],
      dtype='object')
Size: 1139

Columns after preprocessing: 
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')
Size: 1088


## Clean the edX dataset

In [42]:
edx_raw_df = pd.read_csv("multi-platform-online-courses-dataset/edx.csv")
print(f"Columns before preprocessing: \n{edx_raw_df.columns}\nSize: {len(edx_raw_df)}\n")

# edX column name -> unified column name.
edx_rename_map = {
    'link': 'url',
    'institution': 'author',
    'level': 'difficulty',
    'associatedskills': 'skills'
}

# Only English courses from these subjects are kept.
valid_subjects = ["Data Analysis & Statistics", "Computer Science"]

edx_renamed_df = edx_raw_df.rename(columns=edx_rename_map)
edx_in_scope = (
    (edx_renamed_df['language'] == "English")
    & edx_renamed_df["subject"].isin(valid_subjects)
)

# Reindexing on the target schema drops the non-target columns, adds the
# missing target columns as NaN and puts the columns in target order.
edx_kaggle_df = edx_renamed_df[edx_in_scope].reindex(columns=target_columns)
edx_kaggle_df["source"] = "edx"

# Keep only rows whose skills are present, non-blank and not the literal "[]".
edx_skills_text = edx_kaggle_df['skills'].astype(str)
edx_has_skills = (
    edx_kaggle_df['skills'].notna()
    & (edx_skills_text.str.strip() != "")
    & (edx_skills_text != "[]")
)
edx_kaggle_df = edx_kaggle_df[edx_has_skills]

print(f"Columns after preprocessing: \n{edx_kaggle_df.columns}\nSize: {len(edx_kaggle_df)}")


Columns before preprocessing: 
Index(['title', 'link', 'institution', 'subject', 'level', 'prerequisites',
       'language', 'videotranscript', 'associatedprograms',
       'associatedskills'],
      dtype='object')
Size: 816

Columns after preprocessing: 
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')
Size: 210


## Move files into ml directory & check content

We will not use the skillshare.csv and Udemy.csv datasets because they lack a skills column.



In [43]:
import os

# Directory (relative to the notebook) that holds the "*.json" course dumps.
path = "multi-platform-online-courses-dataset"
datasets = []

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# row order of `df` later determines the index used as the Qdrant point id.
for filename in sorted(os.listdir(path)):
    if filename.endswith(".json"):
        dataset = pd.read_json(os.path.join(path, filename))
        datasets.append(dataset)
        print(filename)
        print(f"{dataset.columns}\n\n")

# Concatenate once instead of growing `df` inside the loop, which re-copied
# every previously loaded row on each iteration. The edX rows go last.
df = pd.concat(datasets + [edx_kaggle_df], ignore_index=True)

df.to_csv("courses.csv", index=False)

coursera_clear.json
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')


stepik_clear.json
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')




## Data normalization

#### Skills normalization
Example: `Google Cloud, Cloud Computing, Network Security, Management, Cloud Automation` → `['Google Cloud', 'Cloud Computing', 'Network Security', 'Management', 'Cloud Automation']`

In [44]:
def normalize_skills(skills):
    """Turn a raw ``skills`` cell into a list of skill names.

    Args:
        skills: Either a comma-separated string such as
            ``"Google Cloud, Cloud Computing"`` or an already-built list.

    Returns:
        list | None: For a string, its comma-separated parts with surrounding
        whitespace removed and empty parts dropped. A list is returned
        unchanged. Any other value (for example ``None`` or a NaN float)
        yields ``None``.
    """
    if isinstance(skills, str):
        # Split on the bare comma and strip each part so that "a,b", "a, b" and
        # "a ,  b" normalize identically and no blank skill names are produced.
        return [part.strip() for part in skills.split(",") if part.strip()]
    if isinstance(skills, list):
        return skills
    # Explicit "missing" result for anything that is neither a string nor a list.
    return None

# Replace every raw `skills` cell with the value returned by normalize_skills.
df["skills"] = df["skills"].apply(normalize_skills)
# Snapshot of the table after skills normalization (index column omitted).
df.to_csv("courses1.csv", index=False)

## Generate pseudo-description for missing values

In [45]:
import requests

# Collects the model responses that generate_pseudo_description appends.
descriptions = []
# Snapshot of the table as it is before descriptions are generated.
df.to_csv("courses2.csv", index=False)

def generate_pseudo_description(course_line):
    """Generate a course description with a local LLM, with a template fallback.

    Posts a prompt built from the course title and skills to
    ``http://localhost:11434/api/generate`` (model ``llama3``, non-streaming)
    and returns the ``"response"`` field of the JSON reply.

    Args:
        course_line: Mapping-like row (for example a ``pandas.Series``) that
            provides ``"skills"`` (a list of names or a ready-made string)
            and, optionally, ``"title"``.

    Returns:
        str: The stripped model response, or a template sentence built from
        the skills and title when the request or the reply parsing fails.

    Side effects:
        Prints the title, the skills and the model response, and appends the
        unstripped response to the module-level ``descriptions`` list.
    """
    skills = course_line["skills"]
    if isinstance(skills, str):
        # Joining a plain string would interleave ", " between its characters.
        skills_str = skills
    elif isinstance(skills, (list, tuple)):
        # str() each item so a single non-string entry cannot break the join.
        skills_str = ", ".join(str(skill) for skill in skills)
    else:
        # Missing skills (None / NaN) would make the join raise TypeError
        # outside the try block below and abort the surrounding apply().
        skills_str = ""

    title = course_line.get("title", "Untitled course")
    # .get() only covers an absent key; a present but missing value
    # (None / NaN) would otherwise be rendered into the prompt as-is.
    if title is None or (isinstance(title, float) and title != title):
        title = "Untitled course"

    print(f"\nTitle: {title}\nSkills: {skills_str}\n ")
    prompt = (
        f"Write a concise and engaging course description for an online educational course titled '{title}'. "
        f"The course covers the following skills: {skills_str}. "
        f"The description should be clear, informative, and suitable for potential learners. "
        f"Keep the tone professional yet accessible. The description should be around 3 to 7 sentences long. "
        f"Only return the course description. Do not include any additional explanations or formatting."
    )
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "llama3",
                "prompt": prompt,
                "stream": False
            },
            # requests never times out on its own; bound the wait so one stuck
            # generation cannot hang the whole run (it falls back instead).
            timeout=300
        )
        response.raise_for_status()
        # Decode the reply body once and reuse the extracted text.
        generated = response.json()["response"]
        print(generated)
        descriptions.append(generated)
        return generated.strip()

    except Exception as e:
        # Deliberate best-effort: any failure is reported and replaced by a
        # deterministic template so the remaining rows are still processed.
        print(f"Error generating description: {e}")
        return f"This course will teach you: {skills_str}. It is titled '{title}'."


def build_pseudo_description(course_line):
    """Build a template description from a course's skills and title.

    Args:
        course_line: Mapping-like row providing ``"skills"`` and ``"title"``.

    Returns:
        str: ``"This course will teach you: <skills>. It is titled '<title>'."``
        where a list or tuple of skills is rendered as a comma-separated
        string and any other value is rendered with ``str()``.
    """
    skills = course_line["skills"]
    if isinstance(skills, (list, tuple)):
        # Join the names instead of using str(list), which would leak the
        # Python list repr (brackets and quotes) into the sentence.
        skills_str = ", ".join(str(skill) for skill in skills)
    else:
        skills_str = str(skills)
    return f"This course will teach you: {skills_str}. It is titled '{course_line['title']}'."

# Rows whose description is missing (NaN) or blank after stripping whitespace.
mask = df['description'].isna() | df['description'].astype(str).str.strip().eq("")

# Generate text only for the selected rows, and only when there are any: on an
# empty selection DataFrame.apply(axis=1) may return an empty DataFrame rather
# than a Series, which is not a valid value for a single column.
if mask.any():
    df.loc[mask, 'description'] = df.loc[mask].apply(generate_pseudo_description, axis=1)


Title: CS50's Introduction toComputer Science
Skills: Resource Management, JavaScript (Programming Language), Forensic Sciences, Cryptography, SQL (Programming Language), Finance, Algorithms, Computer Science, HyperText Markup Language (HTML), Data Structures, Python (Programming Language), C (Programming Language), Security Software, Cascading Style Sheets (CSS)
 
Discover the foundations of computer science with CS50's Introduction to Computer Science! This comprehensive online course introduces you to a wide range of essential skills, from programming languages like JavaScript and Python, to data structures and algorithms. Explore topics such as resource management, cryptography, and SQL, while also delving into the world of finance and security software. As you learn, you'll gain hands-on experience with HTML, CSS, and C, plus a deeper understanding of computer science concepts and their applications in modern society. By the end of this course, you'll be equipped to tackle comple

In [50]:
# Final version of the course table, with the generated descriptions filled in,
# written to courses_final.csv without the index column.
df.to_csv("courses_final.csv", index=False)


# Create multi-vector

1. Add vector representation columns for each course:
    ```
   {
        "title_vector":[ ],
        "desc_vector": [ ],
        "skills_vector": [ ]
    }
   ```


In [115]:
from sentence_transformers import SentenceTransformer

# dim = 768 for description
# desc_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# dim = 384 for skills
# NOTE: with desc_model commented out, this single model is the one used by
# vectorize() for the title, the description and the skills alike.
skills_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
def vectorize(row):
    """Encode a course's title, description and skills with ``skills_model``.

    Args:
        row: Mapping-like course record providing ``"title"``, ``"skills"``
            (an iterable of strings) and, optionally, ``"description"``
            (an empty string is encoded when the key is absent).

    Returns:
        dict: Keys ``"title_vector"``, ``"desc_vector"`` and
        ``"skills_vector"``, each holding what ``skills_model.encode``
        returned for the corresponding text.
    """
    title_vector = skills_model.encode(row["title"])
    desc_vector = skills_model.encode(row.get("description", ""))
    # The skill names are flattened into one comma-separated string first.
    skills_vector = skills_model.encode(", ".join(row["skills"]))

    return {
        "title_vector": title_vector,
        "desc_vector": desc_vector,
        "skills_vector": skills_vector,
    }

# Drop courses that lack a title, a description or skills, then renumber the
# remaining rows from 0.
df = df.dropna(subset=["title", "description", "skills"]).reset_index(drop=True)
# One dict of three vectors per course, stored in a single "vectors" column.
df["vectors"] = df.apply(vectorize, axis=1)
print(df.head())

                                                 url  \
0  https://www.coursera.org/learn/python-for-appl...   
1  https://www.coursera.org/learn/python-crash-co...   
2    https://www.coursera.org/specializations/python   
3  https://www.coursera.org/learn/programming-in-...   
4  https://www.coursera.org/professional-certific...   

                                               title  \
0          Python for Data Science, AI & Development   
1                             Crash Course on Python   
2                Python for Everybody Specialization   
3                              Programming in Python   
4  Microsoft Python Development  Professional Cer...   

                       author   students  rating difficulty  \
0        Joseph Santarcangelo  1222909.0     4.6   Beginner   
1  Google Career Certificates  1256131.0     4.8   Beginner   
2   Charles Russell Severance  1797838.0     4.8   Beginner   
3        Taught by Meta Staff   124567.0     4.6   Beginner   
4          

# Prepare data for Qdrant


In [124]:
def prepare_for_qdrant(row):
    """Convert one vectorized course row into a point dictionary for Qdrant.

    Args:
        row: ``pandas.Series``-like record exposing ``.name`` (used as the
            point id) and the keys ``"vectors"``, ``"title"``, ``"skills"``,
            ``"difficulty"`` and ``"rating"``. ``row["vectors"]`` must hold
            ``"title_vector"``, ``"desc_vector"`` and ``"skills_vector"``,
            each providing ``.tolist()``.

    Returns:
        dict: ``{"id": ..., "vector": {...}, "payload": {...}}`` where
        ``"vector"`` maps ``"title"``, ``"description"`` and ``"skills"`` to
        plain lists and ``"payload"`` carries title, skills, difficulty and
        rating. A missing (None / NaN) rating or difficulty is stored as
        ``None``.
    """
    # int and str labels pass through; anything else (e.g. a NumPy integer
    # index label) is cast to a built-in int.
    row_id = row.name if isinstance(row.name, (int, str)) else int(row.name)

    vectors = row["vectors"]

    # NaN has no representation in strict JSON, so missing values are stored
    # as None instead of being passed through as float('nan').
    rating = row["rating"]
    if rating is not None:
        rating = float(rating)
        if rating != rating:  # NaN is the only float that differs from itself
            rating = None

    difficulty = row["difficulty"]
    if isinstance(difficulty, float) and difficulty != difficulty:
        difficulty = None

    return {
        "id": row_id,
        "vector": {
            "title": vectors["title_vector"].tolist(),
            "description": vectors["desc_vector"].tolist(),
            "skills": vectors["skills_vector"].tolist()
        },
        # Only a subset of the course columns is stored next to the vectors.
        "payload": {
            "title": row["title"],
            "skills": row["skills"],
            "difficulty": difficulty,
            "rating": rating
        }
    }

# Build one point dictionary per course row.
qdrant_data = [prepare_for_qdrant(row) for _, row in df.iterrows()]
# Show the first point as a quick sanity check.
print(qdrant_data[0])

{'id': 0, 'vector': {'title': [-0.229887917637825, -0.07593397796154022, -0.2879154086112976, -0.22225551307201385, -0.030657444149255753, 0.020765271037817, 0.007236632518470287, -0.028711464256048203, -0.012824943289160728, 0.575271725654602, 0.07500015944242477, -0.2684146463871002, 0.26839131116867065, -0.04526103287935257, -0.3245367407798767, -0.08210940659046173, -0.09135721623897552, -0.172034353017807, -0.20277957618236542, -0.2961622178554535, -0.007116260938346386, -0.144480898976326, -0.15821585059165955, -0.2983548641204834, -0.10762292146682739, -0.03132396563887596, -0.1319778710603714, -0.09129314124584198, 0.17157213389873505, -0.12677545845508575, 0.5788305997848511, 0.050153762102127075, 0.638266384601593, 0.2778944969177246, -0.13725891709327698, 0.27024734020233154, -0.0921994224190712, 0.014150457456707954, -0.0823940634727478, 0.14760926365852356, 0.17362722754478455, 0.09482162445783615, 0.17863638699054718, -0.10410156100988388, -0.08671964704990387, 0.15503028

# Create Qdrant using Docker
```
docker run -d -p 6333:6333 -v C:\Users\flees\Desktop\KIZAK_dls\ml\qdrant:/qdrant/storage --name qd qdrant/qdrant
```

Then connect to the database and store the course vectors and payloads.

In [134]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct, NamedVector

# Client for the Qdrant instance listening on localhost:6333.
client = QdrantClient(host="localhost", port=6333)

collection_name = "courses"

# Start from an empty collection: remove a previous one with the same name.
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

# One named 384-dimensional cosine-distance vector per encoded course field.
named_vector_fields = ("title", "description", "skills")
client.create_collection(
    collection_name=collection_name,
    vectors_config={
        field: VectorParams(size=384, distance=Distance.COSINE)
        for field in named_vector_fields
    }
)

True

In [135]:
# Summarize a single prepared point: its id, the length of each named vector
# and its payload keys. All three are read from the same item so the summary
# cannot mix fields that belong to different points.
sample_point = qdrant_data[2]
print("Lin:")
print({
    "id": sample_point["id"],
    "vector_shapes": {k: len(v) for k, v in sample_point["vector"].items()},
    "payload_keys": list(sample_point["payload"].keys())
})

Lin:
{'id': 2, 'vector_shapes': {'title': 384, 'description': 384, 'skills': 384}, 'payload_keys': ['title', 'skills', 'difficulty', 'rating']}


In [136]:
# NOTE(review): none of these three names is read by upload_to_qdrant or by any
# other code visible in this notebook — confirm whether they are still needed.
success_count = 0
batch_size = 100
failed_ids = []

def upload_to_qdrant(clientQd, qdrant_data, collection_name="courses", batch_size=100):
    """Upsert prepared points into a Qdrant collection in fixed-size batches.

    Args:
        clientQd: Client object exposing
            ``upsert(collection_name=..., points=...)``.
        qdrant_data: Iterable of dicts with the keys ``"id"``, ``"vector"``
            and ``"payload"``.
        collection_name: Target collection; defaults to ``"courses"``.
        batch_size: Maximum number of points sent per ``upsert`` call;
            defaults to 100.

    Returns:
        None. Prints the total number of points once all batches are sent.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    points = [
        PointStruct(
            id=item["id"],
            vector=item["vector"],
            payload=item["payload"]
        )
        for item in qdrant_data
    ]

    # Send the points in slices so that one request never has to carry the
    # whole data set; nothing is sent when there are no points.
    for start in range(0, len(points), batch_size):
        clientQd.upsert(
            collection_name=collection_name,
            points=points[start:start + batch_size]
        )

    print(f"{len(points)} lines in '{collection_name}'")

# Upload every prepared point into the default "courses" collection.
upload_to_qdrant(client, qdrant_data)

733 lines in 'courses'


In [137]:
def vectorization_for_search(role, query, skills):
    """Encode a search request into three query vectors.

    Args:
        role: Text encoded into the first returned vector.
        query: Text encoded into the second returned vector.
        skills: Iterable of skill names (joined with ``", "`` before
            encoding) or an already comma-separated string.

    Returns:
        tuple: ``(role_vector, query_vector, skills_vector)`` as returned by
        the model's ``encode`` method.
    """
    # Constructing a SentenceTransformer is expensive, so the model is built on
    # the first call only and kept on the function object for later calls.
    s_model = getattr(vectorization_for_search, "_model", None)
    if s_model is None:
        # dim = 384
        s_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
        vectorization_for_search._model = s_model

    # A plain string is used as-is; joining it would interleave ", " between
    # its characters.
    skills_text = skills if isinstance(skills, str) else ", ".join(skills)
    return s_model.encode(role), s_model.encode(query), s_model.encode(skills_text)


In [173]:
from qdrant_client import QdrantClient
from qdrant_client.models import NamedVector, SearchRequest
from collections import defaultdict

def search_courses_batch_weighted(
    title_vector,
    description_vector,
    skills_vector,
    weights=None,
    limit=5,
    candidate_limit=40,
    collection_name="courses"
):
    """Search each named vector and rank courses by a weighted sum of scores.

    One search request per named vector (``"title"``, ``"description"``,
    ``"skills"``) is sent in a single batch through the module-level
    ``client``. Every returned point accumulates ``score * weight`` for each
    result list it appears in; a point absent from a list contributes 0 for
    that vector.

    Args:
        title_vector: Query vector for the ``"title"`` named vector.
        description_vector: Query vector for the ``"description"`` named vector.
        skills_vector: Query vector for the ``"skills"`` named vector.
        weights: Mapping with a weight for each of ``"title"``,
            ``"description"`` and ``"skills"``. Defaults to
            ``{'title': 0.3, 'description': 0.2, 'skills': 0.5}``.
        limit: Number of ranked results to return.
        candidate_limit: Number of hits requested per named vector.
        collection_name: Collection to search.

    Returns:
        list[dict]: At most ``limit`` items sorted by descending weighted
        score, each with ``"point"`` (the hit object from the first result
        list in which the id appeared, so its own ``score`` belongs to that
        list), ``"weighted_score"`` and ``"details"`` (``"id"``, ``"title"``
        from the payload, and the per-vector ``"original_scores"``).

    Raises:
        KeyError: If ``weights`` lacks one of the three vector names.
    """
    # Build the default per call instead of sharing one mutable default dict.
    if weights is None:
        weights = {'title': 0.3, 'description': 0.2, 'skills': 0.5}

    query_vectors = {
        'title': title_vector,
        'description': description_vector,
        'skills': skills_vector
    }
    vector_names = list(query_vectors)

    # Fail before the round-trip when a weight is missing.
    missing = [name for name in vector_names if name not in weights]
    if missing:
        raise KeyError(f"weights is missing entries for: {missing}")

    # One request per named vector, in the same order as vector_names.
    search_requests = [
        SearchRequest(
            vector=NamedVector(name=name, vector=vector),
            limit=candidate_limit,
            with_payload=True
        )
        for name, vector in query_vectors.items()
    ]

    # NOTE(review): the captured notebook output ends with a warning that
    # points at this call; search_batch is presumably deprecated in newer
    # qdrant-client releases — confirm and migrate when upgrading.
    batch_results = client.search_batch(
        collection_name=collection_name,
        requests=search_requests
    )

    # point id -> accumulated weighted score plus the per-vector raw scores.
    scored = {}
    for vector_name, results in zip(vector_names, batch_results):
        weight = weights[vector_name]

        for point in results:
            entry = scored.get(point.id)
            if entry is None:
                entry = scored[point.id] = {
                    'weighted_score': 0,
                    'point': point,
                    'original_scores': dict.fromkeys(vector_names, 0)
                }
            entry['weighted_score'] += point.score * weight
            entry['original_scores'][vector_name] = point.score

    # Highest weighted sum first.
    sorted_results = sorted(
        scored.values(),
        key=lambda x: x['weighted_score'],
        reverse=True
    )

    return [
        {
            "point": item['point'],
            "weighted_score": item['weighted_score'],
            "details": {
                "id": item['point'].id,
                "title": item['point'].payload.get('title'),
                "original_scores": item["original_scores"]
            }
        }
        for item in sorted_results[:limit]
    ]

# Example request: a target role, the requested skills and a free-text goal.
role = 'Data Engineer'
user_req_skills = ["sql", "python", "docker"]
user_query = "i want to be Data Scientist"

# Per the argument order, the role vector is searched against course titles,
# the query vector against descriptions and the skills vector against skills.
title_vec, desc_vec, skills_vec = vectorization_for_search(role, user_query, user_req_skills)

results = search_courses_batch_weighted(title_vec, desc_vec, skills_vec)

# Print each hit's id, title, weighted score, per-vector scores and raw point.
for item in results:
    print(f"Course ID: {item['details']['id']}")
    print(f"Title: {item['details']['title']}")
    print(f"Weighted score: {item['weighted_score']:.4f}")
    print(f"Original_scores: {str(item['details']['original_scores'])}")
    print(f"Original point: {item['point']}")


Course ID: 103
Title: Data Engineering Foundations Specialization
Weighted score: 0.6964
Original_scores: {'title': 0.7823555, 'description': 0.62638336, 'skills': 0.6728875}
Original point: id=103 version=0 score=0.7823555 payload={'title': 'Data Engineering Foundations Specialization', 'skills': ['Database Design', 'Databases', 'Extract, Transform, Load', 'SQL', 'Python Programming', 'Jupyter', 'MySQL', 'Data Warehousing', 'Big Data', 'IBM DB2', 'Data Store', 'Data Manipulation'], 'difficulty': 'Beginner', 'rating': 4.7} vector=None shard_key=None order_value=None
Course ID: 97
Title: Introduction to Data Science Specialization
Weighted score: 0.6716
Original_scores: {'title': 0.78826797, 'description': 0.6434228, 'skills': 0.61277866}
Original point: id=97 version=0 score=0.78826797 payload={'title': 'Introduction to Data Science Specialization', 'skills': ['Database Design', 'Data Science', 'Databases', 'SQL', 'Query Languages', 'Jupyter', 'Data Visualization Software', 'Big Data',

  batch_results = client.search_batch(
