# Download Multi-Platform Online Courses Dataset from Kaggle

In [13]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("everydaycodings/multi-platform-online-courses-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\flees\.cache\kagglehub\datasets\everydaycodings\multi-platform-online-courses-dataset\versions\2


## Functions to clean kaggle datasets

In [14]:
import pandas as pd
import numpy as np

target_columns = [
    'url', 'title', 'author', 'students', 'rating',
    'difficulty', 'skills', 'description', 'price', 'source'
]

coursera_rename_map = {
    'course': 'title',
    'reviewcount': 'students',
    'level': 'difficulty'
}

coursera_kaggle_df = pd.read_csv("multi-platform-online-courses-dataset/Coursera.csv")
print(f"Columns before preprocessing: \n{coursera_kaggle_df.columns}\nSize: {len(coursera_kaggle_df)}\n")

# rename columns using map
coursera_kaggle_df = coursera_kaggle_df.rename(columns=coursera_rename_map)
# delete non-target columns
coursera_kaggle_df = coursera_kaggle_df[[col for col in coursera_kaggle_df.columns if col in target_columns]]

for col in target_columns:
    if col not in coursera_kaggle_df.columns:
        coursera_kaggle_df[col] = np.nan

coursera_kaggle_df = coursera_kaggle_df[target_columns]

coursera_kaggle_df = coursera_kaggle_df[~coursera_kaggle_df['skills'].isna()]
coursera_kaggle_df = coursera_kaggle_df[coursera_kaggle_df['skills'].astype(str).str.strip() != ""]
coursera_kaggle_df = coursera_kaggle_df[coursera_kaggle_df['skills'].astype(str) != "[]"]

print(f"Columns after preprocessing: \n{coursera_kaggle_df.columns}\nSize: {len(coursera_kaggle_df)}")

Columns before preprocessing: 
Index(['partner', 'course', 'skills', 'rating', 'reviewcount', 'level',
       'certificatetype', 'duration', 'crediteligibility'],
      dtype='object')
Size: 1139

Columns after preprocessing: 
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')
Size: 1088


## Clean edx data set

In [15]:
edx_kaggle_df = pd.read_csv("multi-platform-online-courses-dataset/edx.csv")
print(f"Columns before preprocessing: \n{edx_kaggle_df.columns}\nSize: {len(edx_kaggle_df)}\n")

edx_rename_map = {
    'link': 'url',
    'institution': 'author',
    'level': 'difficulty',
    'associatedskills': 'skills'
}

# rename columns using map
edx_kaggle_df = edx_kaggle_df.rename(columns=edx_rename_map)

edx_kaggle_df = edx_kaggle_df[edx_kaggle_df['language'] == "English"]

valid_subjects = ["Data Analysis & Statistics", "Computer Science"]
edx_kaggle_df = edx_kaggle_df[edx_kaggle_df["subject"].isin(valid_subjects)]

# delete non-target columns
edx_kaggle_df = edx_kaggle_df[[col for col in edx_kaggle_df.columns if col in target_columns]]

# add missing target columns
for col in target_columns:
    if col not in edx_kaggle_df.columns:
        edx_kaggle_df[col] = np.nan


edx_kaggle_df["source"] = "edx"

edx_kaggle_df = edx_kaggle_df[target_columns]

edx_kaggle_df = edx_kaggle_df[~edx_kaggle_df['skills'].isna()]
edx_kaggle_df = edx_kaggle_df[edx_kaggle_df['skills'].astype(str).str.strip() != ""]
edx_kaggle_df = edx_kaggle_df[edx_kaggle_df['skills'].astype(str) != "[]"]


print(f"Columns after preprocessing: \n{edx_kaggle_df.columns}\nSize: {len(edx_kaggle_df)}")


Columns before preprocessing: 
Index(['title', 'link', 'institution', 'subject', 'level', 'prerequisites',
       'language', 'videotranscript', 'associatedprograms',
       'associatedskills'],
      dtype='object')
Size: 816

Columns after preprocessing: 
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')
Size: 210


## Move files into ml directory & check content

 We will not use skillshare.csv and Udemy.csv dataset due to lack of skill columns.



In [16]:
import os

path = "multi-platform-online-courses-dataset"
datasets = []
df = pd.DataFrame()

for filename in os.listdir(path):
    if filename.endswith(".json"):
        dataset = pd.read_json(os.path.join(path, filename))
        datasets.append(dataset)
        df = pd.concat([df, dataset], ignore_index=True)
        print(filename)
        print(f"{dataset.columns}\n\n")

df = pd.concat([df, edx_kaggle_df], ignore_index=True)

df.to_csv("courses.csv", index=False)

coursera_clear.json
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')


stepik_clear.json
Index(['url', 'title', 'author', 'students', 'rating', 'difficulty', 'skills',
       'description', 'price', 'source'],
      dtype='object')




## Data normalization

#### Skills normalization
Google Cloud, Cloud Computing, Network Security, Management, Cloud Automation ----> ['Google Cloud', 'Cloud Computing', 'Network Security', 'Management', 'Cloud Automation']

In [17]:
def normalize_skills(skills):
    if isinstance(skills, str):
        return [skill for skill in skills.split(", ")]
    elif isinstance(skills, list):
        return skills

df["skills"] = df["skills"].apply(normalize_skills)
df.to_csv("courses1.csv", index=False)

## Generate pseudo-description for missing values

In [18]:
# import requests
#
# def generate_pseudo_description(course_line):
#     skills_str = ", ".join(course_line["skills"]) if isinstance(course_line["skills"], list) else str(course_line["skills"])
#     title = course_line.get("title", "Untitled course")
#
#     prompt = f"Write a short course description for a course titled '{title}'. The course covers the following skills: {skills_str}."
#
#     try:
#         response = requests.post(
#             "http://localhost:11434/api/generate",  # или другой порт, который ты настроила в Docker
#             json={
#                 "model": "llama3",  # замени на свою модель, если другая
#                 "prompt": prompt,
#                 "stream": False
#             }
#         )
#         response.raise_for_status()
#         return response.json()["response"].strip()
#
#     except Exception as e:
#         print(f"Error generating description: {e}")
#         return f"This course will teach you: {skills_str}. It is titled '{title}'."
#

# def generate_pseudo_description(course_line):
#     skills_str = str(course_line["skills"])
#     return f"This course will teach you: {skills_str}. It is titled '{course_line['title']}'."




# df['description'] = df['description'].fillna(df.apply(generate_pseudo_description, axis=1))

Put .json's in main dataframe

In [19]:
def course_to_text(course):
    skills_str = str(course["skills"])
    return f"{course['title']} . {course['description']} . Skills: {skills_str}"

In [20]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
#
# course_texts = [course_to_text(course) for course in df]
# embeddings = model.encode(course_texts, show_progress_bar=True)


TypeError: string indices must be integers