## Preprocessing for parsed courses

We assume that the parsed courses are stored in a .json file containing a list of elements with the following structure:

      {
        "url": "https://stepik.org/course/174852?search=7291233903",
        "title": "Основы Python: курс для начинающих",
        "author": "Алексей Ковальчук",
        "students": 68000,
        "rating": 4.9,
        "difficulty": "Beginner",
        "skills": [
            "Artificial Intelligence and Machine Learning (AI/ML)",
            "Pivot Tables And Charts",
            "Python Programming",
            "Time Series Analysis and Forecasting",
            "Python"
        ],
        "description": "Онлайн-курс по изучению языка программирования Python (первая часть). \nКурс разбит на 2 части и является методическим пособием для изучения \nалгоритмического программирования на Python. Курс подойдет школьникам, в\n том числе для подготовки к ОГЭ и ЕГЭ, студентам и взрослым. Курс ведёт \nвыпускник мехмата МГУ.",
        "price": 0,
        "source": "Stepik"
      }


In [None]:
import json
import pandas as pd
import accessify
import re
import requests

class CourseDataProcessor:
    def __init__(self, json_path, csv_path, mapping_path=None):
        self.json_path = json_path
        self.csv_path = csv_path
        self.mapping_path = mapping_path
        self.mapping = {}
        self.courses = pd.DataFrame()


    def load_json(self):
        with open(self.json_path, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
        self.courses = pd.DataFrame(raw_data)

    def clean_skill(self, skill: str) -> str:
        skill = skill.lower()
        skill = re.sub(r'#+', '', skill)
        skill = skill.replace("-", " ")
        skill = re.sub(r'\s+', ' ', skill)
        skill = re.sub(r'[^\w\s]', '', skill)
        return skill.strip()

    def clean_skills(self, cell: str) -> str:
        skills = [self.clean_skill(s) for s in cell.split(",")]
        return ", ".join(skills)

    def apply_mapping(self):
        if not self.mapping_path:
            return
        with open(self.mapping_path, 'r', encoding='utf-8') as f:
            self.mapping = json.load(f)

        for column, replace_map in self.mapping.items():
            if column in self.courses.columns:
                self.courses[column] = self.courses[column].replace(replace_map)
    def check_invalid_links(self):
        invalid_urls = []

        for course_url in self.courses['url']:
            try:
                print(f"Checking: {course_url}")
                response = requests.head(course_url, allow_redirects=True, timeout=25)
                if response.status_code >= 400:
                    invalid_urls.append(course_url)
            except Exception as e:
                invalid_urls.append(course_url)

        self.courses = self.courses[~self.courses['url'].isin(invalid_urls)]

    def run(self):
        """Execute the preprocessing pipeline from load to export.

        Steps, in order: load the courses from JSON, apply the optional
        value mapping, normalize the ``skills`` column with
        ``clean_skills``, and finally call ``export_to_csv`` (defined
        outside the code shown here).
        """
        self.load_json()
        self.apply_mapping()

        # Normalize every course's skills and write the column back.
        normalized_skills = self.courses['skills'].apply(self.clean_skills)
        self.courses['skills'] = normalized_skills

        self.export_to_csv()