In [33]:
import pandas as pd
import numpy as np
import re

In [50]:
df_dataset = pd.read_csv('ml_concepts_dataset_60k.csv')

In [52]:
print("Before Cleaning : ", df_dataset.shape)

Before Cleaning :  (60000, 15)


In [53]:
df_dataset = df_dataset[["concept_name",
    "description",
    "keywords",
    "category",
    "subcategory",
    "difficulty_level",
    "mathematical_foundation",
    "application_area",
    "first_introduced_year",
    "popularity_score",
    "is_probabilistic"]]

In [54]:
def clean_string(text):
    if pd.isna(text):
        return ""
    text = str(text).strip()

    if text.lower() in {"n/a", "na", "undefined", "unknown", "none"}:
        return ""
    
    return re.sub(r"\s+", " ", text)

In [55]:
def norm_concept_name(concept_name):
    norm = clean_string(concept_name)

    if norm.isupper() and len(norm.split()) <= 3:
        return norm
    return norm.title()

In [56]:
def parse_year(value):
    if pd.isna(value):
        return np.nan
    m = re.search(r"(\d{4})", str(value))

    return int(m.group(1)) if m else np.nan

In [57]:
def parse_popularity(value):
    s = str(value).lower()

    mapping = {"high": 90, "medium": 50, "low": 10}

    try:
        if re.fullmatch(r'\d+(\.\d+)?', s):
            return float(s)
    except:
        pass

    return mapping.get(s, np.nan)

In [58]:
def parse_bool(val):
    s = str(val).lower()
    if s in {"true","1","yes","y"}:
        return True
    if s in {"false","0","no","n"}:
        return False
    return np.nan

In [59]:
def parse_keywords(val):
    if pd.isna(val):
        return []
    s = str(val).strip()
    if ";" in s:
        return [p.strip() for p in s.split(";") if p.strip()]
    if s.startswith("[") and s.endswith("]"):
        inner = s[1:-1]
        return [p.strip().strip("'\"") for p in inner.split(",") if p.strip()]
    return [p.strip() for p in re.split(r'[,;]', s) if p.strip()]

In [60]:
df_dataset['concept_name'] = df_dataset['concept_name'].apply(norm_concept_name)
df_dataset['description'] = df_dataset['description'].apply(clean_string)
df_dataset['category'] = df_dataset['category'].apply(lambda x: clean_string(x).title())
df_dataset['subcategory'] = df_dataset['subcategory'].apply(lambda x: clean_string(x).title())
df_dataset['difficulty_level'] = df_dataset['difficulty_level'].apply(lambda x: clean_string(x).title())
df_dataset['mathematical_foundation'] = df_dataset['mathematical_foundation'].apply(lambda x: clean_string(x).title())
df_dataset['application_area'] = df_dataset['application_area'].apply(lambda x: clean_string(x).title())
df_dataset['first_introduced_year'] = df_dataset['first_introduced_year'].apply(parse_year)
df_dataset['popularity_score'] = df_dataset['popularity_score'].apply(parse_popularity)
df_dataset['is_probabilistic'] = df_dataset['is_probabilistic'].apply(parse_bool)
df_dataset['keywords_parsed'] = df_dataset['keywords'].apply(parse_keywords)

In [61]:
df_dataset['first_introduced_year'].fillna(df_dataset['first_introduced_year'].mode()[0], inplace=True)
df_dataset['popularity_score'].fillna(df_dataset['popularity_score'].median(), inplace=True)
df_dataset['is_probabilistic'].fillna(df_dataset['is_probabilistic'].fillna(False), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dataset['first_introduced_year'].fillna(df_dataset['first_introduced_year'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dataset['popularity_score'].fillna(df_dataset['popularity_score'].median(), inplace=True)
  df_dataset['is_probabilistic'].fi

In [62]:
df_dataset = df_dataset.drop_duplicates(subset=['concept_name', 'description']).reset_index(drop=True)

In [63]:
print("After Cleaning : ", df_dataset.shape)

After Cleaning :  (51005, 12)


In [66]:
#df_dataset.to_csv('cleaned_dataset.csv', index=False)