In [8]:
import pandas as pd
import numpy as np
import re

In [3]:
df_dataset = pd.read_csv('ml_concepts_dataset_60k.csv')

In [None]:
df_dataset = df_dataset[["concept_name",
    "description",
    "keywords",
    "category",
    "subcategory",
    "difficulty_level",
    "mathematical_foundation",
    "application_area",
    "first_introduced_year",
    "popularity_score",
    "is_probabilistic"]]

In [10]:
def clean_string(text):
    if pd.isna(text):
        return ""
    text = str(text).strip()

    if text.lower() in {"n/a", "na", "undefined", "unknown", "none"}:
        return ""
    
    return re.sub(r"\s+", " ", text)

In [11]:
def norm_concept_name(concept_name):
    norm = clean_string(concept_name)

    if norm.isupper() and len(norm.split()) <= 3:
        return norm
    return norm.title()

In [14]:
def parse_year(value):
    if pd.isna(value):
        return np.nan
    m = re.search(r"(\d{4})", str(value))

    return int(m.group(1)) if m else np.nan

In [32]:
def parse_popularity(value):
    s = str(value).lower()

    mapping = {"high": 90, "medium": 50, "low": 10}

    try:
        if re.fullmatch(r'\d+(\.\d+)?', s):
            return float(s)
    except:
        pass

    return mapping.get(s, np.nan)

In [16]:
def parse_bool(val):
    s = str(val).lower()
    if s in {"true","1","yes","y"}:
        return True
    if s in {"false","0","no","n"}:
        return False
    return np.nan

In [17]:
def parse_keywords(val):
    if pd.isna(val):
        return []
    s = str(val).strip()
    if ";" in s:
        return [p.strip() for p in s.split(";") if p.strip()]
    if s.startswith("[") and s.endswith("]"):
        inner = s[1:-1]
        return [p.strip().strip("'\"") for p in inner.split(",") if p.strip()]
    return [p.strip() for p in re.split(r'[,;]', s) if p.strip()]

In [37]:
df_dataset['concept_name'] = df_dataset['concept_name'].apply(norm_concept_name)
df_dataset['description'] = df_dataset['description'].apply(clean_string)
df_dataset['category'] = df_dataset['category'].apply(lambda x: clean_string(x).title())
df_dataset['subcategory'] = df_dataset['subcategory'].apply(lambda x: clean_string(x).title())
df_dataset['difficulty_level'] = df_dataset['difficulty_level'].apply(lambda x: clean_string(x).title())
df_dataset['mathematical_foundation'] = df_dataset['mathematical_foundation'].apply(lambda x: clean_string(x).title())
df_dataset['application_area'] = df_dataset['application_area'].apply(lambda x: clean_string(x).title())
df_dataset['first_introduced_year'] = df_dataset['first_introduced_year'].apply(parse_year)
df_dataset['popularity_score'] = df_dataset['popularity_score'].apply(parse_popularity)
df_dataset['is_probabilistic'] = df_dataset['is_probabilistic'].apply(parse_bool)
df_dataset['keywords_parsed'] = df_dataset['keywords'].apply(parse_keywords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dataset['concept_name'] = df_dataset['concept_name'].apply(norm_concept_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dataset['description'] = df_dataset['description'].apply(clean_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dataset['category'] = df_dataset['category'].ap

In [None]:
df_dataset['first_introduced_year'].fillna(df_dataset['first_introduced_year'].mode()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dataset['first_introduced_year'].fillna(df_dataset['first_introduced_year'].mode()[0], inplace=True)


In [43]:
df_dataset

Unnamed: 0,concept_name,description,keywords,category,subcategory,difficulty_level,mathematical_foundation,application_area,first_introduced_year,popularity_score,is_probabilistic,keywords_parsed
0,Transfer Learning,Transfer Learning is a evaluation technique in...,deployment;feature-engineering;neural-networks...,Reinforcement Learning,Evaluation,Hard,Information Theory,Finance,2023.0,82.0,False,"[deployment, feature-engineering, neural-netwo..."
1,Z-Score Normalization,Z-Score Normalization is a generative modeling...,graph;optimization;regularization;supervised,Reinforcement Learning,Generative Modeling,Medium,Probability,Computer Vision,2001.0,75.0,False,"[graph, optimization, regularization, supervised]"
2,Online Inference,Online Inference is a time series technique in...,dimensionality;evaluation;feature-engineering;...,Deep Learning,Time Series,Easy,Probability,Healthcare,1989.0,87.0,False,"[dimensionality, evaluation, feature-engineeri..."
3,OKAPI BM25,Okapi BM25 is a graph ml technique in optimiza...,deployment;time-series,Optimization,Graph Ml,Hard,Optimization,Finance,2007.0,98.0,False,"[deployment, time-series]"
4,Xgboost,XGBoost is a regression technique in supervise...,feature-engineering;neural-networks;unsupervised,Supervised,Regression,Hard,Probability,Anomaly Detection,1971.0,68.0,False,"[feature-engineering, neural-networks, unsuper..."
...,...,...,...,...,...,...,...,...,...,...,...,...
59995,SARIMA,SARIMA is a regression technique in supervised...,optimization;probabilistic;supervised;time-series,Supervised,Regression,Hard,Graph Theory,Healthcare,1975.0,96.0,False,"[optimization, probabilistic, supervised, time..."
59996,Pairwise Ranking,Pairwise Ranking is a regression technique in ...,dimensionality;optimization;regularization,Supervised,Regression,Hard,Probability,Time Series,1978.0,90.0,False,"[dimensionality, optimization, regularization]"
59997,Fasttext,FastText is a dimensionality reduction techniq...,feature-engineering;neural-networks;probabilis...,Deep Learning,Dimensionality Reduction,Hard,Statistics,Anomaly Detection,1973.0,48.0,False,"[feature-engineering, neural-networks, probabi..."
59998,Association Rules,Association Rules is a regression technique in...,deployment;optimization;unsupervised,Optimization,Regression,Hard,Information Theory,Time Series,1977.0,75.0,False,"[deployment, optimization, unsupervised]"
