# Intro

## loading required data and libs

## importing

In [2]:
import warnings
import gc
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

from autocorrect import Speller
import json
import inspect
from pathlib import Path
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import torch
import tokenize
from tokenize import TokenError
import io
tqdm.pandas()
pd.set_option('display.max_colwidth', -1)
import torch
from sentence_transformers import util

import nltk
import re
import string
import emoji

import numpy as np
import pandas as pd
from scipy import sparse
import scipy
from sklearn.pipeline import Pipeline
import sys
import textwrap
import wandb
import spacy
import contextualSpellCheck
from sklearn.preprocessing import FunctionTransformer

from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from bisect import bisect
from time import time

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path('data/')
sys.path.append(str('AI4Code'))
%load_ext autoreload
%autoreload 2

# utils

## Metric

In [3]:
def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max


## reading

In [4]:
def get_ranks(base, derived):
    return [base.index(d) for d in derived]

def read_train_data(data_dir, NUM_TRAIN = 10000, OFFSET=0):
    def read_notebook(path):
        return (
            pd.read_json(
                path,
                dtype={'cell_type': 'category', 'source': 'str'})
            .assign(id=path.stem)  # final path component
            .rename_axis('cell_id')
        )

    paths_train = list((data_dir / 'train').glob('*.json'))[OFFSET:NUM_TRAIN]
    notebooks_train = [
      read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
    ]
    df = (
      pd.concat(notebooks_train)
      .set_index('id', append=True)
      .swaplevel()
      .sort_index(level='id', sort_remaining=False)
    )
    return df

def get_df_orders_and_ranks(df, data_dir):
    # train orders
    df_orders = pd.read_csv(
      data_dir / 'train_orders.csv',
      index_col='id',
      squeeze=True,
    ).str.split()  # cell_ids str -> list


    df_orders_ = df_orders.to_frame().join(
      # reset only one index out of many -> "cell_id"; make a list out of cells in train data
      df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
      how='right',
    )

    ranks = {}
    for id_, cell_order, cell_id in df_orders_.itertuples():
        ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

    df_ranks = (
      pd.DataFrame
      .from_dict(ranks, orient='index')
      .rename_axis('id')
      .apply(pd.Series.explode)
      .set_index('cell_id', append=True)
    )
    # now we have
    # id cell_id rank
    return df_orders, df_ranks


def get_ancestors(data_dir, ids):
    # Split, keeping notebooks with a common origin (ancestor_id) together
    df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
    return df_ancestors.loc[ids, 'ancestor_id']

## text utils

In [5]:
spell = Speller(fast=True)
def correct_spelling(text):
    if text:
        return spell(text)
    else:
        return text
        

def clean_text(text):
    '''Make text lowercase, remove square brackets, replace links with "link" ,
    and remove words containing numbers with "number".'''
    if not text:
        return ''
    text = text.lower()
    text = ' '.join(text.split())
    text = re.sub('<.*?>+', ' ', text)
    text = text.replace('[' , ' ')
    text = text.replace(']' , ' ')
    text = re.sub('http.?://\S+|www\.\S+', 'link', text)
    # text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', '. ', text)
    text = re.sub('\w*\d\w*', 'number', text)
    text = re.sub(' +', ' ', text)
    text = text.strip()
    return text

def text_preprocessing(text):
    """
    Cleaning and parsing the text.

    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    combined_text = ' '.join(tokenized_text)
    return combined_text


image_pattern = r'(!\[([^\]]*)\]\((.*?)\s*("(?:.*[^"])")?\s*\))'
def replace_image_with_label(text):
    matches = re.findall(image_pattern, text)
    for match in matches:
        if match[1]:
            result = text_preprocessing(' '.join(match[1].split('.')[:-1]))
            result = "image link " + ' '.join(result.split("_"))
        else:
            result = "image link "
        text = text.replace(match[0], result)
    return text_preprocessing(text)


def clean_code(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.replace('[', ' ').replace(']', ' ').replace('(', ' ')\
    .replace(')', ' ').replace('{', ' ').replace('}', ' ').replace('=', ' ').replace(',', ' ')
    text = text.lower()
    text = text.replace('_', ' ')
    text = text.replace('\n', ' ')
    text = text.replace('.', ' ')
    text = re.sub(r'".*"', ' ', text)
    text = re.sub(r"'.*'", ' ', text)
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", ' ', text)
    text = re.sub(' +', ' ', text)
    text = text.strip()
    return text

def code_preprocessing(text):
    """
    Cleaning and parsing the text.
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    nopunc = clean_code(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    combined_text = ' '.join(tokenized_text)
    return combined_text

# Reading

In [6]:
# example pipeline work
df = read_train_data(data_dir, NUM_TRAIN=500)
df_orders, df_ranks = get_df_orders_and_ranks(df, data_dir)

print(f"Df shape is {df.shape}")

HBox(children=(FloatProgress(value=0.0, description='Train NBs', max=500.0, style=ProgressStyle(description_wi…


Df shape is (23461, 2)


# Feature Engineering

## Small features

### additional load

In [40]:
def load_entities(unload=False):
    global ner
    if not unload:
        ner = spacy.load("en_core_web_sm")
    else:
        del ner

def load_ml_glossary(unload=False):
    global embedder, terms, corpus_embeddings
    if not unload:
        embedder = SentenceTransformer('all-MiniLM-L6-v2')
        ml_glossary = pd.read_csv("machine_learning_glossary_terms.csv")
        # sentences we will be searching through
        corpus = np.array(ml_glossary['definition'])
        terms = np.array(ml_glossary['term'])
        corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
    else:
        del embedder, corpus_embeddings
        gc.collect()

### general implementations

In [70]:
additional_load = {
    "collect_entities": load_entities,
    "get_top_glossary_terms": load_ml_glossary
}

def prepare_data(df, func, new_column, on_column='source'):
    tic = time()
    func_name = func.__name__
    print(f"{func_name} on {on_column} -> {new_column} column")
    if func_name in additional_load:
        additional_load[func_name]()
    df[new_column] = df.progress_apply(lambda x: func(x[on_column]), axis=1)
    if func_name in additional_load:
        additional_load[func_name](unload=True)
    toc = time()
    print(f"COMPLETION TIME = {toc-tic:.1f} s")
    print("-"*25+">")
    print("\n")
    return df

def prepare_data_with_vector(df, func, new_column, on_column='source'):
    tic = time()
    func_name = func.__name__
    if func_name in additional_load:
        additional_load[func_name]()
    print(f"{func.__name__} on {on_column} -> {new_column} column ...")
    vector = df.progress_apply(lambda x: func(x[on_column]), axis=1).values
    if func_name in additional_load:
        additional_load[func_name](unload=True)
    new_columns = [f"{new_column}_{x}" for x in range(len(vector[0]))]
    vector_df = pd.DataFrame(np.row_stack(vector), columns=new_columns)  
    # todo return vector and the stack them
    df = pd.concat([df, vector_df], ignore_index=False, axis=1)
    
    toc = time()
    print(f"COMPLETION TIME = {toc-tic:.1f} s")
    print("-"*25+">")
    print("\n")
    return df

### tokenizing python

In [71]:
def tokenize_python_code_names(text):
    try:
        code_text = tokenize.generate_tokens(io.StringIO(text).readline)
        strings = [tok.string for tok in code_text if tok.type==1]
    except Exception:
        return None # Error happened
    return ' '.join(strings)

def tokenize_python_code_comments(text):
    try:
        code_text = tokenize.generate_tokens(io.StringIO(text).readline)
        strings = [tok.string for tok in code_text if tok.type==55]
    except Exception:
        return None # Error happened
    return ' '.join(strings)

### lemming

In [72]:
lem = nltk.stem.wordnet.WordNetLemmatizer()  # lemmatizer  
def lemm_sentence(text):
    lst_txt = [lem.lemmatize(word) for word in text.split()]
    return ' '.join(lst_txt)

### entities

In [73]:
def collect_entities(text):
    "Named Entity Recognition"
    rs = ner(text)
    labels = []
    for r in rs.ents:
        labels.append(r.label_)
    return ' '.join(labels)

### heading order

In [74]:
def get_hedding_order(text) -> int:
    """ Get heading order """
    text = text.strip()
    heading_number = text.split(" ")[0].count("#")
    if len(text.split(" ")[0]) != heading_number or not heading_number:
        return None
    return heading_number

### TFID

In [75]:
def convert_to_TfidfVector(df, max_features=100):
    tfidf = TfidfVectorizer(min_df=0.01, max_features=max_features)
    return tfidf, tfidf.fit_transform(df.astype(str))


## ALPHA 1 todo list


## Machine Learning glossary feature

In [76]:
# better do not vector but words and the put them to TFID
top_k=5
def get_top_glossary_terms(query):
    output_vector = np.zeros((len(terms)))
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k, query_chunk_size=500)
    # todo add threshold 0.5? mb
    output_vector[[hit['corpus_id'] for hit in hits[0]]] = [hit['score'] for hit in hits[0]]
    return output_vector
# Adds len(terms) = 249 columns to dataset

# Pipeline

## markdown pipeline

In [77]:
markdowns_pipeline = Pipeline([    
    ('Heading order', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': get_hedding_order,
             'new_column': "heading_order",
             'on_column': "source"})),
    
    ('Replacing image with label', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': replace_image_with_label,
             'new_column': "source",
             'on_column': "source"})),
    
    ('Clearing and parsing source', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': text_preprocessing,
             'new_column': "source_clean",
             'on_column': "source"})),
    
    ('Spelling', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': correct_spelling,
             'new_column': "source_clean",
             'on_column': "source_clean"})),
    
    ('Entities', FunctionTransformer(
         func=prepare_data,
         kw_args={
         'func': collect_entities,
         'new_column': "entities",
         'on_column': "source_clean"})),
    
    ('ML glossary feature', FunctionTransformer(
         func=prepare_data_with_vector,
         kw_args={
             'func': get_top_glossary_terms,
             'new_column': "glossary_ml_terms",
             'on_column': "source_clean"})),

##### COMMENT FOR BERT
    ('Lemmatizing sentences', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': lemm_sentence,
             'new_column': "source_clean",
             'on_column': "source_clean"})),
])

## code pipeline

### code comments pipeline

In [78]:
code_comments_sub_pipeline = Pipeline([
    
    ('[comments] Extracting code comments', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': tokenize_python_code_comments,
             'new_column': "code_comments",
             'on_column': "source"})),
    
    ('[comments] Clearing and parsing code comments', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': text_preprocessing,
             'new_column': "code_comments",
             'on_column': "code_comments"})),
    
    ('[comments] Correcting spelling', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': correct_spelling,
             'new_column': "code_comments",
             'on_column': "code_comments"})),
    
    ('[comments] Lemmatizing code comments', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': lemm_sentence,
             'new_column': "code_comments",
             'on_column': "code_comments"})),
        # todo make comments statistics and implement it if column is not empty
    
    ('[comments] Len of code comments', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': lambda x: len(str(x).split()),
             'new_column': "len_of_code_comments",
             'on_column': "code_comments"})),
])

### code pipeline

In [79]:
code_pipeline = Pipeline([
    ('Clearing and parsing source', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': code_preprocessing,
             'new_column': "source_clean",  # TODO maybe even remove it or leave the same. check on gridsearch later
             'on_column': "source"})),
    
    ('Extacting code names', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': tokenize_python_code_names,
             'new_column': "python_code_names",
             'on_column': "source"})),
    
    ('Comments', code_comments_sub_pipeline)
])

## counting features pipe

In [80]:
stats_pipeline = Pipeline([
    ('Length', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': lambda x: len(str(x).split()),
             'new_column': "text_len",
             'on_column': "source_clean"})),
    ('Word count', FunctionTransformer(
         func=prepare_data,
         kw_args={
             'func': lambda x: len(str(x)),
             'new_column': "text_word_count",
             'on_column': "source_clean"})),

])

## Pipeline helper func

In [81]:
def preprocess_data(df):
    df['index_col'] = range(1, len(df) + 1)  # to merge later
    
    # splitting on markdown and code
    markdowns = df[df['cell_type'] == 'markdown'].reset_index()
    codes = df[df['cell_type'] == 'code'].reset_index()

    print(f"Df shape is {df.shape} markdowns {markdowns.shape} code {codes.shape}")
    
    markdowns = markdowns_pipeline.fit_transform(markdowns)
    markdowns = stats_pipeline.fit_transform(markdowns)
    codes = code_pipeline.fit_transform(codes)
    codes = stats_pipeline.fit_transform(codes)
    
    # setting rank
    codes['rank'] = codes.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1
    markdowns['rank'] = 0

    # Stacking back together
    df = pd.concat([codes, markdowns]).set_index(['id', 'cell_id', 'index_col'])
    
    return df

# Running

In [82]:
df = preprocess_data(df)

ValueError: cannot insert index_col, already exists

# Training

## splitting

In [53]:
NVALID = 0.1

ids = df.index.unique('id')
ancestors = get_ancestors(data_dir, ids)
splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)
ids_train, ids_valid = next(splitter.split(ids, groups=ancestors)) 
ids_train, ids_valid = ids[ids_train], ids[ids_valid]
df_train = df.loc[ids_train, :].sort_index(level='index_col')
df_valid = df.loc[ids_valid, :].sort_index(level='index_col')

## transforming

In [54]:
TFID_FEATURES = {
    'entities': None,
    # "glossary_ml_terms": None,
    "python_code_names": None,
    "code_comments": None,
    "source_clean": None,
} # todo gridsearch

In [55]:
terms_cols = [x for x in df.columns if x.startswith("glossary_ml_terms")]

In [56]:
gc.collect()

125

In [57]:
terms_vals = scipy.sparse.csr_matrix(df_train[terms_cols].fillna(0.0).values)

In [58]:
tfid_transformers = {}
vector_shapes = []
X_train = np.array([])
for feature, max_n in tqdm(TFID_FEATURES.items()):
    tfid_tr, tfid_vector = convert_to_TfidfVector(df_train[feature], max_n)
    vector_shapes.extend(tfid_vector.shape[1]*[feature])
    print(f"New {feature} vector TFID {tfid_vector.shape}")
    tfid_transformers[feature] = tfid_tr
    if not X_train.shape[0]:
        X_train = tfid_vector
    else:
        X_train = sparse.hstack((X_train, tfid_vector))

X_train = sparse.hstack((
    X_train, 
    np.where(
        df_train['cell_type'] == 'code',
        df_train.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
    ))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

New entities vector TFID (42429, 7)
New python_code_names vector TFID (42429, 104)
New code_comments vector TFID (42429, 35)
New source_clean vector TFID (42429, 214)



In [59]:
X_train = sparse.hstack((
    X_train, 
    terms_vals
    ))

In [68]:
# Rank of each cell within the notebook
y_train = df_ranks.loc[ids_train].to_numpy()
# Number of cells in each notebook
groups = df_ranks.loc[ids_train].groupby('id').size().to_numpy()

In [116]:
from xgboost import XGBRanker

model = XGBRanker(
    min_child_weight=10,
    subsample=0.5,
    tree_method='hist',
)
model.fit(X_train, y_train, group=groups)

XGBRanker(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None,
          enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1,
          grow_policy='depthwise', importance_type=None,
          interaction_constraints='', learning_rate=0.300000012, max_bin=256,
          max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
          min_child_weight=10, missing=nan, monotone_constraints='()',
          n_estimators=100, n_jobs=0, num_parallel_tree=1,
          objective='rank:pairwise', predictor='auto', random_state=0,
          reg_alpha=0, ...)

In [70]:
X_valid = np.array([])
for feature, _ in tqdm(TFID_FEATURES.items()):
    transformer = tfid_transformers[feature]
    tfid_vector = transformer.transform(df_valid[feature].astype('str'))
    if not X_valid.shape[0]:
        X_valid = tfid_vector
    else:
        X_valid = sparse.hstack((X_valid, tfid_vector))

X_valid = sparse.hstack((
    X_valid, 
    np.where(
        df_valid['cell_type'] == 'code',
        df_valid.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
        0,
    ).reshape(-1, 1)
    ))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [71]:
terms_vals = scipy.sparse.csr_matrix(df_valid[terms_cols].fillna(0.0).values)
X_valid = sparse.hstack((
    X_valid, 
    terms_vals
    ))

In [117]:
y_valid = df_orders.loc[ids_valid]

In [118]:
y_pred = pd.DataFrame({'rank': model.predict(X_valid)}, index=df_valid.index)
y_pred = (
    y_pred
    .sort_values(['id', 'rank'])  # Sort the cells in each notebook by their rank.
                                  # The cell_ids are now in the order the model predicted.
    .reset_index('cell_id')  # Convert the cell_id index into a column.
    .groupby('id')['cell_id'].apply(list)  # Group the cell_ids for each notebook into a list.
)

In [119]:
y_dummy = df_valid.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
kendall_tau(y_valid, y_dummy) 

0.283291481553023

In [120]:
kendall_tau(y_valid, y_pred)
# 0.395 on 1k

0.39578134054471703

In [89]:
kendall_tau(y_valid, y_pred) # 0.39

0.3743558045199923

In [121]:
vector_shapes.extend(terms_cols)

vector_shapes = np.array(vector_shapes)

features_importance = model.feature_importances_[:-1]

In [122]:
k=10000
position_shift = 0
for x in features_importance.argsort()[-k:][::-1]:
    feature_name = vector_shapes[x]
    print(f"feature vector {feature_name} ")
    position_shift = np.where(vector_shapes == feature_name)[0][0]
    if "ml_terms" in feature_name:
        print(
            f"{terms[int(feature_name.split('_')[-1])]} -> {model.feature_importances_[x]}"
        )
    else:
        print(
            f"{tfid_transformers[feature_name].get_feature_names()[x-position_shift]} -> \
{model.feature_importances_[x]}"
        )
    print("-"*30)

feature vector glossary_ml_terms_0 
validation -> 0.016306407749652863
------------------------------
feature vector glossary_ml_terms_71 
dashboard -> 0.005712929181754589
------------------------------
feature vector glossary_ml_terms_39 
adam_optimization -> 0.0053808619268238544
------------------------------
feature vector source_clean 
kaggle -> 0.005372083745896816
------------------------------
feature vector source_clean 
link -> 0.005234787240624428
------------------------------
feature vector source_clean 
missing -> 0.005044576246291399
------------------------------
feature vector source_clean 
out -> 0.005032229702919722
------------------------------
feature vector glossary_ml_terms_212 
independent_variable -> 0.004935173783451319
------------------------------
feature vector glossary_ml_terms_153 
response_variable -> 0.004728739615529776
------------------------------
feature vector glossary_ml_terms_136 
ordinal_variable -> 0.004659112077206373
---------------------