# Intro

## loading required data and libs

## importing

In [1]:
import json
import inspect
from pathlib import Path
import pylev
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer, util
import torch

import nltk
import re
import string

import numpy as np
import pandas as pd
from scipy import sparse
import sys
import textwrap

from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer

# Widen pandas' console rendering so long cell sources stay readable in outputs.
pd.options.display.width = 180
pd.options.display.max_colwidth = 120

# Directory the reading helpers below resolve the competition files against.
data_dir = Path('data/')
# Make the local 'AI4Code' directory importable (str() on a str literal is a no-op).
sys.path.append(str('AI4Code'))
%load_ext autoreload
%autoreload 2




In [2]:
import warnings

# SettingWithCopyWarning lives in pandas.errors on newer pandas releases and in
# pandas.core.common on older ones; it may also be absent altogether. Try each
# location so this cell does not abort the notebook with an ImportError.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

# Only install the filter when the warning class actually exists.
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

## text utils

In [3]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of the two strings' word sets.

    Both strings are lower-cased and split on whitespace; the result is
    ``len(intersection) / len(union)`` of the resulting word sets, as a float
    in ``[0, 1]``.

    When neither string contains a word the union is empty and the ratio is
    undefined; ``0.0`` is returned instead of raising ``ZeroDivisionError``.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    union_size = len(words_a | words_b)
    if union_size == 0:
        return 0.0
    return len(words_a & words_b) / union_size


def clean_text(text):
    """Normalise free text for bag-of-words style features.

    Steps, in order: lower-case and strip the text, drop ``[...]`` spans,
    drop ``http(s)://`` / ``www.`` links, drop ``<...>`` tags, drop ASCII
    punctuation, turn newlines into spaces, and drop every word that
    contains a digit.

    Returns the cleaned string. Runs of spaces left behind by the removals
    are not collapsed here.
    """
    text = text.lower()
    text = text.strip()
    # Raw strings: '\[' / '\S' / '\w' in a normal literal are invalid escape
    # sequences and emit warnings on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace (not delete) newlines, otherwise the last word of one line is
    # glued to the first word of the next ("one\ntwo" -> "onetwo").
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def text_preprocessing(text):
    """Clean ``text`` with ``clean_text`` and return its word tokens joined by single spaces.

    Tokens are the ``\\w+`` matches found by an NLTK ``RegexpTokenizer``.
    """
    cleaned = clean_text(text)
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return ' '.join(word_tokenizer.tokenize(cleaned))

def clean_code(text):
    """Normalise a code cell's source into a space-separated string of tokens.

    Steps, in order: replace brackets, braces, parentheses, ``=`` and ``,``
    with spaces; lower-case; delete underscores; turn newlines and dots into
    spaces; drop quoted string literals; drop stand-alone integers; collapse
    runs of spaces and strip the ends.

    Returns the cleaned string.
    """
    for separator in '[](){}=,':
        text = text.replace(separator, ' ')
    text = text.lower()
    text = text.replace('_', '')
    text = text.replace('\n', ' ')
    text = text.replace('.', ' ')
    # Non-greedy: newlines are already spaces, so the whole cell is one line
    # and a greedy '.*' would delete everything between the first and the
    # last quote, including all code in between.
    text = re.sub(r'".*?"', ' ', text)
    text = re.sub(r"'.*?'", ' ', text)
    # Lookarounds instead of consuming the surrounding whitespace, so that
    # consecutive numbers ("1 2 3") are all removed rather than every other one.
    text = re.sub(r'(?<!\S)\d+(?!\S)', ' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip()

def code_preprocessing(text):
    """Clean ``text`` with ``clean_code`` and return its word tokens joined by single spaces.

    Tokens are the ``\\w+`` matches found by an NLTK ``RegexpTokenizer``.
    """
    cleaned = clean_code(text)
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return ' '.join(word_tokenizer.tokenize(cleaned))

def count_hastags(row):
    """Store in ``row['hash_count']`` how often ``'# '`` occurs in a markdown cell's source.

    Non-markdown rows get ``0``. The row is modified in place and returned.
    """
    if row['cell_type'] == 'markdown':
        n_hashes = row['source'].count('# ')
    else:
        n_hashes = 0
    row['hash_count'] = n_hashes
    return row

## reading utils

In [9]:
def get_ranks(base, derived):
    """Return, for every item of ``derived``, its position in ``base``.

    Positions are looked up in a dict built once from ``base`` instead of
    calling ``base.index`` per item, which makes the whole call linear rather
    than quadratic. As with ``list.index``, the first occurrence wins when
    ``base`` contains duplicates. Items must be hashable.

    Raises:
        ValueError: if an item of ``derived`` does not occur in ``base``.
    """
    first_position = {}
    for idx, item in enumerate(base):
        first_position.setdefault(item, idx)
    try:
        return [first_position[item] for item in derived]
    except KeyError as exc:
        # Keep the exception type list.index() would have raised.
        raise ValueError(f"{exc.args[0]!r} is not in list") from None

def read_train_data(data_dir, NUM_TRAIN = 10000):
    """Load up to ``NUM_TRAIN`` notebooks from ``data_dir / 'train'`` into one DataFrame.

    Args:
        data_dir: ``pathlib.Path`` of the directory containing the ``train`` folder.
        NUM_TRAIN: maximum number of ``*.json`` notebooks to read.

    Returns:
        A DataFrame indexed by ``(id, cell_id)``, where ``id`` is the file stem of
        the notebook and ``cell_id`` is the row label ``pd.read_json`` assigned
        to each cell. Rows are sorted by ``id`` only.
    """
    def read_notebook(path):
        """Read one notebook JSON and tag every row with the notebook id (file stem)."""
        return (
            pd.read_json(
                path,
                dtype={'cell_type': 'category', 'source': 'str'})
            .assign(id=path.stem)  # final path component
            .rename_axis('cell_id')
        )

    # Path.glob yields entries in arbitrary, filesystem-dependent order; sort
    # first so the NUM_TRAIN-notebook subset is the same on every run and machine.
    paths_train = sorted((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
    notebooks_train = [
        read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
    ]
    df = (
        pd.concat(notebooks_train)
        .set_index('id', append=True)
        .swaplevel()
        .sort_index(level='id', sort_remaining=False)
    )
    return df

def get_df_orders_and_ranks(df, data_dir):
    """Load the ground-truth cell orders and derive each loaded cell's rank.

    Args:
        df: cell DataFrame indexed by ``(id, cell_id)``.
        data_dir: ``pathlib.Path`` of the directory containing ``train_orders.csv``.

    Returns:
        ``(df_orders, df_ranks)`` where ``df_orders`` is a Series indexed by
        notebook ``id`` holding the ordered list of cell ids read from the CSV,
        and ``df_ranks`` is a DataFrame indexed by ``(id, cell_id)`` with a
        ``rank`` column giving each cell's position in that ordered list.
    """
    # train orders
    # `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the
    # single remaining column explicitly works on old and new versions.
    df_orders = (
        pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
        .squeeze('columns')
        .str.split()  # cell_ids str -> list
    )

    df_orders_ = df_orders.to_frame().join(
        # reset only one index out of many -> "cell_id"; make a list out of cells in train data
        df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
        how='right',  # keep only the notebooks that are present in `df`
    )

    ranks = {}
    for id_, cell_order, cell_id in df_orders_.itertuples():
        ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

    df_ranks = (
        pd.DataFrame
        .from_dict(ranks, orient='index')
        .rename_axis('id')
        .apply(pd.Series.explode)  # one row per cell instead of one list per notebook
        .set_index('cell_id', append=True)
    )
    # now we have
    # id cell_id rank
    return df_orders, df_ranks


def get_ancestors(data_dir, ids):
    """Return the ``ancestor_id`` entries of ``train_ancestors.csv`` for the given notebook ids.

    The CSV is read from ``data_dir`` with ``id`` as its index; the result is
    selected with ``.loc`` so it follows the order of ``ids``. The caller uses
    it to keep notebooks with a common origin together when splitting.
    """
    ancestors_path = data_dir / 'train_ancestors.csv'
    ancestors_table = pd.read_csv(ancestors_path, index_col='id')
    selected = ancestors_table.loc[ids, 'ancestor_id']
    return selected

## Reading

In [278]:
# Read up to 10000 training notebooks into one (id, cell_id)-indexed frame.
df = read_train_data(data_dir, NUM_TRAIN=10000)
# Ground-truth cell order per notebook, and the rank of every loaded cell within it.
df_orders, df_ranks = get_df_orders_and_ranks(df, data_dir)

Train NBs: 100%|██████████| 10000/10000 [02:38<00:00, 63.21it/s]


# Processing for BERT
https://www.kaggle.com/code/parulpandey/eda-and-preprocessing-for-bert

## cleaning with regex

In [279]:
# Take independent copies: the next cell assigns a new column to each frame via
# .loc, and writing into a boolean-mask selection of `df` is what raises pandas'
# SettingWithCopyWarning (the result may be a temporary tied to `df`).
markdowns = df[df['cell_type'] == 'markdown'].copy()
codes = df[df['cell_type'] == 'code'].copy()

In [280]:
# Add a 'source_clean' column to each frame: code cells go through
# code_preprocessing, markdown cells through text_preprocessing. The values are
# assigned as a plain array, i.e. positionally rather than by index alignment.
codes.loc[:, 'source_clean'] = codes['source'].apply(str).apply(lambda x: code_preprocessing(x)).copy().values
markdowns.loc[:, 'source_clean'] = markdowns['source'].apply(str).apply(lambda x: text_preprocessing(x)).copy().values

In [281]:
# Rebind `df` to the cleaned rows. Note the new row order: every code row (of
# all notebooks) first, followed by every markdown row.
df = pd.concat([codes, markdowns])

## counting features

In [282]:
# Length of the cleaned source in characters.
df['text_len'] = df['source_clean'].astype(str).apply(len)
# Number of whitespace-separated words in the cleaned source.
df['text_word_count'] = df['source_clean'].apply(lambda x: len(str(x).split()))

## Detecting language

In [283]:
from langdetect import detect

In [284]:
def detect_language(row):
    """Return a language label for one cell row.

    Non-markdown rows are labelled ``'python'``. Markdown rows with more than
    two words (``row['text_word_count']``) are passed to ``detect``; shorter
    ones are labelled ``'en'`` without detection.

    Returns ``None`` when detection fails; in that case the offending
    ``source_clean`` is printed if it is non-empty.
    """
    try:
        if row['cell_type'] == 'markdown':
            if row['text_word_count'] > 2:
                lang = detect(row['source_clean'])
            else:
                lang = 'en'
        else:
            lang = 'python'
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long `df.apply` run.
        lang = None
        if row['source_clean']:
            print(f"Not found {row['source_clean']}")
    return lang

In [17]:
%%time
# df['lang'] = df.apply(detect_language, axis=1)
# maybe later groupby and then select the most popular group and mark with it the whole notebook

CPU times: user 40 s, sys: 334 ms, total: 40.3 s
Wall time: 40.5 s


## Machine Learning glossary feature

In [285]:
from sentence_transformers import SentenceTransformer, util
import torch

In [286]:
# Sentence-embedding model shared by the glossary corpus and the per-cell queries below.
embedder = SentenceTransformer('all-MiniLM-L6-v2')



In [287]:
# Glossary table; the cells below read its 'term' and 'definition' columns.
ml_glossary = pd.read_csv("machine_learning_glossary_terms.csv")

In [288]:
# sentences we will be searching through
corpus = np.array(ml_glossary['definition'])
terms = np.array(ml_glossary['term'])
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

## experiment

### implementing feature

In [289]:
# Placeholder with one slot per glossary term. `result_vector` is rebound to a
# (len(df), len(terms)) matrix in the feature-filling cell further down.
result_vector = np.zeros_like(terms)

In [290]:
top_k = 7


def get_top_glossary_terms(query):
    """Return the corpus indices of the ``top_k`` glossary hits for ``query``.

    The query is embedded with ``embedder`` and searched against
    ``corpus_embeddings`` via ``util.semantic_search``; the ``corpus_id`` of
    each hit for this (single) query is returned as a list.
    """
    encoded_query = embedder.encode(query, convert_to_tensor=True)
    hits_per_query = util.semantic_search(encoded_query, corpus_embeddings, top_k=top_k)
    best_matches = hits_per_query[0]
    return [match['corpus_id'] for match in best_matches]

In [None]:
# One row per cell, one column per glossary term; a 1 marks a term that is
# among the cell's top glossary hits.
result_vector = np.zeros(shape=(len(df), len(terms)))
for i, text in enumerate(df['source_clean']):
    # Progress marker every 10000 cells (each cell is embedded individually).
    if i and i % 10000 == 0:
        print("+10000")
    posititions = get_top_glossary_terms(text)
    result_vector[i, posititions] = 1

+10000
+10000
+10000
+10000
+10000
+10000
+10000
+10000


In [None]:
# Append the glossary indicator columns to `df`, re-using df's index so rows line
# up positionally. The new columns keep the default integer labels.
# Re-running this cell appends the columns again, because `df` is rebound.
df = pd.concat([df, pd.DataFrame(result_vector).set_index(df.index)], axis=1)

# Smart splitting via group shuffling

In [None]:
NVALID = 0.1  # fraction held out for validation (passed to the splitter as test_size)

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

ids = df.index.unique('id')  # unique notebook ids (the 'id' index level)
ancestors = get_ancestors(data_dir, ids)  # ancestor_id looked up for every notebook id
# Split the positions of `ids` using ancestor_id as the group label, so notebooks
# sharing an ancestor end up on the same side of the split.
ids_train, ids_valid = next(splitter.split(ids, groups=ancestors)) 
ids_train, ids_valid = ids[ids_train], ids[ids_valid]  # positions -> notebook ids

df_train = df.loc[ids_train, :]
df_valid = df.loc[ids_valid, :]

print(f"Shape of train: {df_train.shape[0]}; validation: {df_valid.shape[0]}")

## preparing training set

## Preparation utils

In [None]:
def mark_each_cell_with_its_position(current_X, full_df):
    """Append one feature column holding each code cell's running number.

    For every row of ``full_df`` the 1-based running index within its
    ``(id, cell_type)`` group is computed; code rows keep that number and all
    other rows get 0. The column is stacked to the right of ``current_X``,
    the shape change is printed, and the widened sparse matrix is returned.
    """
    old_shape = current_X.shape
    # 1-based counter of each row inside its (notebook id, cell type) group.
    running_index = full_df.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1
    is_code = full_df['cell_type'] == 'code'
    position_column = np.where(is_code, running_index, 0).reshape(-1, 1)
    current_X = sparse.hstack((current_X, position_column))
    new_shape = current_X.shape
    print(f"Shape change {old_shape} -> {new_shape}")
    return current_X

# With scikit-learn's default smooth_idf=True the weight is
# idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and
# df(t) is the number of documents that contain term t.
tfidf = TfidfVectorizer(min_df=0.01, max_features=100, stop_words='english')
def convert_to_TfidfVector(df, fit=True):
    """Turn a Series of texts into a sparse TF-IDF matrix using the shared ``tfidf``.

    Args:
        df: Series (or other object with ``astype``) of texts; values are cast to ``str``.
        fit: when True (default, the original behaviour) the vectorizer's
            vocabulary and IDF weights are (re-)learned from ``df`` before
            transforming. Pass False to reuse what was learned earlier, e.g.
            for validation or test data, so the columns keep their meaning.

    Returns:
        The sparse document-term matrix for ``df``.
    """
    print("Converting with Tfid vectorizer")
    texts = df.astype(str)
    if fit:
        return tfidf.fit_transform(texts)
    return tfidf.transform(texts)

def add_data_to_sparse(current_X, values):
    """Stack ``values`` as extra columns to the right of ``current_X``.

    Prints the shape of ``values`` and returns the widened sparse matrix.
    """
    print(f"Added {values.shape} to the dataframe")
    widened = sparse.hstack((current_X, values))
    return widened

## Splitting training

In [None]:
# use sklearn pipeline
X_train = convert_to_TfidfVector(df_train['source_clean'])
X_train = mark_each_cell_with_its_position(X_train, df_train)
X_train = add_data_to_sparse(X_train, df_train.iloc[:, 5:].values)

In [None]:
# Targets: the ground-truth rank of every cell of the training notebooks.
# NOTE(review): rows here must line up with the rows of X_train (built from
# df_train) — confirm both selections come back in the same order.
y_train = df_ranks.loc[ids_train].to_numpy()  # get all required train results
groups = df_ranks.loc[ids_train].groupby('id').size().to_numpy() # Number of cells in each notebook. will later be used to help xgboost make a ranking

# Basic model

In [None]:
from xgboost import XGBRanker

# Ranking model; `group` tells it how many consecutive rows belong to each notebook.
model = XGBRanker(
    min_child_weight=10,
    subsample=0.5,
    tree_method='hist',
)
model.fit(X_train, y_train, group=groups)

## validation

In [None]:
# Reuse the vocabulary and IDF weights learned on the training set. Calling
# convert_to_TfidfVector here would re-fit `tfidf` on the validation text, so the
# TF-IDF columns of X_valid would no longer correspond to the columns the model
# was trained on.
X_valid = tfidf.transform(df_valid['source_clean'].astype(str))
X_valid = mark_each_cell_with_its_position(X_valid, df_valid)
X_valid = add_data_to_sparse(X_valid, df_valid.iloc[:, 5:].values)

In [None]:
# Ground-truth ordered cell-id lists of the validation notebooks.
y_valid = df_orders.loc[ids_valid]

In [None]:
# One predicted score per validation cell, indexed like df_valid.
y_pred = pd.DataFrame({'rank': model.predict(X_valid)}, index=df_valid.index)
y_pred = (
    y_pred
    .sort_values(['id', 'rank'])  # Sort the cells in each notebook by their rank.
                                  # The cell_ids are now in the order the model predicted.
    .reset_index('cell_id')  # Convert the cell_id index into a column.
    .groupby('id')['cell_id'].apply(list)  # Group the cell_ids for each notebook into a list.
)

# Metrics

## metrics utils

In [None]:
from bisect import bisect


def count_inversions(a):
    """Count the pairs ``(i, j)`` with ``i < j`` and ``a[i] > a[j]``.

    Elements are inserted one by one into a sorted list; every element
    already in that list that sorts strictly after the new one forms an
    inversion with it.
    """
    seen_sorted = []
    total = 0
    for value in a:
        insert_at = bisect(seen_sorted, value)
        # Everything to the right of the insertion point is larger and came earlier.
        total += len(seen_sorted) - insert_at
        seen_sorted.insert(insert_at, value)
    return total


def kendall_tau(ground_truth, predictions):
    """Kendall-tau style score of predicted cell orders against the true orders.

    Args:
        ground_truth: iterable of lists, the true cell-id order per notebook.
        predictions: iterable of lists, the predicted order per notebook,
            paired with ``ground_truth`` by position.

    Returns:
        ``1 - 4 * inversions / sum(n * (n - 1))`` over all notebooks: 1.0 for
        a perfect order, -1.0 for a fully reversed one. When no notebook has
        at least two cells there is nothing to mis-order and 1.0 is returned
        (the unguarded formula would divide by zero).

    Raises:
        ValueError: if a predicted cell id is missing from its ground-truth list.
    """
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        # Position lookup built once per notebook instead of gt.index() per cell.
        first_position = {}
        for idx, cell in enumerate(gt):
            first_position.setdefault(cell, idx)
        try:
            # rank predicted order in terms of ground truth
            ranks = [first_position[cell] for cell in pred]
        except KeyError as exc:
            # Keep the exception type list.index() would have raised.
            raise ValueError(f"{exc.args[0]!r} is not in list") from None
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    if total_2max == 0:
        return 1.0
    return 1 - 4 * total_inversions / total_2max


## results

In [None]:
# Baseline: score the cells in the order they currently appear in df_valid,
# i.e. without using the model at all.
y_dummy = df_valid.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
kendall_tau(y_valid, y_dummy)

In [None]:
# Score of the model's predicted orders on the validation notebooks.
kendall_tau(y_valid, y_pred)