In [17]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

import nltk
import re
import string

# Preparation

# Features

In [None]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the resulting token sets.

    Parameters
    ----------
    str1, str2 : str
        Strings to compare.

    Returns
    -------
    float
        Similarity in ``[0.0, 1.0]``. When neither string contains any token
        the two (empty) sets are identical and ``1.0`` is returned instead of
        dividing by zero.
    """
    tokens_a = set(str1.lower().split())
    tokens_b = set(str2.lower().split())
    shared = tokens_a & tokens_b
    # |A | B| computed by inclusion-exclusion.
    union_size = len(tokens_a) + len(tokens_b) - len(shared)
    if union_size == 0:
        # Both inputs are empty / whitespace only: avoid ZeroDivisionError.
        return 1.0
    return len(shared) / union_size


def clean_text(text):
    """Normalise free text before tokenisation.

    Steps, in order: lower-case and strip the ends, drop ``[...]`` spans,
    drop ``http(s)://`` and ``www.`` links, drop ``<...>`` tags, drop ASCII
    punctuation, turn newlines into spaces, and drop every word that
    contains a digit.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Cleaned text. Runs of spaces left behind by the removals are not
        collapsed.
    """
    text = text.lower().strip()
    # Raw strings keep the regex escapes intact and avoid invalid-escape warnings.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace line breaks with a space (not ''), otherwise the last word of a
    # line is fused with the first word of the next one.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def text_preprocessing(text):
    """Clean a piece of text and rebuild it from its word tokens.

    The input is passed through ``clean_text``, split into ``\\w+`` tokens
    with an NLTK ``RegexpTokenizer``, and the tokens are joined back with
    single spaces.
    """
    cleaned = clean_text(text)
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return ' '.join(word_tokenizer.tokenize(cleaned))

def clean_code(text):
    """Reduce a code snippet to a space-separated stream of identifiers.

    Steps, in order: brackets, braces, parentheses, ``=`` and ``,`` become
    spaces; the text is lower-cased; underscores are removed (so
    ``my_var`` becomes ``myvar``); newlines and dots become spaces; quoted
    string literals are dropped; stand-alone integers are dropped; runs of
    spaces are collapsed and the ends are stripped.

    Parameters
    ----------
    text : str
        Raw source code.

    Returns
    -------
    str
        Cleaned code text.
    """
    for separator in '[](){}=,':
        text = text.replace(separator, ' ')
    text = text.lower()
    text = text.replace('_', '')
    text = text.replace('\n', ' ')
    text = text.replace('.', ' ')
    # Non-greedy: the text is a single line at this point, so a greedy match
    # would delete everything between the first and the last quote.
    text = re.sub(r'".*?"', ' ', text)
    text = re.sub(r"'.*?'", ' ', text)
    # Lookarounds leave the separating whitespace unconsumed, so consecutive
    # numbers ("1 2 3") are all removed, not just every other one.
    text = re.sub(r'(?<!\S)\d+(?!\S)', ' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip()

def code_preprocessing(text):
    """Clean a code snippet and rebuild it from its word tokens.

    The input is passed through ``clean_code``, split into ``\\w+`` tokens
    with an NLTK ``RegexpTokenizer``, and the tokens are joined back with
    single spaces.
    """
    cleaned = clean_code(text)
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return ' '.join(word_tokenizer.tokenize(cleaned))

In [9]:
def count_hastags(row):
    """Store the number of ``'# '`` occurrences of a markdown cell in ``hash_count``.

    ``row`` is a mapping with ``'cell_type'`` and ``'source'`` entries. For
    markdown rows the count of the substring ``'# '`` in ``source`` is
    written to ``row['hash_count']``; every other cell type gets ``0``.
    The row is modified in place and returned.
    """
    if row['cell_type'] == 'markdown':
        n_marks = row['source'].count('# ')
    else:
        n_marks = 0
    row['hash_count'] = n_marks
    return row

# Embeddings

# General

In [14]:
def get_ranks(base, derived):
    """Return, for each item of ``derived``, its position in ``base``.

    Parameters
    ----------
    base : sequence of hashable
        Reference ordering. If an item occurs more than once, its first
        position is used (same as ``list.index``).
    derived : iterable of hashable
        Items to look up.

    Returns
    -------
    list of int
        ``base`` positions, in the order of ``derived``.

    Raises
    ------
    ValueError
        If an item of ``derived`` is not present in ``base``.
    """
    # One pass to build item -> first position, instead of a linear
    # ``base.index`` scan per looked-up item.
    first_position = {}
    for position, item in enumerate(base):
        first_position.setdefault(item, position)
    try:
        return [first_position[item] for item in derived]
    except KeyError as err:
        # Keep the exception type ``list.index`` raised for missing items.
        raise ValueError(f"{err.args[0]!r} is not in list") from None

# Reading data

In [15]:
def read_train_data(data_dir, NUM_TRAIN = 10000):
    """Load up to ``NUM_TRAIN`` training notebooks into one DataFrame.

    Parameters
    ----------
    data_dir : pathlib.Path
        Directory containing a ``train`` sub-directory of ``*.json``
        notebooks. Must support ``/`` and ``.glob``.
    NUM_TRAIN : int, optional
        Maximum number of notebooks to read (default 10000).

    Returns
    -------
    pandas.DataFrame
        All notebooks concatenated, indexed by ``(id, cell_id)`` where ``id``
        is the file stem, and sorted on the ``id`` level only
        (``sort_remaining=False``).

    Raises
    ------
    ValueError
        If no ``*.json`` file is found under ``data_dir / 'train'``.
    """
    def read_notebook(path):
        # One notebook -> frame indexed by cell_id, tagged with its notebook id.
        return (
            pd.read_json(
                path,
                dtype={'cell_type': 'category', 'source': 'str'})
            .assign(id=path.stem)  # final path component
            .rename_axis('cell_id')
        )

    # glob() yields files in arbitrary, filesystem-dependent order; sort
    # before slicing so the selected subset is the same on every run.
    paths_train = sorted((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
    if not paths_train:
        # pd.concat([]) would raise a ValueError with an unhelpful message.
        raise ValueError(f"No notebooks (*.json) found in {data_dir / 'train'}")

    notebooks_train = [
      read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
    ]
    df = (
      pd.concat(notebooks_train)
      .set_index('id', append=True)
      .swaplevel()
      .sort_index(level='id', sort_remaining=False)
    )
    return df

def get_df_orders_and_ranks(df, data_dir):
    """Load the true cell orders and derive the rank of every cell in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cells indexed by ``(id, cell_id)``.
    data_dir : pathlib.Path
        Directory containing ``train_orders.csv`` (indexed by ``id``, with
        one column holding the space-separated ordered cell ids).

    Returns
    -------
    df_orders : pandas.Series
        For each notebook ``id`` in the CSV, the ordered list of cell ids.
    df_ranks : pandas.DataFrame
        Indexed by ``(id, cell_id)`` with a ``rank`` column: the position of
        each cell of ``df`` in its notebook's true order.
    """
    # train orders
    # ``read_csv(squeeze=True)`` was removed in pandas 2.0; squeezing the
    # single remaining column afterwards gives the same Series.
    df_orders = (
      pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
      .squeeze('columns')
      .str.split()  # cell_ids str -> list
    )

    df_orders_ = df_orders.to_frame().join(
      # reset only one index out of many -> "cell_id"; make a list out of cells in train data
      df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
      how='right',  # keep only the notebooks present in df
    )

    # itertuples yields (id, ordered cell ids, cell ids as they appear in df).
    ranks = {}
    for id_, cell_order, cell_id in df_orders_.itertuples():
        ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

    df_ranks = (
      pd.DataFrame
      .from_dict(ranks, orient='index')
      .rename_axis('id')
      .apply(pd.Series.explode)  # one row per cell instead of one list per notebook
      .set_index('cell_id', append=True)
    )
    # now we have
    # id cell_id rank
    return df_orders, df_ranks


def get_ancestors(data_dir, ids):
    """Look up the ``ancestor_id`` of the given notebook ids.

    Reads ``train_ancestors.csv`` from ``data_dir`` (indexed by ``id``) and
    returns the ``ancestor_id`` entries selected by ``ids``. Notebooks that
    share an ``ancestor_id`` have a common origin, so the result can serve
    as a grouping key that keeps them together when splitting.
    """
    ancestors_csv = data_dir / 'train_ancestors.csv'
    ancestors = pd.read_csv(ancestors_csv, index_col='id')
    return ancestors.loc[ids, 'ancestor_id']