### NOTES
---
Reference: [CodeBERT + Pairwise](https://www.kaggle.com/code/devrimzay/codebert-pairwise)

#### LIBRARIES

In [1]:
import re
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from scipy import sparse

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupShuffleSplit

import nltk
from nltk.stem import WordNetLemmatizer

#### UTILITIES

In [2]:
def read_notebook(path):
    """
    Load one notebook JSON file into a DataFrame of cells.

    Parameters
    ----------
    path
        Location of the notebook JSON file. It must expose ``.stem``
        (e.g. a ``pathlib.Path``); the stem is stored as the notebook id.

    Returns
    -------
    A DataFrame whose index is named ``cell_id``, holding the columns read
    from the file plus an ``id`` column set to the file stem.
    """
    column_dtypes = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_dtypes)
    # Tag every cell with the notebook it belongs to.
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')


def encode_orders(raw_data_order, ground_truth_order):
    """
    Encode the cell orders from cell_ids to numerical values.

    Parameters
    ----------
    raw_data_order
        The cell order as per the raw JSON data (any iterable of cell ids).
    ground_truth_order
        The actual logical cell order (any iterable of hashable cell ids).

    Returns
    -------
    A list with, for each cell of ``raw_data_order``, its position in
    ``ground_truth_order``.

    Raises
    ------
    ValueError
        If a cell of ``raw_data_order`` is missing from ``ground_truth_order``.
    """
    # Build the lookup once instead of calling list.index per cell, which
    # made the encoding quadratic in the number of cells.
    position = {}
    for rank, cell in enumerate(ground_truth_order):
        # Keep the first occurrence, exactly like list.index does.
        position.setdefault(cell, rank)

    try:
        return [position[cell] for cell in raw_data_order]
    except KeyError as err:
        # Preserve the exception type callers got from list.index.
        raise ValueError(f"{err.args[0]!r} is not in the ground truth order") from None

#### DATA

In [3]:
# Data folders, resolved relative to the current working directory.
data_dir = Path.cwd() / "AI4Code"
train_dir = data_dir / "train"

# Train orders: one whitespace-separated string of cell ids per notebook id,
# split into a list of cell ids.
# squeeze("columns") only collapses the column axis, so a file holding a single
# notebook still yields a Series (a bare squeeze() would return a scalar and
# break the .str accessor).
train_orders = (
    pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
    .squeeze("columns")
    .str.split()
)

# Train ancestors, indexed by notebook id.
train_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')

### UNPACKING PROCESS

In [4]:
# Number of notebook files to load: the list of training JSON paths is
# truncated to this many entries.
samples = 300

In [5]:
# Sample paths.
# glob yields directory entries in an OS-dependent order, so sort before
# slicing to select the same `samples` notebooks on every run and machine.
paths_train = sorted(train_dir.glob('*.json'))[:samples]
notebooks_train = [read_notebook(path) for path in tqdm(paths_train, desc='Loading')]

Loading: 100%|██████████████████████████████████████████████████████████████████████| 300/300 [00:01<00:00, 258.30it/s]


In [6]:
# Preview the first loaded notebook.
notebooks_train[0]

Unnamed: 0_level_0,cell_type,source,id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1862f0a6,code,# This Python 3 environment comes with many he...,00001756c60be8
2a9e43d6,code,import numpy as np\nimport pandas as pd\nimpor...,00001756c60be8
038b763d,code,import warnings\nwarnings.filterwarnings('igno...,00001756c60be8
2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14}),00001756c60be8
0beab1cd,code,"def evaluate_preds(train_true_values, train_pr...",00001756c60be8
9a78ab76,code,TRAIN_DATASET_PATH = '/kaggle/input/real-estat...,00001756c60be8
ebe125d5,code,train_df = pd.read_csv(TRAIN_DATASET_PATH)\ntr...,00001756c60be8
d9dced8b,code,train_df.dtypes,00001756c60be8
86497fe1,code,num_feat = list(train_df.select_dtypes(exclude...,00001756c60be8
e2c8e725,code,test_df = pd.read_csv(TEST_DATASET_PATH)\ntest...,00001756c60be8


In [7]:
# Stack all notebooks into one frame indexed by (id, cell_id), sorted by
# notebook id only (sort_remaining=False keeps cell_id out of the sort key).
train_data = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)  # index levels: (cell_id, id)
    .swaplevel()                   # index levels: (id, cell_id)
    .sort_index(level='id', sort_remaining=False)
)

#### HANDLING THE LABELS

In [8]:
# Get an example notebook: the id at position 42 among the unique notebook ids.
example_id = train_data.index.unique('id')[42]

In [9]:
# Unordered notebook: every cell of the example notebook, in the order stored in train_data.
nb = train_data.loc[example_id, :]
nb;  # the trailing semicolon suppresses the cell output

In [10]:
# Ordered notebook: look up the example's cell order in train_orders and
# select the cells in that order.
cell_order = train_orders.loc[example_id]
nb.loc[cell_order, :]

Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
21147235,markdown,# Machine intelligence Project:\n## Investigat...
6c01d0d2,code,# This Python 3 environment comes with many he...
5bd28595,markdown,# 1) Data Cleaning
b8fd3a8c,code,import numpy as np\nimport pandas as pd\nimpor...
a2501363,markdown,"#### First, we will have a view of the data in..."
...,...,...
9906172a,code,from sklearn.linear_model import RidgeClassifi...
5ca1d878,markdown,"# Ensembling of Logistic Regression,KNN, Deci..."
6dfce6b0,code,import statistics as st\nfinal_pred = np.array...
79ea8736,markdown,# Conclusion


In [11]:
# Add the numerically encoded cell ranks as the first column.
# DataFrame.insert works in place and raises ValueError when the column already
# exists, so re-running this cell used to fail. Dropping any previous 'rank'
# first makes the cell idempotent, and because `drop` returns a new frame the
# insert never touches data shared with `train_data`.
nb = nb.drop(columns='rank', errors='ignore')
nb.insert(0, 'rank', encode_orders(nb.index, cell_order))
nb

Unnamed: 0_level_0,rank,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6c01d0d2,1,code,# This Python 3 environment comes with many he...
b8fd3a8c,3,code,import numpy as np\nimport pandas as pd\nimpor...
94d3d43a,5,code,df.dropna()\ndf.head()
9ecece20,6,code,df.shape
808d31ab,7,code,df.columns
...,...,...,...
fb5c88e1,47,markdown,## B- Explorartory Data Analysis\n### After ge...
ec57b602,59,markdown,#### We have used age and waiting_days columns...
449510e4,87,markdown,# E- SVM Polynomial Kernel
9d30d44b,30,markdown,### 1) What is the ratio between males and fem...


In [12]:
# Pair, per notebook id, the ground-truth order from train_orders with the
# list of cell ids in raw data order (column `cell_id`). The right join keeps
# only the notebooks present in train_data.
cell_orders = (
    train_orders
    .to_frame()
    .join(
        train_data.reset_index("cell_id").groupby("id")["cell_id"].apply(list),
        how='right',
    )
)

In [13]:
# Encode all the ranks: for every notebook keep its raw cell ids together with
# each cell's position in the ground-truth order.
# A dict comprehension keeps its loop variables local, so the example
# notebook's `cell_order` defined earlier is no longer overwritten (the former
# `for` loop leaked `id_`, `cell_order` and `cell_id` into the namespace).
ranks = {
    nb_id: {'cell_id': raw_order, 'rank': encode_orders(raw_order, true_order)}
    for nb_id, true_order, raw_order in cell_orders.itertuples()
}

# Create train ranks dataframe: one row per (id, cell_id) with its rank.
train_ranks = (
    pd.DataFrame.from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)  # one row per cell instead of one list per notebook
    .set_index('cell_id', append=True)
)

In [14]:
# Combine the cell contents with their encoded ranks (matched on notebook id
# and cell id) and with the ancestor table (matched on notebook id).
train = (
    train_data.reset_index()
    .merge(train_ranks, on=['id', 'cell_id'])
    .merge(train_ancestors, on=['id'])
)

In [15]:
# Relative position of each cell inside its notebook: rank / number of cells.
# `rank` is produced by `explode`, which may leave it with object dtype; cast
# it so that `pct_rank` is a numeric float column rather than an object column.
train["pct_rank"] = (
    train["rank"].astype(float)
    / train.groupby("id")["cell_id"].transform("count")
)

### SOURCE DATA PREPROCESSING

In [16]:
# Download the NLTK resources (presumably the ones WordNetLemmatizer needs —
# TODO confirm). The downloader reports packages that are already up to date.
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moses\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\moses\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [17]:
# Get the source text into a dictionary mapping cell_id -> source.
# If a cell_id occurs more than once, the last occurrence wins.
source_dict = dict(zip(train['cell_id'].values, train['source'].values))

# Initialize the lemmatizer (a WordNetLemmatizer, despite the variable name `stemmer`).
stemmer = WordNetLemmatizer()

def preprocess_text(document, lemmatizer=None):
    """
    Normalize a piece of source text into a space-separated string of lemmas.

    The text is stripped of non-word characters and stray single letters,
    lower-cased, lemmatized token by token, and finally reduced to the
    lemmas that are longer than three characters.

    Parameters
    ----------
    document
        The text to clean; it is converted with ``str()`` first.
    lemmatizer
        Object exposing ``lemmatize(word)``. Defaults to the notebook-level
        ``stemmer`` instance.

    Returns
    -------
    The preprocessed text as a single string (possibly empty).
    """
    if lemmatizer is None:
        lemmatizer = stemmer

    # Replace every non-word character with a space.
    document = re.sub(r'\W', ' ', str(document))

    # Remove single letters that stand between whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the very start. The caret must be unescaped to
    # anchor at the start; the former r'\^' looked for a literal '^', which
    # cannot exist after the substitution above. This also covers a leading
    # 'b' followed by whitespace, previously handled by a separate pass.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Collapse runs of whitespace into one space.
    document = re.sub(r'\s+', ' ', document)

    # Lower-case, lemmatize each token, then keep lemmas longer than three
    # characters (the length filter is applied after lemmatization).
    lemmas = (lemmatizer.lemmatize(word) for word in document.lower().split())
    return ' '.join(lemma for lemma in lemmas if len(lemma) > 3)

In [18]:
# Run the text preprocessing over every cell's source, replacing the column.
train["source"] = train["source"].apply(preprocess_text)

#### CREATING VALIDATION SET

In [19]:
val_size = 0.1  # Size of validation set, passed to the splitter as test_size
# A single group-aware shuffle split with a fixed random_state.
splitter = GroupShuffleSplit(n_splits=1, test_size=val_size, random_state=0)
# Group rows by ancestor_id and take the only (train, validation) index pair
# the splitter yields.
train_idx, val_idx = next(splitter.split(train, groups=train["ancestor_id"]))

In [20]:
# The splitter returns positional indices, so select rows by position with
# `.iloc` rather than by label with `.loc`. This stays correct even when
# `train` does not carry a default 0..n-1 index.
trainset = train.iloc[train_idx].reset_index(drop=True)
valset = train.iloc[val_idx].reset_index(drop=True)

In [21]:
# Ensure no mix of notebooks with the same ancestors in the two sets:
# the ancestor_ids common to both sets are displayed, so an empty array
# means the sets share no ancestor.
np.intersect1d(trainset['ancestor_id'].unique(), valset['ancestor_id'].unique())

array([], dtype=object)