In [22]:
import json
import import_ipynb
from importlib import reload
import utils
import inspect
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
import sys
import utils

from sklearn.model_selection import GroupShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path('data/')
sys.path.append(str('AI4Code'))
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
reload(utils)

importing Jupyter notebook from utils.ipynb


<module 'utils' from 'utils.ipynb'>

In [96]:
from utils import read_train_data, get_df_ranks, get_ancestors, count_hastags

In [98]:
df = read_train_data(data_dir, NUM_TRAIN=3000)
df_ranks = get_df_ranks(df, data_dir)


Train NBs:   0%|          | 0/3000 [00:00<?, ?it/s][A
Train NBs:   1%|          | 21/3000 [00:00<00:14, 204.88it/s][A
Train NBs:   1%|▏         | 42/3000 [00:00<00:14, 201.89it/s][A
Train NBs:   2%|▏         | 63/3000 [00:00<00:14, 202.89it/s][A
Train NBs:   3%|▎         | 84/3000 [00:00<00:14, 201.79it/s][A
Train NBs:   4%|▎         | 105/3000 [00:00<00:14, 201.15it/s][A
Train NBs:   4%|▍         | 126/3000 [00:00<00:14, 203.03it/s][A
Train NBs:   5%|▍         | 147/3000 [00:00<00:14, 202.63it/s][A
Train NBs:   6%|▌         | 168/3000 [00:00<00:13, 202.92it/s][A
Train NBs:   6%|▋         | 189/3000 [00:00<00:13, 203.32it/s][A
Train NBs:   7%|▋         | 210/3000 [00:01<00:13, 204.83it/s][A
Train NBs:   8%|▊         | 231/3000 [00:01<00:13, 203.66it/s][A
Train NBs:   8%|▊         | 252/3000 [00:01<00:13, 205.08it/s][A
Train NBs:   9%|▉         | 273/3000 [00:01<00:13, 205.78it/s][A
Train NBs:  10%|▉         | 294/3000 [00:01<00:13, 206.40it/s][A
Train NBs:  10%|█       

KeyboardInterrupt: 

# applying feature

In [None]:
df = df.apply(lambda row: count_hastags(row), axis=1)

Train NBs:  13%|█▎        | 1306/10000 [00:19<00:48, 177.75it/s]

# Smart splitting via group shuffling

In [58]:
NVALID = 0.1  # size of validation set

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

ids = df.index.unique('id')  # get all the unique ids
ancestors = get_ancestors(data_dir, ids)  # find ancestor by id if it exists
# split the ids using groups. This way the same group/notebooks will be in the test or in the training
ids_train, ids_valid = next(splitter.split(ids, groups=ancestors)) 
ids_train, ids_valid = ids[ids_train], ids[ids_valid]

df_train = df.loc[ids_train, :]
df_valid = df.loc[ids_valid, :]

print(f"Shape of train: {df_train.shape[0]}; validation: {df_valid.shape[0]}")

Shape of train: 416521; validation: 45150


## preparing training set

In [59]:
def mark_each_cell_with_its_position(current_X, full_df):
    """
    marking each cell with its number if its code, for markdown zero. 
    We are doing it to help the model learn the correct order in lines?
    """
    old_shape = current_X.shape
    current_X = sparse.hstack((
        current_X,
        np.where(
            full_df['cell_type'] == 'code',
            full_df.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1,
            0,
        ).reshape(-1, 1)
    ))
    new_shape = current_X.shape
    print(f"Shape change {old_shape} -> {new_shape}")
    return current_X

In [64]:
tfidf = TfidfVectorizer(min_df=0.01)  # idf(t) = log [ n / df(t) ] + 1, where df(t) – number of time term is used
def convert_to_TfidfVector(df):
    print("Converting with Tfid vectorizer")
    return tfidf.fit_transform(df.astype(str))

In [65]:
def add_custom_column_to_sparse(current_X, full_df, column):
    print(f"Added {column} to the dataframe")
    return sparse.hstack((
        current_X,
        full_df['hash_count'].values.reshape(-1, 1)
    ))

In [92]:
X_train = convert_to_TfidfVector(df_train['source'])

Converting with Tfid vectorizer


In [83]:
y_train = df_ranks.loc[ids_train].to_numpy()  # get all required train results
groups = df_ranks.loc[ids_train].groupby('id').size().to_numpy() # Number of cells in each notebook. will later be used to help xgboost make a ranking

In [84]:
X_train = mark_each_cell_with_its_position(X_train, df_train)

Shape change (416521, 276) -> (416521, 277)


In [85]:
X_train = add_custom_column_to_sparse(X_train, df_train, "hash_count")

Added hash_count to the dataframe


In [70]:
from xgboost import XGBRanker

model = XGBRanker(
    min_child_weight=10,
    subsample=0.5,
    tree_method='hist',
)
model.fit(X_train, y_train, group=groups)

XGBRanker(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None,
          enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1,
          grow_policy='depthwise', importance_type=None,
          interaction_constraints='', learning_rate=0.300000012, max_bin=256,
          max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0,
          min_child_weight=10, missing=nan, monotone_constraints='()',
          n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto',
          random_state=0, reg_alpha=0, reg_lambda=1, ...)

## validation

In [90]:
X_valid = convert_to_TfidfVector(df_valid['source'])

Converting with Tfid vectorizer


In [87]:
y_valid = df_ranks.loc[ids_valid]

In [88]:
X_valid = mark_each_cell_with_its_position(X_valid, df_valid)
X_valid = add_custom_column_to_sparse(X_valid, df_valid, "hash_count")

Shape change (45150, 277) -> (45150, 278)
Added hash_count to the dataframe


In [77]:
y_pred = pd.DataFrame({'rank': model.predict(X_valid)}, index=df_valid.index)
y_pred = (
    y_pred
    .sort_values(['id', 'rank'])  # Sort the cells in each notebook by their rank.
                                  # The cell_ids are now in the order the model predicted.
    .reset_index('cell_id')  # Convert the cell_id index into a column.
    .groupby('id')['cell_id'].apply(list)  # Group the cell_ids for each notebook into a list.
)

ValueError: Feature shape mismatch, expected: 278, got 279