In [None]:
# Environment check: print the version of the interpreter that backs this kernel.
import sys
!{sys.executable} --version
# List installed distributions whose name contains "pickle".
!pip list | grep pickle

In [None]:
# Install CatBoost into the environment of the running kernel.
# NOTE(review): the version is not pinned, so a rerun may pick up a different release.
%pip install catboost

In [1]:
import pickle
import pathlib
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

from catboost import Pool, CatBoostClassifier

#%load_ext lab_black

# Raise pandas' display limits so long text values and wide frames are not truncated.
pd.set_option("display.max_colwidth", 250)
pd.set_option("display.max_columns", 250)

In [2]:
# Colab-only step: mount Google Drive so the dataset can be copied to local disk.
from google.colab import drive

drive.mount('/content/drive')

# Local working directories: data/ and data/raw/.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)
(DATA_DIR / "raw").mkdir(exist_ok=True)

# Copy the dataset CSV from Drive into data/raw. The source path is relative, so this
# presumably relies on the working directory being /content — TODO confirm.
!cp drive/MyDrive/comp_data/openhack/kanunum-nlp-doc-analysis-dataset.csv data/raw

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read the raw dataset that the previous cell copied into data/raw.
df = pd.read_csv(Path(DATA_DIR).joinpath("raw", "kanunum-nlp-doc-analysis-dataset.csv"))

In [40]:
# Stratified K-fold cross-validation of a TF-IDF + CatBoost text classifier.
# For every fold the vectorizer is fitted on the training part only (no leakage),
# the model is trained with the held-out part as eval set, and — when OOF is on —
# the held-out predictions are collected into a single out-of-fold frame `oof`
# with columns: actual, prediction, prediction_proba.
LABEL = "kategori"
TEXT_FIELD = "data_text"
N_SPLITS = 5
RANDOM_STATE = 42
OOF = True

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

model_signature = CatBoostClassifier
model_params = dict(auto_class_weights="Balanced", task_type="GPU", iterations=10)
fit_params = dict(verbose=100)
tfidf_kwargs = dict(sublinear_tf=True, min_df=5, ngram_range=(1, 1))

# Per-fold frames are gathered here and concatenated once after the loop.
oof_folds = []

for fold_idx, (train_indices, test_indices) in enumerate(cv.split(df.index, df[LABEL])):
    # cv.split yields integer *positions*, so select rows with .iloc; .loc would
    # only be correct when the frame happens to carry a default RangeIndex.
    train_text = df[TEXT_FIELD].iloc[train_indices]
    train_labels = df[LABEL].iloc[train_indices]
    test_text = df[TEXT_FIELD].iloc[test_indices]
    test_labels = df[LABEL].iloc[test_indices]

    model = model_signature(**model_params)
    tfidf = TfidfVectorizer(**tfidf_kwargs)
    tfidf.fit(train_text)

    # The sparse TF-IDF matrices are densified before being handed to CatBoost.
    eval_set = Pool(tfidf.transform(test_text).toarray(), test_labels)

    model.fit(
        tfidf.transform(train_text).toarray(),
        train_labels,
        eval_set=eval_set,
        **fit_params,
    )

    if OOF:
        fold_proba = model.predict_proba(eval_set)
        oof_folds.append(
            test_labels.to_frame("actual").assign(
                # Flatten so the predictions form a plain 1-D column.
                prediction=np.asarray(model.predict(eval_set)).ravel(),
                # Probability of the most likely class for each row. Plain arrays
                # are assigned positionally, so rows stay aligned with test_labels.
                prediction_proba=fold_proba.max(axis=1),
            )
        )

# Downstream cells read `oof`; it stays an empty list when OOF is switched off.
oof = pd.concat(oof_folds) if OOF else []

In [None]:
# Per-class precision / recall / F1 computed on the out-of-fold predictions.
# `metrics` is sklearn.metrics, imported in the notebook's import cell.
print('\t\t\t\tCLASSIFICATION METRICS\n')
print(metrics.classification_report(oof["actual"], oof["prediction"]))

In [108]:
def train_fn(df, 
             test_ratio=0.25, 
             artifacts_dir=Path("artifacts"),
             random_state=None,
             ):
    """Train a TF-IDF + CatBoost classifier and persist both artifacts.

    The frame is split into a stratified train / hold-out pair, the vectorizer
    is fitted on the training texts only, and the hold-out part is used as the
    CatBoost eval set. The fitted vectorizer is written to
    ``<artifacts_dir>/tfidf.pickle`` and the model to ``<artifacts_dir>/model.cbm``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``data_text`` (input text) and ``kategori`` (label).
    test_ratio : float, default 0.25
        Fraction of rows held out as the eval set.
    artifacts_dir : path-like, default ``Path("artifacts")``
        Directory that receives the artifacts; created if missing.
    random_state : int or None, default None
        Seed for the train / hold-out split; ``None`` keeps the split random.

    Returns
    -------
    CatBoostClassifier
        The fitted model.

    Raises
    ------
    AssertionError
        If one of the required columns is missing from ``df``.
    """
    LABEL = "kategori"
    TEXT_FIELD = "data_text"

    assert TEXT_FIELD in df.columns and LABEL in df.columns

    artifacts_dir = Path(artifacts_dir)
    artifacts_dir.mkdir(parents=True, exist_ok=True)

    # Splitting df.index yields index *labels*, hence the .loc lookups below.
    train_indices, test_indices = train_test_split(
        df.index,
        stratify=df[LABEL],
        test_size=test_ratio,
        random_state=random_state,
    )

    model_signature = CatBoostClassifier
    model_params = dict(auto_class_weights="Balanced", task_type="GPU", iterations=10)
    fit_params = dict(verbose=100)
    tfidf_kwargs = dict(sublinear_tf=True, min_df=5, ngram_range=(1, 1))

    model = model_signature(**model_params)
    tfidf = TfidfVectorizer(**tfidf_kwargs)

    # Fit the vocabulary on the training texts only, so the hold-out set stays unseen.
    tfidf.fit(df.loc[train_indices, TEXT_FIELD])

    eval_set = Pool(
        tfidf.transform(df.loc[test_indices, TEXT_FIELD]).toarray(),
        df.loc[test_indices, LABEL],
    )

    model.fit(
        tfidf.transform(df.loc[train_indices, TEXT_FIELD]).toarray(),
        df.loc[train_indices, LABEL],
        eval_set=eval_set,
        **fit_params,
    )

    # Context manager guarantees the pickle is flushed and the handle is closed.
    with open(artifacts_dir / "tfidf.pickle", "wb") as fh:
        pickle.dump(tfidf, fh)
    # Pass a plain string: not every CatBoost release accepts path objects.
    model.save_model(str(artifacts_dir / "model.cbm"))

    return model

def predict_fn(df, artifacts_dir=Path("artifacts")):
    """Predict labels for ``df`` using the artifacts written by ``train_fn``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text column ``data_text``.
    artifacts_dir : path-like, default ``Path("artifacts")``
        Directory holding ``tfidf.pickle`` and ``model.cbm``.

    Returns
    -------
    numpy.ndarray
        Flattened (1-D) array of predictions, one entry per row of ``df``.

    Raises
    ------
    AssertionError
        If ``data_text`` is missing from ``df``.
    """
    TEXT_FIELD = "data_text"

    assert TEXT_FIELD in df.columns

    artifacts_dir = Path(artifacts_dir)

    # SECURITY: unpickling can execute arbitrary code — only point artifacts_dir
    # at files produced by a trusted train_fn run.
    with open(artifacts_dir / "tfidf.pickle", "rb") as fh:
        tfidf = pickle.load(fh)

    model = CatBoostClassifier()
    # Pass a plain string: not every CatBoost release accepts path objects.
    model.load_model(str(artifacts_dir / "model.cbm"))

    features = tfidf.transform(df.loc[:, TEXT_FIELD]).toarray()
    return model.predict(features).ravel()