### Initialize - run without asking

In [1]:
# Importing Required Library
import pandas as pd
import lightgbm as lgb
 
# Similarly LGBMRegressor can also be imported for a regression model.
from lightgbm import LGBMClassifier

import os
import pandas as pd
import cv2
from box_utils.boxes import *
from data.visualization import Visualization

import glob
import pickle
import itertools
from tqdm import tqdm
from modules.kv_embedding_full_features import KVEmbedding
import numpy as np
import time
from sklearn.metrics import f1_score, precision_score, recall_score
import json
import warnings
warnings.filterwarnings("ignore")

class Preparation_data():
    def __init__(self):
        self.cols = ['k_id', 'k_text', 'k_box', 'v_id', 'v_text', 'v_box', 'k_embed', 'v_embed', 'width', 'height', 'fname']
        # self.scaler_path = "/home/chuongphung/projects/chatgpt/XGBoost/20231106_xgboost_weights/scaler.pkl"
        # self.scaler = self.load_scaler()
        self.device = "cuda"
        self.kv_embed = KVEmbedding(self.device)
        
    def preprocess_ser2re_batch_ver2(self, im_path, data = None):
            h, w = cv2.imread(im_path).shape[:2]
            f_name = os.path.basename(im_path)
            df_d = pd.DataFrame(data)
            df_d['width'] = w
            df_d['height'] = h
            df_d['fname'] = f_name
            re = []
            # start_time = time.time()
            # print("--------======", df_d.transcription.values)
            df_key = df_d[df_d.label.str.lower()=='question']
            if df_key.shape[0] == 0:
                # print("No question-answer pair was found")
                re = pd.DataFrame(re)
                return re
            df_key_transcription = df_key.transcription.values.tolist()
            df_key_transcription = self.kv_embed.embedding(df_key_transcription)
            df_key["embedding"] = df_key_transcription.tolist()
            
            df_value = df_d[df_d.label.str.lower()=='answer']
            if df_value.shape[0] == 0:
                # print("No question-answer pair was found")
                re = pd.DataFrame(re)
                return re
            df_value_transcription = df_value.transcription.values.tolist()
            df_value_transcription = self.kv_embed.embedding(df_value_transcription)
            df_value["embedding"] = df_value_transcription.tolist()
            
            for key in df_key.iterrows():
                linking = key[-1].linking
                for value in df_value.iterrows():
                    if [key[-1].id, value[-1].id] in linking:
                        link_label =1.0
                    else:
                        link_label =0.0
                    re.append({
                        'k_id': key[-1].id,
                        'k_text': str(key[-1].transcription),
                        'k_embed': key[-1].embedding,
                        'k_box': points2xyxy(key[-1].points),
                        'v_id': value[-1].id,
                        'v_text': value[-1].transcription,
                        'v_embed': value[-1].embedding,
                        'v_box': points2xyxy(value[-1].points),
                        'width': w,
                        'height': h,
                        'fname': os.path.basename(f_name),
                        'label': link_label
                    })
            # print("------------================time 2 for loops", time.time()-start_time)
            re = pd.DataFrame(re)
            if re.shape[0] == 0:
                # print("No question-answer pair was found")
                return re
        
            return re.reset_index(drop=True)

    def make_features(self, df:pd.DataFrame):
        """Create feature from dataframe

        Args:
            df (pd.DataFrame): input data

        Returns:
            pd.DataFrame: feature after process
        """
        # print(self.cols + ['label'])
        df = df[self.cols + ['label']]
        df.k_box = df.apply(lambda x: normalize_scale_bbox(x.k_box, x.width, x.height), axis=1)
        df.v_box = df.apply(lambda x:normalize_scale_bbox(x.v_box, x.width, x.height), axis=1)
        k_features = pd.DataFrame(df.k_box.tolist(), index=df.index, columns=['k_' + s for s in ['x1', 'y1', 'x2', 'y2']])
        v_features = pd.DataFrame(df.v_box.tolist(), index=df.index, columns=['v_' + s for s in ['x1', 'y1', 'x2', 'y2']])
        
        df = pd.concat([k_features, v_features, df[self.cols], df['label']], axis=1)
        
        df['k_cx'] = df.k_x1.add(df.k_x2).div(2)
        df['k_cy'] = df.k_y1.add(df.k_y2).div(2)
        
        df['v_cx'] = df.v_x1.add(df.v_x2).div(2)
        df['v_cy'] = df.v_y1.add(df.v_y2).div(2)
        
        df['fe1'] = abs(df.v_x1 - df.k_x1)
        df['fe2'] = abs(df.v_y1 - df.k_y1)
        df['fe3'] = abs(df.v_x1 - df.k_x2)
        df['fe4'] = abs(df.v_y1 - df.k_y2)
        df['fe5'] = abs(df.v_x2 - df.k_x1)
        df['fe6'] = abs(df.v_y2 - df.k_y1)
        df['fe7'] = abs(df.v_x2 - df.k_x2)
        df['fe8'] = abs(df.v_y2 - df.k_y2)
        df['fe9'] = abs(df.v_x2 - df.v_x1)
        df['fe10'] = abs(df.v_y2 - df.v_y1)
        df['fe11'] = abs(df.k_x2 - df.k_x1)
        df['fe12'] = abs(df.k_y2 - df.k_y1)
        
        df['fe13'] = df.apply(lambda x: cal_degrees([x.k_x1, x.k_y1], [x.v_x1, x.v_y1]), axis=1)
        df['fe14'] = df.apply(lambda x: cal_degrees([x.k_x2, x.k_y1], [x.v_x2, x.v_y1]), axis=1)
        df['fe15'] = df.apply(lambda x: cal_degrees([x.k_x2, x.k_y2], [x.v_x2, x.v_y2]), axis=1)
        df['fe16'] = df.apply(lambda x: cal_degrees([x.k_x1, x.k_y2], [x.v_x1, x.v_y2]), axis=1)
        df['fe17'] = df.apply(lambda x: cal_degrees([x['k_cx'], x['k_cy']], [x['v_cx'], x['v_cy']]), axis=1)
        
        df['fe18'] = df.apply(lambda x: boxes_distance([x.k_x1-x.v_x2, x.k_y2-x.v_y1],[x.v_x1-x.k_x2, x.v_y2-x.k_y1]), axis=1)
        df['fe19'] = df.apply(lambda x: dist_points([x.k_cx, x.k_cy], [x.v_cx, x.v_cy]), axis=1)
        
        # print("============//////////////============", np.array(df['k_embed'].values.tolist()))
        k_embed_df = pd.DataFrame(np.array(df['k_embed'].values.tolist())).add_prefix('fe20')
        # print("========================", k_embed_df.shape)
        df = pd.concat([df, k_embed_df], axis=1)
        v_embed_df = pd.DataFrame(np.array(df['k_embed'].values.tolist())).add_prefix('fe21')
        # print("========================", v_embed_df.shape)
        df = pd.concat([df, v_embed_df], axis=1)
        # print("========================", df.shape)
        cols = [c for c in df.columns if c.startswith('fe')] + ['label']

        return df[cols], df[self.cols]
    
    # def load_scaler(self):
    #     # print('Loading scaler post processing relation ...')
    #     if os.path.exists(self.scaler_path):
    #         with open(self.scaler_path, 'rb') as f_scaler:
    #             scaler = pickle.load(f_scaler)
    #         f_scaler.close()
    #         return scaler 
    #     else:
    #         print("Path to scaler not exist !")
    
    def run(self, im_path, data):
        data = self.preprocess_ser2re_batch_ver2(im_path, data)
        if len(data)==0:
            return [], [], []
        d_features, __ = self.make_features(data)
        
        X, y = d_features.values[:, :-1], d_features.values[:, -1]
        # print("===========X.shape=============", X.shape)
        # X_transform = self.scaler.transform(X)
        return X, y, data



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
preparation_engine = Preparation_data()

### Create training- eval data

In [3]:
path_train_dot_json = "/home/chuongphung/projects/chatgpt/XGBoost/dataset/no1_1_no3_1/final_qa_same_20230927/train/train.json"
path_train_image = "/home/chuongphung/projects/chatgpt/XGBoost/dataset/no1_1_no3_1/final_qa_same_20230927/train/image/"

#### Create training data

In [4]:
with open(path_train_dot_json, "rb") as f:
    infer_imgs = f.readlines()
print("-------------================Len train dataset: ", len(infer_imgs))
X_train = np.array([])
Y_train = np.array([])
for doc in tqdm(infer_imgs[:]):
    data_line = doc.decode('utf-8')
    substr = data_line.strip("\n").split("\t")
    im_path = os.path.join(path_train_image, substr[0])
    try:
        x_train, y_train, _ = preparation_engine.run(im_path, eval(substr[1]))
    except:
        print(substr[0])
        continue
    if len(y_train)!=0:
        if X_train.shape[0] == 0:
            X_train = x_train
        else:
            X_train = np.vstack((X_train, x_train))
        
        if Y_train.shape[0] == 0:
            Y_train = y_train
        else:
            Y_train = np.hstack((Y_train, y_train))
    # break
print(X_train.shape)
print(Y_train.shape)
print(Y_train)



100%|██████████| 100/100 [00:16<00:00,  6.01it/s]

(10961, 787)
(10961,)
[1. 0. 0. ... 1. 1. 1.]





#### Create eval data

In [5]:
path_eval_dot_json = "/home/chuongphung/projects/chatgpt/XGBoost/dataset/test_dataset/dataset2/val/val.json"
path_eval_image = "/home/chuongphung/projects/chatgpt/XGBoost/dataset/test_dataset/dataset2/val/image/"

In [6]:
with open(path_eval_dot_json, "rb") as f:
    infer_imgs = f.readlines()
print("-------------================Len train dataset: ", len(infer_imgs))
X_val = np.array([])
Y_val = np.array([])
DF_val = []
for doc in tqdm(infer_imgs[:]):
    data_line = doc.decode('utf-8')
    substr = data_line.strip("\n").split("\t")
    im_path = os.path.join(path_eval_image, substr[0])
    
    x_val, y_val, df_val = preparation_engine.run(im_path, eval(substr[1]))
    if len(y_val)!=0:
        if X_val.shape[0] == 0:
            X_val = x_val
        else:
            X_val = np.vstack((X_val, x_val))
        
        if Y_val.shape[0] == 0:
            Y_val = y_val
        else:
            Y_val = np.hstack((Y_val, y_val))
        if len(DF_val)==0:
            DF_val = df_val
        else:
            DF_val = pd.concat([DF_val, df_val], ignore_index=True,axis=0)
print(X_val.shape)
print(Y_val.shape)



100%|██████████| 10/10 [00:01<00:00,  8.17it/s]

(802, 787)
(802,)





### Scale data

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
SAVE_PATH = "20231220_lightgbm/"
os.makedirs(SAVE_PATH, exist_ok=True)

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
pickle.dump(scaler, open(os.path.join(SAVE_PATH, "scaler.pkl"), 'wb'))

### Train model

In [10]:
params = {
            'random_state': 1997,
            'n_estimators': 200,
            'n_jobs': 15,
            'max_depth': 70,
            'num_leaves': 100,
            'class_weight': 'balanced' #{0: 0.31, 1:0.69}
        }
model = LGBMClassifier(objective="binary", **params)
model.fit(X_train, Y_train)
print("----------================Eval ...")
print('Training accuracy {:.4f}'.format(model.score(X_train,Y_train)))
print('Testing accuracy {:.4f}'.format(model.score(X_val,Y_val)))
print('----------================Saving model ...')
with open(os.path.join(SAVE_PATH, 'clf.pkl'), 'wb') as f_cls:
    pickle.dump(model, f_cls, protocol=pickle.HIGHEST_PROTOCOL)
# Predicting the Target variable
pred = model.predict(X_val)
print("-----------===========precision_score", precision_score(Y_val, pred, average=None))
print("-----------===========recall_score", recall_score(Y_val, pred, average=None))
print("-----------===========f1_score", f1_score(Y_val, pred, average=None))

[LightGBM] [Info] Number of positive: 1128, number of negative: 9833
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036767 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 196396
[LightGBM] [Info] Number of data points in the train set: 10961, number of used features: 787
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training accuracy 1.0000
Testing accuracy 0.9763


### Load saved model

In [11]:
with open(os.path.join(SAVE_PATH, 'clf.pkl'), 'rb') as f_model:
    model_lightgbm = pickle.load(f_model)

### Eval with post processing

In [12]:
DF_val["pred_prob"] = model_lightgbm.predict_proba(X_val)[:, 1].tolist()
DF_val["pred"] = model_lightgbm.predict(X_val).tolist()

In [13]:
def post_process(df: pd.DataFrame, threshold = 0.25):
    # one value only links to one key but one key can link to many value
    # df['pred_prob'] = pred_prob
    df['is_linking'] = 0
    fnames = df.fname.unique().tolist()
    for fname in fnames:
        df_fname = df[df.fname==fname]
        v_ids = df_fname.v_id.unique().tolist()
        for v_id in v_ids:
            df_vid = df_fname[df_fname.v_id==v_id]
            idx_max = df_vid.pred_prob.idxmax()

            if df.loc[(df.fname==fname)&(df.v_id==v_id)&(df.index==idx_max), 'pred_prob'].values[0] >= threshold:
                df.loc[(df.fname==fname)&(df.v_id==v_id)&(df.index==idx_max), 'is_linking'] = 1
    return df

In [14]:
# Without post-processing
print("-----------===========precision_score", precision_score(DF_val["label"].values, DF_val["pred"].values, average=None))
print("-----------===========recall_score", recall_score(DF_val["label"].values, DF_val["pred"].values, average=None))
print("-----------===========f1_score", f1_score(DF_val["label"].values, DF_val["pred"].values, average=None))



In [15]:
# With post-processing threshold = 0.0
new_df = post_process(DF_val.copy(), 0)
print("-----------===========precision_score", precision_score(new_df["label"].values, new_df["is_linking"].values, average=None))
print("-----------===========recall_score", recall_score(new_df["label"].values, new_df["is_linking"].values, average=None))
print("-----------===========f1_score", f1_score(new_df["label"].values, new_df["is_linking"].values, average=None))



In [16]:
# With post-processing threshold = 0.25
new_df = post_process(DF_val.copy(), 0.25)
print("-----------===========precision_score", precision_score(new_df["label"].values, new_df["is_linking"].values, average=None))
print("-----------===========recall_score", recall_score(new_df["label"].values, new_df["is_linking"].values, average=None))
print("-----------===========f1_score", f1_score(new_df["label"].values, new_df["is_linking"].values, average=None))

