# Load Model

In [None]:
# !module load cuda/11.8
import os
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer
import json
import pandas as pd
from tqdm import tqdm
import gc
import numpy as np
# Local directory of the ChatGLM-6B checkpoint; a later cell reassigns this
# name to the ChatGLM2-6B directory.
llm_path = r"/workspace/LLM/chatglm-6b"


def load_ori_glm1(llm_path="/workspace/LLM/chatglm-6b"):
    """Load the original ChatGLM-6B model, 4-bit quantized, in eval mode on the GPU.

    Args:
        llm_path: Directory (or hub id) passed to ``from_pretrained``.

    Returns:
        The model after ``quantize(4).half().cuda().eval()``; its config asks
        the forward pass to return hidden states and attentions.
    """
    config = AutoConfig.from_pretrained(
        llm_path,
        trust_remote_code=True,
        output_hidden_states=True,
        output_attentions=True,
    )
    # The un-quantized variant is the same call without `.quantize(4)`.
    model = (
        AutoModel.from_pretrained(llm_path, config=config, trust_remote_code=True)
        .quantize(4)
        .half()
        .cuda()
    )
    # No tokenizer is loaded here: this function only returns the model, and
    # the previous tokenizer load was discarded without being used.
    return model.eval()
def load_ori_glm2(llm_path="/workspace/LLM/chatglm2-6b"):
    """Load the original ChatGLM2-6B model, 4-bit quantized, in eval mode on the GPU.

    Args:
        llm_path: Directory (or hub id) passed to ``from_pretrained``.

    Returns:
        The quantized, half-precision, CUDA-resident model in eval mode.
    """
    glm2_config = AutoConfig.from_pretrained(
        llm_path,
        trust_remote_code=True,
        output_hidden_states=True,
        output_attentions=True,
    )
    glm2 = AutoModel.from_pretrained(llm_path, config=glm2_config, trust_remote_code=True)
    # Same conversion chain as before, one step per line.
    glm2 = glm2.quantize(4)
    glm2 = glm2.half()
    glm2 = glm2.cuda()
    return glm2.eval()

def load_glm_checkpoint(checkpoint_path, llm_path):
    """Load a base GLM model and merge a P-Tuning prefix-encoder checkpoint into it.

    Args:
        checkpoint_path: Directory containing the ``pytorch_model.bin`` written
            by prefix tuning.
        llm_path: Directory (or hub id) of the base model.

    Returns:
        The merged model: 4-bit quantized, half precision, on the GPU, in eval
        mode, with the prefix encoder kept in float32.
    """
    config = AutoConfig.from_pretrained(
        llm_path,
        trust_remote_code=True,
        pre_seq_len=1024,
        output_hidden_states=True,
        output_attentions=True,
    )
    model = AutoModel.from_pretrained(llm_path, config=config, trust_remote_code=True)

    print("Parameter Merging!")
    # map_location="cpu": the base model is still on the CPU at this point, so
    # there is no reason to restore the checkpoint tensors onto whatever device
    # they were saved from (which would waste GPU memory or fail without one).
    prefix_state_dict = torch.load(
        os.path.join(checkpoint_path, "pytorch_model.bin"),
        map_location="cpu",
    )

    # Keep only the prefix-encoder weights and drop their module prefix so the
    # keys match the sub-module's own state dict.
    key_prefix = "transformer.prefix_encoder."
    new_prefix_state_dict = {
        k[len(key_prefix):]: v
        for k, v in prefix_state_dict.items()
        if k.startswith(key_prefix)
    }
    model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)

    print("Model Quantizationing!")
    model = model.quantize(4)
    model = model.half().cuda()
    # The prefix encoder is cast back to float32 after the model-wide half().
    model.transformer.prefix_encoder.float()
    model = model.eval()
    print("Model Loaded!")
    return model


def read_json(json_path):
    """Read a JSON Lines file and return its records as a list.

    Blank lines are skipped, and an empty file yields an empty list instead of
    raising. A short summary (record count and the keys of the first record)
    is printed.

    Args:
        json_path: Path to a UTF-8 (optionally BOM-prefixed) file with one JSON
            document per line.

    Returns:
        A list with one parsed object per non-blank line.
    """
    with open(json_path, "r", encoding="utf-8-sig") as json_file:
        json_list = [json.loads(line) for line in json_file if line.strip()]
    # Guard the summary: indexing [0] on an empty file used to raise IndexError.
    keys = list(json_list[0].keys()) if json_list else []
    print(f"json length:{len(json_list)}\njson keys:{keys}")
    return json_list


def get_mean_pooling_embedding(input_text, tokenizer, model):
    """Return the mean of the last element of ``outputs[2]`` over its first axis.

    The text is tokenized (truncated to 2048 tokens), moved to CUDA when
    available, and run through ``model`` without gradients. The tensor taken
    from ``outputs[2][-1]`` is summed over axis 0, the first row of the result
    is selected, and it is divided by the size of axis 0.

    Args:
        input_text: Text to embed.
        tokenizer: Callable returning a mapping of tensors for the model.
        model: Callable whose output exposes hidden states at index 2
            (assumed layout ``(tokens, 1, hidden)`` — TODO confirm per model).

    Returns:
        A 1-D tensor holding the pooled embedding.
    """
    def _release_memory():
        # Same pair of calls, in the same order, as used throughout this file.
        torch.cuda.empty_cache()
        gc.collect()

    _release_memory()
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoded = tokenizer(
        input_text,
        return_tensors="pt",
        add_special_tokens=True,
        return_attention_mask=True,
        truncation=True,
        max_length=2048,
    )
    encoded = {name: tensor.to(target) for name, tensor in encoded.items()}

    with torch.no_grad():
        _release_memory()
        model_outputs = model(**encoded)

    final_layer = model_outputs[2][-1]
    token_count = final_layer.shape[0]
    # Sum over the token axis, pick the single batch row, then average.
    pooled = torch.sum(final_layer, 0)[0] / token_count
    _release_memory()
    return pooled


In [None]:
# Drop the reference to the last embedding, then reclaim host and GPU memory.
embedding = None
for _reclaim in (gc.collect, torch.cuda.empty_cache):
    _reclaim()
gc.collect()

In [None]:
# Parameter tuning section
# checkpoint_path = r"workspace/ptuning/output/20230620_hackson_1000-6b-pt-1024-1e-2/checkpoint-{}"  # change as needed
# NOTE(review): the tokenizer is loaded from "/workspace/LLM" while the model
# below comes from `llm_path`; confirm that this directory really holds the
# matching tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("/workspace/LLM", trust_remote_code=True)
# -----------------------------
# merged_model = load_glm_checkpoint(checkpoint_path.format(str(400)))
# `load_local_glm` is not defined anywhere in this notebook; use the loader
# for the original ChatGLM-6B weights that this file does define.
merged_model = load_ori_glm1(llm_path)

# Vector store (ChatGLM)

In [None]:
import ast
import pandas as pd
# Check that the embedding CSV lines up row by row with its source CSV.
e_fee = pd.read_csv('/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_fee_embedding.csv')
# 1057180
fee = pd.read_csv('/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_fee.csv')

n_embedded = len(e_fee)
last_embedded = e_fee.iloc[n_embedded-1]
print("fee_embedding length: ", n_embedded)
# The embedding column stores a stringified list; parse it as a literal
# instead of executing file content with eval().
print("Last data dimension of fee_embedding: ", len(ast.literal_eval(last_embedded['embedding'])))
print("fee ID == fee_embedding ID: ", e_fee.iloc[:n_embedded]['Unnamed: 0'].tolist()==fee.iloc[:n_embedded]['Unnamed: 0'].tolist())
print('\nLast data of fee_embedding:\n', last_embedded)
print('\nSame data of fee:\n', fee.iloc[n_embedded-1])

In [None]:
# Store in csv file
# For each target CSV: skip the rows already embedded in earlier runs, make
# sure the "<target>_embedding.csv" output file exists, then walk the
# remaining rows.
# NOTE(review): the row loop currently stops at the `break` right after
# printing the first row, so no embedding is computed, and the line that
# appends to the output CSV is commented out — the cell looks like it was left
# in a debugging state.
import numpy as np
import faiss
from tqdm import tqdm
dir_path = '/cluster/home/lawrencechh.cs/cdf_dataset/'
target_df_paths = ['/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_fee.csv',
                   '/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_opinion.csv',
                   '/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_sub.csv']
# d = 4096
# res = faiss.StandardGpuResources()
# save_vector_store_steps = 5000
# Progress log of earlier runs (date, rows done / total rows):
# 20231231 2545/1057180
# 20240102 22812/1057180
# 20240105 40051/1031823
# Number of leading rows to skip; the same offset is applied to every CSV in
# target_df_paths.
embedding_df_length = 2545 + 22812 + 40051
# start_index is only referenced by commented-out code below.
start_index = embedding_df_length
for path in target_df_paths:
    print(path)
    df = pd.read_csv(path)[embedding_df_length:]
    # df = pd.read_csv(path)
    # df = pd.read_csv(path, nrows=100)

    # The first three columns are used as: basic id, category id, text to embed.
    basic_column, category_column, target_column_name = df.columns[0], df.columns[1], df.columns[2]
    output_df = pd.DataFrame(columns=[basic_column, category_column, 'embedding'])
    output_csv_path = f"/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_{target_column_name}_embedding.csv"
    # Create the output file with only a header row when it does not exist yet.
    if os.path.isfile(output_csv_path) == False:
        output_df.to_csv(output_csv_path, index=False)
    else:
        print("File exists.")
    # index_flat = faiss.IndexFlatL2(d)
    # gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

    for i in tqdm(range(len(df))):
        input_text = df.iloc[i][target_column_name]
        basic_id = df.iloc[i][basic_column]
        category_id = df.iloc[i][category_column]
        print(df.iloc[i])
        # Debug stop: everything below in this loop body is never reached.
        break
        # print(df.iloc[start_index+i])
        # print(len(input_text))
        embedding = get_mean_pooling_embedding(input_text, tokenizer, merged_model)
        # embedding = embedding.to("cpu").numpy().astype(np.float32).tolist()
        # The embedding is stored as the string form of a float32 list.
        embedding = str(embedding.to("cpu").numpy().astype(np.float32).tolist())
        # Row 0 is overwritten each iteration, so output_df holds one row at a time.
        output_df.loc[0] = {basic_column: basic_id, category_column: category_id, 'embedding': embedding}
        # embedding = np.array(embedding.cpu(), dtype=np.float32)
        # index_flat.add(np.array([embedding]))
        # gpu_index_flat.add(np.array([embedding]))


        # output_df.to_csv(output_csv_path, mode="a", index=False, header=False, encoding="utf-8-sig")


        # if i%5==0:
        #     output_df = pd.DataFrame(columns=[basic_column, category_column, 'embedding'])
            # faiss.write_index(faiss.index_gpu_to_cpu(gpu_index_flat), f"/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_{target_column_name}.bin")
            # faiss.write_index(index_flat, f"/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_{target_column_name}.bin")
        # Release the embedding and cached GPU memory before the next row.
        gc.collect()
        embedding = None
        torch.cuda.empty_cache()
        gc.collect()

    # faiss.write_index(faiss.index_gpu_to_cpu(gpu_index_flat), f"/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_{target_column_name}.bin")
    # faiss.write_index(index_flat, f"/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_{target_column_name}.bin")





## Test faiss

In [None]:
import numpy as np
# Smoke test: embed one short greeting and report the type that comes back.
input_text = "你好"
v = get_mean_pooling_embedding(input_text=input_text, tokenizer=tokenizer, model=merged_model)
print(type(v))
# Disabled experiment: store the same embedding 1000 times in a DataFrame and
# dump it to CSV.
# output_df = pd.DataFrame(columns=['embedding'])
# for i in range(1000):
#     # output_df.loc[len(output_df)] = {'embedding': v.detach().cpu().numpy()}
#     output_df.loc[len(output_df)] = {'embedding': v.cpu().numpy().astype(np.float32)}
    # output_df.loc[len(output_df)] = {'embedding': v.cpu()}
# output_df.to_csv('/cluster/home/lawrencechh.cs/cdf_dataset/test_embedding.csv', index_label='index')


In [None]:
import faiss # make faiss available
# Dimension
d = 4096
res = faiss.StandardGpuResources()
index_flat = faiss.IndexFlatL2(d)
# make it into a gpu index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
# `npv` was used below without ever being defined. Build it from the test
# embedding `v` produced by the previous cell, converted the same way the
# CSV-export code converts embeddings (CPU, float32 NumPy array).
npv = v.to("cpu").numpy().astype(np.float32)
for i in range(1000):
    gpu_index_flat.add(np.array([npv]))         # add vectors to the index
    # Checkpoint the index to disk every 500 additions (including i == 0).
    if i%500==0:
        faiss.write_index(faiss.index_gpu_to_cpu(gpu_index_flat), "/cluster/home/lawrencechh.cs/cdf_dataset/index.bin")
        print(gpu_index_flat.ntotal)
faiss.write_index(faiss.index_gpu_to_cpu(gpu_index_flat), "/cluster/home/lawrencechh.cs/cdf_dataset/index.bin")
print(gpu_index_flat.ntotal)


In [None]:
# Reload the saved index, move it to the GPU and keep appending to it.
# `res` and `npv` are expected to exist from the preceding cells.
index_path = "/cluster/home/lawrencechh.cs/cdf_dataset/index.bin"
index2 = faiss.read_index(index_path)
gpu_index_flat2 = faiss.index_cpu_to_gpu(res, 0, index2)
print(gpu_index_flat2.ntotal)
for i in range(1000):
    single_vector = np.array([npv])
    gpu_index_flat2.add(single_vector)
    if not i % 500:
        # Periodic checkpoint back to the same file.
        cpu_copy = faiss.index_gpu_to_cpu(gpu_index_flat2)
        faiss.write_index(cpu_copy, index_path)
        print(gpu_index_flat2.ntotal)
print(gpu_index_flat2.ntotal)


In [None]:
# Load the stored index and show how many vectors it holds.
# (The trailing, incomplete `index2.` expression was a syntax error that kept
# the whole cell from running; it has been removed.)
index2 = faiss.read_index("/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_sub.bin")
index2.ntotal

In [None]:
# # load quantized
# input_text = "你好"
# v = get_mean_pooling_embedding(input_text, tokenizer, merged_model)
# print(v)
# print(len(v))

# input_text = "天氣如何?"
# v = get_mean_pooling_embedding(input_text, tokenizer, merged_model)
# print(v)
# print(len(v))

In [None]:
# # load ori then quantized
# input_text = "你好"
# v = get_mean_pooling_embedding(input_text, tokenizer, merged_model)
# print(v)
# print(len(v))

# input_text = "天氣如何?"
# v = get_mean_pooling_embedding(input_text, tokenizer, merged_model)
# print(v)
# print(len(v))

## Store criminal opinions vector

In [None]:
import ast
import pandas as pd
import faiss
from tqdm import tqdm
import numpy as np

# Read target csv. The read used to be commented out, which made the cell fail
# with NameError on a fresh kernel; load it only when no earlier cell has.
if "eop_df" not in globals():
    eop_df = pd.read_csv('/workspace/CDB/cdb/static/1222_opinion_sentence_district_embedding.csv')
print(len(eop_df))

# Dimension
d = 4096
res = faiss.StandardGpuResources()
index_flat = faiss.IndexFlatL2(d)

# make it into a gpu index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
# Iterate the column directly (no per-row iloc lookups).
for raw_embedding in tqdm(eop_df['embedding']):
    # Each cell stores a stringified list: parse it as a literal instead of
    # eval(), and build the float32 array faiss works with (a plain np.array
    # of Python floats would be float64).
    tmp_embedding = np.array([ast.literal_eval(raw_embedding)], dtype=np.float32)
    # add vectors to the index
    gpu_index_flat.add(tmp_embedding)

faiss.write_index(faiss.index_gpu_to_cpu(gpu_index_flat), "/workspace/CDB/cdb/static/1222_opinion_sentence_district_index.bin")
print(gpu_index_flat.ntotal)

In [None]:
# Parse the first stored embedding and print its shape as a one-row batch.
teste = np.array([eval(eop_df.iloc[0]['embedding'])])
print(teste.shape)

# Data of Criminal Database

In [None]:
# Load both source tables and count the judgments they share.
b_ori = pd.read_csv('/cluster/home/lawrencechh.cs/cdf_dataset/output.csv')
c_ori = pd.read_csv('/cluster/home/lawrencechh.cs/cdf_dataset/20231204_dic_-001.csv')

print("output.csv: ", len(b_ori))
print("20231204_dic_-001.csv", len(c_ori))
# Rows of b_ori whose JID also appears as a case_num in c_ori.
shared_mask = b_ori['JID'].isin(c_ori['case_num'])
tmp_df = b_ori[shared_mask]
# tmp_df = c_ori[c_ori['case_num'].isin(b_ori['JID'])]
print("Same Jud: ", len(tmp_df))
for frame in (b_ori, c_ori):
    print(list(frame.columns))

In [None]:
# Load the merged basic data and look at rows that have no 'count' value.
merged_b = pd.read_csv('/gdrive/MyDrive/研究資料/高院/20231211_categoryID_basic_data.csv')
opinion_df = pd.read_csv('/gdrive/MyDrive/研究資料/高院/20231211_category_basic_data_merged_opinion.csv')
print(len(merged_b))
complete_rows = merged_b.dropna()
print(len(complete_rows))
missing_count_mask = merged_b['count'].isna()
tmp_df = merged_b[missing_count_mask]
print(len(tmp_df))
print(tmp_df.iloc[0])
# print(merged_b.columns)

In [None]:
# Show the columns of both tables, then the first row of each, separated by
# blank lines.
for frame in (merged_b, opinion_df):
    print(frame.columns)
print()
print(merged_b.iloc[0], end="\n\n")

print(opinion_df.iloc[0])

In [None]:
import ast
import pandas as pd
from tqdm import tqdm
# The read used to be commented out, which made the cell fail with NameError
# on a fresh kernel; load the table only when no earlier cell has.
if "merged_df" not in globals():
    merged_df = pd.read_csv('/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged.csv')
print('merged_df length: ', len(merged_df))
print('merged_df columns: ', merged_df.columns)

# The three columns before the last one hold stringified lists of targets.
target_columns = list(merged_df.columns[-4:-1])
print(target_columns)
for target_col in target_columns:
    target_list = merged_df[target_col].tolist()
    # Count list items over all string cells; non-string cells (e.g. NaN) are
    # skipped. Cells are parsed as literals rather than executed with eval().
    target_num = sum(
        len(ast.literal_eval(target))
        for target in tqdm(target_list)
        if isinstance(target, str)
    )
    print(f"{target_col} numbers: ", target_num)
    target_df_path = f'/cluster/home/lawrencechh.cs/cdf_dataset/20231211_category_basic_data_merged_{target_col}.csv'
    target_df = pd.read_csv(target_df_path)
    print(f"20231211_category_basic_data_merged_{target_col}.csv length: ", len(target_df))
    print(f'{target_col} numbers == merged_{target_col}.csv length: ', target_num==len(target_df))


# Extract 法官、檢察官 (附件前方)

In [None]:
from google.colab import drive
import pandas as pd
import re
from tqdm import tqdm
import random

# Mount Google Drive and load the merged basic-data table from it.
drive.mount('/gdrive')
dir_path = '/gdrive/MyDrive/研究資料/高院'
basic_data_csv = dir_path + '/20231211_categoryID_basic_data.csv'
merged_b = pd.read_csv(basic_data_csv)
print(merged_b.columns)

print('merged_b length', len(merged_b))
# Rows without any missing value.
fully_populated = merged_b.dropna()
print('matched data length', len(fully_populated))

## Extract prosecutor

In [None]:
# Shorten judgments length
# `extracted_length` is only referenced by commented-out code in this notebook.
extracted_length = []

def locating_judge_and_prosecutor_texts_lists(input_list):
    """Extract the closing section of each judgment with an ordered rule list.

    For every record the rules are tried in order. The first rule whose first
    match is at most 1000 characters long is accepted and the search stops for
    that record.

    Args:
        input_list: Sequence of ``(record_id, judgment_text)`` pairs.

    Returns:
        ``[matched_indices, empty_list]`` where ``matched_indices[rule_index]``
        is a list of ``[record_id, extracted_text_length, extracted_text]``
        and ``empty_list`` holds the positions (indices into ``input_list``)
        of the records for which no rule produced an accepted match.
    """
    max_extracted_length = 1000
    # NOTE(review): inside a character class '|' is a literal character, so
    # e.g. [斷|結] also matches '|'. The patterns are kept unchanged here.
    patterns = [r'據上論[斷|結].+?書記官.+?\r?\n?',
                r'作\s*成\s*本\s*判\s*決。.+?書記官.+?[ |\r]*\n',
                r'判決如主文。.+?書記官.+?\r?\n?',
                r'本案經檢察官.+?書記官.+?\r?\n?',
                r'應\s*予\s*駁\s*回\s*。.+?書記官.+?[ |\r]*\n',
                r'刑事第.+庭.+?書記官.+?[ |\r]*\n'
                ]
    # Compile once instead of once per record.
    compiled_patterns = [re.compile(pattern, re.DOTALL) for pattern in patterns]

    error_signs = ['\u3000', '\u30005', '\u3000111', '\u30004', '\u300011', '\u30003', '\u300024', '&nbsp;', '\u3000110', '\u300012', '\u300030', ' ']
    matched_indices = [[] for _ in range(len(patterns))]
    empty_list = []
    print('matched_indices', matched_indices)
    for i in tqdm(range(len(input_list))):
        input_list_id = input_list[i][0]
        jud = input_list[i][1]
        # Remove layout artefacts before matching.
        for sign in error_signs:
            jud = jud.replace(sign, '')
        for rule_index, pattern in enumerate(compiled_patterns):
            first_match = pattern.search(jud)
            if first_match is None:
                # Explicit "no match" check replaces the former bare except.
                continue
            extracted_text = first_match.group(0)
            if len(extracted_text) > max_extracted_length:
                # Too long to be the closing section; try the next rule.
                continue
            # Recorded counts with this rule set:
            # empty_list length: 68
            # rules match list length: [112850, 657, 1524, 1234, 38, 149]
            matched_indices[rule_index].append([input_list_id, len(extracted_text), extracted_text])
            break
        else:
            # No rule was accepted. Previously only the result of the *last*
            # rule was inspected, so a record whose last-rule match was over
            # the length limit ended up in neither list.
            empty_list.append(i)
    # [matched_indices[rule_index][[input_list_index, extracted_text_length, extracted_text],...], empty_list]
    return [matched_indices, empty_list]

# Pair every record id with its full judgment text and run the extraction.
record_ids = merged_b['Unnamed: 0'].tolist()
full_texts = merged_b['JFULL'].tolist()
juds_list = list(zip(record_ids, full_texts))
extracted_list = locating_judge_and_prosecutor_texts_lists(juds_list)

# Order each rule's matches from the longest extracted text to the shortest.
for rule_matched_list in extracted_list[0]:
    rule_matched_list.sort(key=lambda entry: entry[1], reverse=True)

print('empty_list length:', len(extracted_list[1]))
print('rules match list length:', list(map(len, extracted_list[0])))
# extracted_length = sorted(extracted_length, reverse=True)
# print(extracted_length[:10])


# Locate prosecutor
def locate_prosectuor_texts_lists(re_extracted_list):
    """Pick the prosecutor sentence out of each extracted closing section.

    The rules are tried in order on the text with '\\r', '\\n' and spaces
    removed; the first rule that matches wins. When no rule matches, the text
    preceding the first '中華民國' marker is kept instead.

    Args:
        re_extracted_list: Sequence of ``[record_id, text]`` pairs.

    Returns:
        ``[matched_list, not_matched_list]`` where ``matched_list[rule_index]``
        and ``not_matched_list`` both hold ``[record_id, length, text]``
        entries.
    """
    year_marker = '中華民國'
    # Each rule is [pattern, whether to match on the whitespace-free text].
    prosecutor_rules = [
        [r'判決如主文。.*?中華民國', True],
        [r'本*[案|件][經|由].+?。', True],
        # [112266, 413]
        # 171
    ]
    matched_list = [[] for _ in range(len(prosecutor_rules))]
    not_matched_list = []

    for data_index in tqdm(range(len(re_extracted_list))):
        re_extracted_list_index = re_extracted_list[data_index][0]
        raw_text = re_extracted_list[data_index][1]
        # Computed once per record instead of once per rule.
        compact_text = raw_text.replace('\r', '').replace('\n', '').replace(' ', '')

        for rule_index, rule in enumerate(prosecutor_rules):
            text = compact_text if rule[1] else raw_text
            re_text = re.findall(rule[0], text, re.DOTALL)
            if len(re_text) > 0:
                matched_list[rule_index].append([re_extracted_list_index, len(re_text[0]), re_text[0]])
                break
        else:
            # Fallback: keep what precedes the first marker that has at least
            # one character in front of it (same span as r'.+?中華民國').
            # Slicing removes exactly the marker; the former
            # str.strip('中華民國') stripped any of those four characters from
            # BOTH ends of the text.
            marker_position = compact_text.find(year_marker, 1)
            if marker_position == -1:
                # No marker at all: keep the whole text instead of raising.
                text_before_year = compact_text
            else:
                text_before_year = compact_text[:marker_position]
            not_matched_list.append([re_extracted_list_index, len(text_before_year), text_before_year])

    # These summary prints used to sit after the return and never ran.
    print('\n', [len(dlist) for dlist in matched_list])
    print(len(not_matched_list))
    return [matched_list, not_matched_list]
# Feed the matches of the first rule (record id + extracted text) to the
# prosecutor locator.
input_list = [
    [record_id, extracted_text]
    for record_id, _text_length, extracted_text in extracted_list[0][0]
]
second_extracted_list = locate_prosectuor_texts_lists(input_list)

In [None]:
# Test function: print the size of each nesting level of the result.
for probe in (extracted_list, extracted_list[0], extracted_list[0][0], extracted_list[0][1]):
    print(len(probe))
# Layout: [0 matched_list 1 empty_list][rule_index][data_index]
extracted_list[0][0][0]


# Re-run the prosecutor locator on the matches of the first rule.
input_list = [
    [record_id, extracted_text]
    for record_id, _text_length, extracted_text in extracted_list[0][0]
]
second_extracted_list = locate_prosectuor_texts_lists(input_list)

In [None]:
# Print the size of each nesting level, then display the first rule's matches.
second_matched = second_extracted_list[0]
print(len(second_extracted_list))
print(len(second_matched))
print(len(second_matched[0]))
print(len(second_matched[1][0]))
second_matched[0]

## NER extract names

In [None]:
import jieba
import jieba.posseg as psg
def find_prosecutor_names(input_list):
    """Collect the words jieba tags as person names in each text.

    Args:
        input_list: Sequence of entries whose item 0 is the record id and
            whose item 2 is the text to segment.

    Returns:
        A list of ``[record_id, text, names]`` entries, where ``names`` are
        the segmented words tagged 'nr' or 'nrfg', excluding the words listed
        in ``not_names``.
    """
    not_names = ['應依']
    name_flags = ('nr', 'nrfg')
    result_list = []
    for entry in tqdm(input_list):
        data_index, text = entry[0], entry[2]
        names = [
            token.word
            for token in psg.cut(text)
            if token.word not in not_names and token.flag in name_flags
        ]
        result_list.append([data_index, text, names])
    return result_list

# name_list = find_prosecutor_names(no_prosecutor)
# Extract names from the matches of the second prosecutor rule and show them.
second_rule_matches = second_extracted_list[0][1]
name_list = find_prosecutor_names(second_rule_matches)
name_list

## (Dropped) Shorten judgments length

In [None]:
# `extracted_length` is only referenced by commented-out code in this notebook.
extracted_length = []

def get_re_and_empty_list(src_df):
    """Extract the closing section of each judgment in ``src_df['JFULL']``.

    For every row the rules are tried in order. The first rule whose first
    match is at most 1000 characters long is accepted and the search stops for
    that row.

    Args:
        src_df: DataFrame with a ``JFULL`` column holding the judgment text.

    Returns:
        ``[matched_indices, empty_list]`` where ``matched_indices[rule_index]``
        is a list of ``[row_position, extracted_text_length, extracted_text]``
        and ``empty_list`` holds the row positions for which no rule produced
        an accepted match.
    """
    max_extracted_length = 1000
    # Results recorded for earlier rule sets (kept for reference):
    #   first 2 rules (ending in \n)  -> empty_list 22984, matches [92950, 595]
    #   first 4 rules (ending \r?\n?) -> empty_list 17903, matches [95560, 595, 1226, 1245]
    #   all 6 rules below             -> empty_list 17703, matches [95560, 657, 1226, 1209, 35, 139]
    #   noted next to the 1000-character check -> empty_list 68,
    #       matches [112850, 657, 1524, 1234, 38, 149]
    # An abandoned alternative matched two r'中華民國\d+年\d+月\d+日' dates and
    # took the text between them.
    # NOTE(review): inside a character class '|' is a literal character, so
    # e.g. [斷|結] also matches '|'. The patterns are kept unchanged here.
    patterns = [r'據上論[斷|結].+?書記官.+?\r?\n?',
                r'作\s*成\s*本\s*判\s*決。.+?書記官.+?[ |\r]*\n',
                r'判決如主文。.+?書記官.+?\r?\n?',
                r'本案經檢察官.+?書記官.+?\r?\n?',
                r'應\s*予\s*駁\s*回\s*。.+?書記官.+?[ |\r]*\n',
                r'刑事第.+庭.+?書記官.+?[ |\r]*\n'
                ]
    # Compile once instead of once per row.
    compiled_patterns = [re.compile(pattern, re.DOTALL) for pattern in patterns]

    # '\r' and '\n' are deliberately NOT removed: several rules anchor on them.
    error_signs = ['\u3000', '\u30005', '\u3000111', '\u30004', '\u300011', '\u30003', '\u300024', '&nbsp;', '\u3000110', '\u300012', '\u300030', ' ']
    matched_indices = [[] for _ in range(len(patterns))]
    empty_list = []
    print('matched_indices', matched_indices)
    # Read the column once; per-row `iloc[i]['JFULL']` builds a Series per row.
    for i, jud in enumerate(tqdm(src_df['JFULL'].tolist())):
        for sign in error_signs:
            jud = jud.replace(sign, '')
        for rule_index, pattern in enumerate(compiled_patterns):
            first_match = pattern.search(jud)
            if first_match is None:
                # Explicit "no match" check replaces the former bare except.
                continue
            extracted_text = first_match.group(0)
            if len(extracted_text) > max_extracted_length:
                # Too long to be the closing section; try the next rule.
                continue
            matched_indices[rule_index].append([i, len(extracted_text), extracted_text])
            break
        else:
            # No rule was accepted. Previously only the result of the *last*
            # rule was inspected, so a row whose last-rule match was over the
            # length limit ended up in neither list.
            empty_list.append(i)

    return [matched_indices, empty_list]

# Run the extraction over the merged basic data.
extracted_list = get_re_and_empty_list(merged_b)

# Order each rule's matches from the longest extracted text to the shortest.
for rule_matched_list in extracted_list[0]:
    rule_matched_list.sort(key=lambda entry: entry[1], reverse=True)

print('empty_list length:', len(extracted_list[1]))
print('rules match list length:', list(map(len, extracted_list[0])))
# extracted_length = sorted(extracted_length, reverse=True)
# print(extracted_length[:10])

## (Dropped) Find exact entity (檢察官, 法官)

In [None]:

# Script version of the prosecutor locator, applied to the matches of the
# first extraction rule. Each rule is [pattern, match on whitespace-free text].
prosecutor_rules = [
    [r'判決如主文。.*?中華民國', True],
    [r'本*[案|件][經|由].+?。', True],
    # [112266, 413]
    # 171
]
prosecutor = [[] for _ in range(len(prosecutor_rules))]
no_prosecutor = []
year_marker = '中華民國'
for data_index in tqdm(range(len(extracted_list[0][0]))):
    raw_text = extracted_list[0][0][data_index][2]
    # Computed once per record instead of once per rule.
    compact_text = raw_text.replace('\r', '').replace('\n', '').replace(' ', '')

    for rule_index, rule in enumerate(prosecutor_rules):
        text = compact_text if rule[1] else raw_text
        re_text = re.findall(rule[0], text, re.DOTALL)
        if len(re_text) > 0:
            prosecutor[rule_index].append([data_index, re_text[0]])
            break
    else:
        # Fallback: keep what precedes the first marker that has at least one
        # character in front of it (same span as r'.+?中華民國'). Slicing
        # removes exactly the marker; the former str.strip('中華民國') stripped
        # any of those four characters from BOTH ends of the text.
        marker_position = compact_text.find(year_marker, 1)
        if marker_position == -1:
            # No marker at all: keep the whole text instead of raising.
            text_before_year = compact_text
        else:
            text_before_year = compact_text[:marker_position]
        # no_prosecutor.append([data_index, extracted_list[0][0][data_index][2]])
        no_prosecutor.append([data_index, text_before_year])

print('\n', [len(dlist) for dlist in prosecutor])
print(len(no_prosecutor))


In [None]:
# Show the unmatched texts, longest first, as [length, index, text].
# sorted([[len(text[1]), text[0], text[1]] for text in prosecutor[0]], reverse=True)
sorted(([len(entry[1]), entry[0], entry[1]] for entry in no_prosecutor), reverse=True)
# sorted([[len(text[1]), text[0], text[1]] for text in prosecutor[1]], reverse=True)

In [None]:
檢察官黃怡君到庭
檢察官鄭仙杏提起公訴
檢察官謝志明、侯詠琪提起公訴，檢察官謝名冠到庭執行\r\n職務
檢察官洪國朝提起公訴，檢察官丑○○到庭執行職務。
檢察官許月雲到庭執行職務。
檢察官何宗霖提起公訴，檢察官吳祚延到庭執行職務。
檢察官王亮欽提起公訴，檢察官張慧瓊到庭執行職務。
檢察官曲鴻煜提起公訴，檢察官洪政和提起上訴，檢察官\r\n周穎宏到庭執行職務。\


檢察官劉孟昕提起公訴，檢察官羅建勛到庭執行職務。

檢察官顏郁山提起公訴，檢察官李松諺提起上訴，檢察官\r\n陳建弘到庭執行職務。
檢察官周欣蓓偵查起訴，由檢察官楊秀琴在本審到庭實行\r\n公訴。
檢察官賴穎穎偵查起訴，於檢察官張家維提起上訴後，由\r\n檢察官沈明倫在本審到庭實行公訴。

A案經檢察官陳昭蓉提起公訴，檢察官吳淑娟追加起訴及移送併\r\n辦；B案經檢察官朱啟仁提起公訴，均經檢察官蘇南桓到庭執行\r\n職務。

本案由檢察官陳映蓁、吳欣恩提起公訴，由檢察官王亞樵到庭執\r\n行職務。

本案由臺灣桃園地方檢察署檢察官鍾信一偵查起訴、檢察官蔡宜\r\n均上訴；臺灣高等檢察署檢察官楊四猛到庭執行職務。\r\n

[939,
 '據上論結，應依刑事訴訟法第369第1項前段、第364條、第30\r\n3條第5款、第307條，判決如主文。\r\n中華民國106年5月9日\r\n刑事第四庭審判長法官惠光霞\r\n法官王憲義\r\n法官李東柏\r\n以上正本證明與原本無異。\r\n檢察官如不服本判決應於收受本判決後10日內向本院提出上訴書\r\n狀，其未敘述上訴理由者，並得於提起上訴後10日內向本院補提\r\n理由書狀（均須按他造當事人之人數附繕本）「切勿逕送上級法\r\n院」。\r\n中華民國106年5月9日\r\n書記官洪']

In [None]:
# # Check re text over 1000
# over_1000 = []
# for rule_index in range(len(extracted_list[0])):
#   for data_index in tqdm(range(len(extracted_list[0][rule_index]))):
#     if extracted_list[0][rule_index][data_index][1]>1000:
#       over_1000.append(extracted_list[0][rule_index][data_index][0])
# print('\n', len(over_1000))

# # Check single text content
# rule_index = 5
# data_index = 35
# extracted_list[0][rule_index][data_index]

# Pick a random rule, then a random match of that rule, and display it.
rule_lists = extracted_list[0]
random_rule_index = random.choice(range(len(rule_lists)))
chosen_rule_matches = rule_lists[random_rule_index]
random_data_index = random.choice(range(len(chosen_rule_matches)))
print(f'random_rule_index: {random_rule_index}\nrandom_data_index: {random_data_index}\n')
chosen_rule_matches[random_data_index]

In [None]:
# Inspect the second unmatched judgment: print its id and source URLs, then
# display its text with layout artefacts removed.
target_index = extracted_list[1][1]
target_row = merged_b.iloc[target_index]
jud = target_row['JFULL']
target_jid = target_row['JID']
url = 'https://judgment.judicial.gov.tw/EXPORTFILE/reformat.aspx?type=JD&id={}&lawpara=&ispdf=1'.format(target_jid)
url2 = 'https://judgment.judicial.gov.tw/FJUD/data.aspx?ty=JD&id={}'.format(target_jid)
for shown in (target_jid, url, url2):
    print(shown)
error_signs = ['\u3000', '\u30005', '\u3000111', '\u30004', '\u300011', '\u30003', '\u300024', '&nbsp;', '\u3000110', '\u300012', '\u300030', '\r', '\n', ' ']
for sign in error_signs:
    jud = jud.replace(sign, '')
# jud.replace(' ', "").replace('\r', "").replace('\n', "")
jud

# larger500 = [length for length in extracted_length if length > 500]
# print(len(larger500))


# Patterns tried by hand against this text:
# pattern = r'據上論[斷|結].+?書記官.+?\n'
# pattern = r'作\s*成\s*本\s*判\s*決。.+?書記官.+\s*'
# pattern = r'應\s*予\s*駁\s*回\s*。.+?書記官.+\s*'
# pattern = r'刑事第.+庭.+?書記官.+?[ |\r]*\n'
# pattern = r'刑事第.+庭.+?書記官.+\s*'
# re.findall(pattern, jud, re.DOTALL)



# ChatGLM2

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoConfig

# Tokenizer from the full-precision repo, config and weights from the int4 repo.
int4_repo = "THUDM/chatglm2-6b-int4"
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
config = AutoConfig.from_pretrained(
    int4_repo,
    trust_remote_code=True,
    output_hidden_states=True,
    output_attentions=True,
)

# model = AutoModel.from_pretrained("THUDM/chatglm2-6b", config=config, trust_remote_code=True).half().cuda()
# model = model.eval()
model = AutoModel.from_pretrained(int4_repo, config=config, trust_remote_code=True)
model = model.cuda()

# One-turn chat as a smoke test.
response, history = model.chat(tokenizer, "你好", history=[])
print(response)

# Check 地院資料

In [None]:
from google.colab import drive
# Base directory of the database files on the mounted drive.
dir_path = '/gdrive/MyDrive/資料庫'
drive.mount('/gdrive')

In [None]:
import pandas as pd
# Load the district-court judgments and the opinion-sentence embeddings, then
# report how many rows each table has.
basic_df = pd.read_csv('/gdrive/MyDrive/資料庫/111地院判決書_無簡_全文_移除符號.csv')
opinion_df = pd.read_csv('/gdrive/MyDrive/資料庫/1222_opinion_sentence_district_embedding.csv')
basic_row_count = len(basic_df)
opinion_row_count = len(opinion_df)
print("111地院判決書_無簡_全文_移除符號.csv: ", basic_row_count)
print("1222_opinion_sentence_district_embedding.csv: ", opinion_row_count)
# c_ori = pd.read_csv('/cluster/home/lawrencechh.cs/cdf_dataset/20231204_dic_-001.csv')

# print("output.csv: ", len(b_ori))
# print("20231204_dic_-001.csv", len(c_ori))
# tmp_df = b_ori[b_ori['JID'].isin(c_ori['case_num'])]
# # tmp_df = c_ori[c_ori['case_num'].isin(b_ori['JID'])]
# print("Same Jud: ", len(tmp_df))
# print(list(b_ori.columns))
# print(list(c_ori.columns))

In [None]:
# Show the column index of both tables.
for label, frame in (('basic_df:', basic_df), ('opinion_df:', opinion_df)):
    print(label, frame.columns)

In [None]:
# Count the basic_df rows whose tar_JID also appears in opinion_df['JID'].
shared_jid_mask = basic_df['tar_JID'].isin(opinion_df['JID'])
print(len(basic_df[shared_jid_mask]))


# 20240119 Store_embedding

In [None]:
import pandas as pd
import os

# Sanity checks on the "111" tables: row counts, last row labels, and how many
# syllabus JIDs also appear in each of the other tables.
# NOTE(review): every read below is commented out, so this cell relies on the
# frames already being loaded in the session; on a fresh kernel the first
# print raises NameError.
# jud_date_df = pd.read_csv('/workspace/111資料/111判決書日期/111_date.csv')
# syllabus_df = pd.read_csv('/workspace/111資料/111判決書主文及去符號全文/111_main.csv')
# jud_full_df = pd.read_csv('/workspace/111資料/111判決書全文_含上級審與地院_有符號與無符號/18個檔案的地院111判決書_無簡_全文_未移除符號.csv')
# fee_df = pd.read_csv('/workspace/111資料/111判決書提出標註目標/0114_ft_paragraph_district_TARGET.csv')
# opinion_df = pd.read_csv('/workspace/111資料/111判決書提出標註目標/0114_op_sentence_district_TARGET.csv')
# sub_df = pd.read_csv('/workspace/111資料/111判決書提出標註目標/0114_sub_paragraph_district_TARGET.csv')
print('syllabus_df length:', len(syllabus_df))
print('jud_date_df length:', len(jud_date_df))
print('jud_full_df length:', len(jud_full_df))
# `.iloc[-1].name` is the index label of the last row.
print('fee_df length:', len(fee_df), '\nfee_df last data index', fee_df.iloc[-1].name)
print('opinion_df length:', len(opinion_df), '\nopinion_df last data index', opinion_df.iloc[-1].name)
print('sub_df length:', len(sub_df), '\nsub_df last data index', sub_df.iloc[-1].name)

# Number of syllabus rows whose JID is present in each other table.
print('Same JID between syllabus_df and jud_date_df:', len(syllabus_df[syllabus_df['JID'].isin(jud_date_df['JID'])]))
print('Same JID between syllabus_df and fee_df:', len(syllabus_df[syllabus_df['JID'].isin(fee_df['JID'])]))
print('Same JID between syllabus_df and opinion_df:', len(syllabus_df[syllabus_df['JID'].isin(opinion_df['JID'])]))
print('Same JID between syllabus_df and sub_df:', len(syllabus_df[syllabus_df['JID'].isin(sub_df['JID'])]))

# # Check if JID is unique
# list(set(fee_df['JID'].tolist()))



In [None]:
# Each entry is [csv path, number of leading rows to skip]; every offset
# starts at 0 here.
target_df_paths = [
    [csv_path, 0]
    for csv_path in (
        r'/workspace/111資料/111判決書提出標註目標/0114_op_sentence_district_TARGET.csv',
        r'/workspace/111資料/111判決書提出標註目標/0114_ft_paragraph_district_TARGET.csv',
        r'/workspace/111資料/111判決書提出標註目標/0114_sub_paragraph_district_TARGET.csv',
    )
]
len(target_df_paths)
target_df_paths[0][0]
# target_df_paths[0][1]

In [None]:

# Store in vector database
import numpy as np
# import faiss
from tqdm import tqdm
dir_path = '/cluster/home/lawrencechh.cs/cdf_dataset/'

# checkpoint_paths[k] is the checkpoint used to embed target_df_paths[k].
checkpoint_paths = [r'/workspace/111資料/20230620_724td_opinion_-6b-pt-token-1024-3e-3_0818/checkpoint-1200', 
                    r'/workspace/111資料/20230819_1070td_ft_-6b-pt-token-1024-3e-3_0818_2/checkpoint-1200', 
                    r'/workspace/111資料/if_sub_train_09262209_-6b-pt-token1024-2e-2/checkpoint-1500']

# Each entry is [source csv, number of leading rows to skip].  Skipped rows
# are not embedded again, and rows are appended to an already existing
# output file, so a non-zero offset continues an earlier, interrupted run.
target_df_paths = [[r'/workspace/111資料/111判決書提出標註目標/0114_op_sentence_district_TARGET.csv', 50102],
                   [r'/workspace/111資料/111判決書提出標註目標/0114_ft_paragraph_district_TARGET.csv', 0],
                   [r'/workspace/111資料/111判決書提出標註目標/0114_sub_paragraph_district_TARGET.csv', 0]]

llm_path = r'/workspace/LLM/chatglm2-6b'
# The tokenizer is loaded from the base model path only, so a single
# instance serves every checkpoint.
tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
merged_model = None
for df_index, (path, embedding_df_length) in enumerate(target_df_paths):
    # Drop the reference to the previous model *before* loading the next
    # checkpoint, so the old weights can be reclaimed first instead of
    # both models being alive during the load.
    merged_model = None
    gc.collect()
    torch.cuda.empty_cache()
    merged_model = load_glm_checkpoint(checkpoint_paths[df_index], llm_path)

    output_csv_path = path.split('.csv')[0] + '_embedding.csv'
    print(output_csv_path)
    df = pd.read_csv(path, encoding='utf-8-sig')[embedding_df_length:]

    # One-row scratch frame: the source columns plus 'embedding'.
    output_df = pd.DataFrame(columns=list(df.columns) + ['embedding'])

    if not os.path.isfile(output_csv_path):
        # New output file: write the header row once.
        output_df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    else:
        print("File exists, append data to existed file.")

    for i in tqdm(range(len(df))):
        tmp_data_dict = dict(df.iloc[i])
        input_text = tmp_data_dict['sentence']
        embedding = get_mean_pooling_embedding(input_text, tokenizer, merged_model)
        # The vector is stored as the string form of a float32 list.
        tmp_data_dict['embedding'] = str(embedding.to("cpu").numpy().astype(np.float32).tolist())

        # Append every row immediately so an interrupted run keeps all
        # rows finished so far.
        output_df.loc[0] = tmp_data_dict
        output_df.to_csv(output_csv_path, mode="a", index=False, header=False, encoding='utf-8-sig')

        # Release the per-row tensor and cached GPU blocks before the next row.
        del embedding
        gc.collect()
        torch.cuda.empty_cache()
        

    

# 20240119 Merge basic_csv and categories

## Load separate csv files

In [None]:
import pandas as pd
import os

# court_type_df = pd.read_csv('/workspace/111資料/111判決書院別/111判決書院別.csv')
# jud_date_df = pd.read_csv('/workspace/111資料/111判決書日期/111_date.csv')
# # case_kind and basic_info are in same paragraph
# basic_info_df = pd.read_csv('/workspace/111資料/111判決書案由與基本資料/data.csv')
# syllabus_df = pd.read_csv('/workspace/111資料/111判決書主文及去符號全文/111_main.csv')
# jud_full_df = pd.read_csv('/workspace/111資料/111判決書全文_含上級審與地院_有符號與無符號/18個檔案的地院111判決書_無簡_全文_未移除符號.csv')

# fee_df = pd.read_csv('/workspace/111資料/111判決書提出標註目標/0114_ft_paragraph_district_TARGET.csv')
# opinion_df = pd.read_csv('/workspace/111資料/111判決書提出標註目標/0114_op_sentence_district_TARGET.csv')
# sub_df = pd.read_csv('/workspace/111資料/111判決書提出標註目標/0114_sub_paragraph_district_TARGET.csv')

# Size and column names of each per-judgment table.
print('court_type_df length:', len(court_type_df))
print('court_type_df columns:', court_type_df.columns)
print('jud_date_df length:', len(jud_date_df))
print('jud_date_df columns:', jud_date_df.columns)
print('basic_info_df length:', len(basic_info_df))
print('basic_info_df columns:', basic_info_df.columns)
print('syllabus_df length:', len(syllabus_df))
print('syllabus_df columns:', syllabus_df.columns)
print('jud_full_df length:', len(jud_full_df))
print('jud_full_df columns:', jud_full_df.columns)

# Size and last index label of the fee / opinion / sub tables.  The fee line
# used to be labelled 'columns' although it prints the row count.
print('fee_df length:', len(fee_df), '\nfee_df last data index', fee_df.iloc[-1].name)
print('opinion_df length:', len(opinion_df), '\nopinion_df last data index', opinion_df.iloc[-1].name)
print('sub_df length:', len(sub_df), '\nsub_df last data index', sub_df.iloc[-1].name)

# Check whether each table lists exactly the same JIDs, in the same order,
# as jud_date_df.  The reference list is built once, outside the loop.
jud_date_jids = jud_date_df['JID'].tolist()
df_list = [[court_type_df, 'court_type_df'], [basic_info_df, 'basic_info_df'], [syllabus_df,'syllabus_df'], [jud_full_df, 'jud_full_df']] 
for tmp_list in df_list:
    print(f'jud_date_df JID == {tmp_list[1]} JID:', tmp_list[0]['JID'].tolist() == jud_date_jids)

# Number of syllabus_df rows whose JID also occurs in each other table.
print('Same JID between syllabus_df and jud_date_df:', len(syllabus_df[syllabus_df['JID'].isin(jud_date_df['JID'])]))
print('Same JID between syllabus_df and fee_df:', len(syllabus_df[syllabus_df['JID'].isin(fee_df['JID'])]))
print('Same JID between syllabus_df and opinion_df:', len(syllabus_df[syllabus_df['JID'].isin(opinion_df['JID'])]))
print('Same JID between syllabus_df and sub_df:', len(syllabus_df[syllabus_df['JID'].isin(sub_df['JID'])]))

# Full-text rows that have no entry in basic_info_df.
not_in_basic_info = jud_full_df[~jud_full_df['JID'].isin(basic_info_df['JID'])]
print('Numbers of JID not in basic_info_df: ', len(not_in_basic_info))
# # Check if JID is unique
# list(set(fee_df['JID'].tolist()))



## Merge several target columns to main_basic.csv

In [None]:
# `random` is needed by the spot check at the end of this cell; importing it
# here keeps the cell runnable on its own.
import random

# Attach the 'Pair' text of basic_info_df to every jud_date_df row.  The
# left merge keeps all jud_date_df rows and leaves NaN where basic_info_df
# has no matching JID.
# NOTE(review): if basic_info_df holds a JID more than once the merge emits
# extra rows and the DataFrame construction below fails on unequal column
# lengths -- confirm JID is unique there.
jud_date_merged = jud_date_df.merge(basic_info_df[['JID', 'Pair']], on='JID', how='left')
# # Check jud_date_merged['Pair] are correctly merged
# print('jud_date_merged length: ', len(jud_date_merged))
# inbasic = jud_date_merged[jud_date_merged['JID'].isin(basic_info_df['JID'])]
# print('inbasic length: ', len(inbasic))
# for i in range(20):
#     print(inbasic.iloc[random.choice(range(len(inbasic)))])
basic_info_list = jud_date_merged['Pair'].tolist()

# The columns are combined purely by position, so every source table must
# list the judgments in the same order as jud_date_df.
main_basic_df = pd.DataFrame({
                              # UID is the row position in jud_date_df.
                              'UID': list(range(len(jud_date_df))),
                              'JID': jud_date_df['JID'].tolist(),
                              'court_type': court_type_df['court'].tolist(),
                              'jud_date': jud_date_df['date'].tolist(),
                              'basic_info': basic_info_list,
                              'syllabus': syllabus_df['main'].tolist(),
                              'jud_full': jud_full_df['JFULL'].tolist(),
                              'jud_url': [f'https://judgment.judicial.gov.tw/FJUD/data.aspx?ty=JD&id={jid}' for jid in jud_date_df['JID']],
                              })
print('main_basic_df length: ', len(main_basic_df))

# Spot check: print 20 randomly chosen rows.
for i in range(20):
    # print(main_basic_df.iloc[random.choice(range(len(main_basic_df)))])
    random_data = main_basic_df.iloc[random.choice(range(len(main_basic_df)))]
    print(random_data)
    # print(random_data['JID'])
    # print(random_data['jud_url'])
# # Save DataFrame to csv
# main_basic_df.to_csv('/workspace/111資料/20240120_main_basic.csv', encoding='utf-8-sig', index=False)


## Add UID to category_csv

In [None]:
import pandas as pd
# main_basic_df = pd.read_csv('/workspace/111資料/20240120_main_basic.csv')
df_list = [fee_df, opinion_df, sub_df]
file_name = ['fee', 'opinion', 'sub']
for category, category_df in zip(file_name, df_list):
    # Look up the UID of every row through its JID (rows whose JID is not in
    # main_basic_df get NaN) and expose the unnamed first column as EID.
    uid_lookup = main_basic_df[['JID', 'UID']]
    tmp_merged = category_df.merge(uid_lookup, on='JID', how='left')
    tmp_merged = tmp_merged.rename(columns={'Unnamed: 0': 'EID'})

    distinct_jids = set(tmp_merged['JID'].tolist())
    rows_without_uid = tmp_merged[tmp_merged['UID'].isna()]
    print('tmp_merged length:', len(tmp_merged))
    print('Number of unique JID:', len(distinct_jids))
    print("Number of tmp_merged['UID'] is nan: ", len(rows_without_uid))
    print()

    tmp_merged.to_csv(f'/workspace/111資料/20240120_category_{category}.csv', encoding='utf-8-sig', index=False)
    # # # Check tmp_merged is correctly merged
    # for i in range(20):
    #     random_data = tmp_merged.iloc[random.choice(range(len(tmp_merged)))]
    #     print(random_data)
    #     # main_basic_df_tmp = main_basic_df[main_basic_df['JID']==random_data['JID']]
    #     # print('main_basic_df_tmp: ', main_basic_df_tmp.iloc[0]['JID'], main_basic_df_tmp.iloc[0]['UID'])
    #     # print('random_data', random_data['JID'], random_data['UID'])
    #     # print(main_basic_df_tmp.iloc[0]['JID']==random_data['JID'])
    #     # print(main_basic_df_tmp.iloc[0]['UID']==random_data['UID'])

In [None]:
# Check Saved csv
file_name = ['fee', 'opinion', 'sub']
for name in file_name:
    # Re-read what the previous cell wrote and report its size, the number
    # of distinct values in each id column, and the final row.
    tmp_df = pd.read_csv(f'/workspace/111資料/20240120_category_{name}.csv')
    print(f'File: {name}, Length: {len(tmp_df)}')
    for unique_label, id_column in (
        ('Unique JID length:', 'JID'),
        ('Unique UID length:', 'UID'),
        ('Unique EID length:', 'EID'),
    ):
        print(unique_label, len(set(tmp_df[id_column].tolist())))
    print('Last Data:\n')
    print(tmp_df.iloc[-1])
    print()


# 20240223 Remove data of (最高法院、高院) from fee and sub

In [None]:
import pandas as pd
import faiss

# Tables: the main table and the opinion table come from db_loaded/ directly;
# the sub and fee tables come from the '20240120 含高院、最高法院' sub-folder.
main_basic_df = pd.read_csv('/workspace/111資料/db_loaded/20240120_main_basic.csv')
opinion_df = pd.read_csv('/workspace/111資料/db_loaded/20240120_category_opinion.csv')
sub_df = pd.read_csv('/workspace/111資料/db_loaded/20240120 含高院、最高法院/20240120_category_sub.csv')
fee_df = pd.read_csv('/workspace/111資料/db_loaded/20240120 含高院、最高法院/20240120_category_fee.csv')

# FAISS indexes read from the matching *_embedding.bin files.
# NOTE(review): presumably vector i of each index belongs to row i of the
# table with the same category -- confirm against the code that built them.
opinion_flat = faiss.read_index('/workspace/111資料/db_loaded/0114_op_sentence_district_TARGET_embedding.bin')
fee_flat = faiss.read_index('/workspace/111資料/db_loaded/20240120 含高院、最高法院/0114_ft_paragraph_district_TARGET_embedding.bin')
sub_flat = faiss.read_index('/workspace/111資料/db_loaded/20240120 含高院、最高法院/0114_sub_paragraph_district_TARGET_embedding.bin')


In [None]:
def remove_indices_from_df_and_flat(basic_df, category_df, index_flat, category_name):
    """Drop rows whose court type names a high court or the supreme court.

    Every row of ``category_df`` is matched to ``basic_df`` through ``UID``
    to obtain its ``court_type``.  Rows whose court type contains '高等法院'
    or '最高法院' are removed from the table, and the vectors at the same row
    positions are removed from ``index_flat``.  Rows with no known court
    type (no matching UID, or a missing value) are kept.

    Args:
        basic_df: table with at least ``UID``, ``JID`` and ``court_type``;
            ``UID`` must be unique.
        category_df: table with ``JID``, ``sentence``, ``type`` and ``UID``.
            It is not modified.
        index_flat: object providing ``remove_ids(ids)`` and ``ntotal``
            (the notebook passes a FAISS index).  Modified in place.
            Assumes vector ``i`` corresponds to row ``i`` of
            ``category_df`` -- TODO confirm against the index builder.
        category_name: label used in the printed summary.

    Returns:
        The filtered table with ``EID`` renumbered from 0 and only the
        columns ``['EID', 'JID', 'sentence', 'type', 'UID']``.

    Raises:
        pandas.errors.MergeError: if ``UID`` is duplicated in ``basic_df``;
            the merge would then add rows and shift every row position away
            from its vector id.
    """
    import numpy as np

    # JID is left out of the right-hand side because category_df already
    # carries it.  The left merge keeps the row order of category_df and
    # yields a fresh 0..n-1 index, so row labels equal row positions.
    basic_columns = [column for column in basic_df.columns if column != 'JID']
    merged_df = category_df.merge(
        basic_df[basic_columns], on='UID', how='left', validate='many_to_one'
    )
    # merged_df.rename(columns={'sentence': category_name}, inplace=True)

    # na=False: a missing court type counts as "not a higher court" instead
    # of putting NaN into the boolean mask.
    is_removed = merged_df['court_type'].str.contains('高等法院|最高法院', na=False)
    # remove_ids is given a plain int64 numpy array of row positions.
    removed_indices = np.flatnonzero(is_removed.to_numpy(dtype=bool)).astype(np.int64)

    merged_df = merged_df.loc[~is_removed].reset_index(drop=True)
    merged_df['EID'] = merged_df.index

    print('merged_df length:', len(merged_df))

    # Remove the vectors of the dropped rows.
    if removed_indices.size:
        index_flat.remove_ids(removed_indices)
    print(f'{category_name}_flat length:', index_flat.ntotal)

    # Keep only certain columns ['EID', 'JID', 'sentence', 'type', 'UID']
    merged_df = merged_df[['EID', 'JID', 'sentence', 'type', 'UID']]

    # # Save
    # merged_df.to_csv(f'/workspace/111資料/db_loaded/20240225_category_{category_name}.csv', encoding='utf-8-sig', index=False)
    # faiss.write_index(index_flat, f"/workspace/111資料/db_loaded/20240225_embedding_{category_name}.bin")
    return merged_df

# Filter the sub and fee tables against main_basic_df.  sub_flat and fee_flat
# are changed in place: the function calls remove_ids on the index it is given.
remove_indices_from_df_and_flat(main_basic_df, sub_df, sub_flat, 'sub')
remove_indices_from_df_and_flat(main_basic_df, fee_df, fee_flat, 'fee')

In [None]:
# Spot check: read the 20240225 sub index and table back from disk and print
# the first table row.
nc_flat = faiss.read_index('/workspace/111資料/db_loaded/20240225_embedding_sub.bin')
nc_df = pd.read_csv('/workspace/111資料/db_loaded/20240225_category_sub.csv')

# print(set(nc_df['court_type']))
# print(fee_flat.ntotal)
# print(nc_flat.ntotal)

# print(fee_flat.reconstruct(108))
# print(nc_flat.reconstruct(0))
# print(fee_df.iloc[108])
print(nc_df.iloc[0])

# 20240225 RE basic info 

In [66]:
import pandas as pd
from tqdm import tqdm
import re
# main_basic_df = pd.read_csv('/workspace/111資料/db_loaded/20240120_main_basic.csv')
main_basic_df = pd.read_csv('/workspace/111資料/db_loaded/20240228_main_basic.csv')
print('main_basic_df length:', len(main_basic_df))

# Keep only the rows that have a basic_info_20240120 value.
has_basic_info = main_basic_df['basic_info_20240120'].notna()
nona_df = main_basic_df[has_basic_info]
print('nona_df length:', len(nona_df))
print()

# Show the first such row and its full judgment URL.
first_row = nona_df.iloc[0]
print(first_row)
print(first_row['jud_url'])


main_basic_df length: 24970
nona_df length: 21445

Unnamed: 0                                                             2
UID                                                                    2
JID                                        IPCM,110,刑智上更(一),2,20220126,2
court_type                                                     智慧財產及商業法院
jud_date                                                        20220126
basic_info_20240120    智慧財產及商業法院刑事判決\n110年度刑智上更(一)字第2號\n上訴人\n即被告昱盛國際企...
syllabus                                                       \n一、原判決關於
jud_full               智慧財產及商業法院刑事判決  \r\n110年度刑智上更(一)字第2號\r\n上  訴  人...
jud_url                https://judgment.judicial.gov.tw/FJUD/data.asp...
Name: 2, dtype: object
https://judgment.judicial.gov.tw/FJUD/data.aspx?ty=JD&id=IPCM,110,刑智上更(一),2,20220126,2


In [15]:
# new_basic_df = main_basic_df.copy()
# new_basic_df.rename(columns={'basic_info': 'basic_info_20240120'}, inplace=True)
# print(new_basic_df.iloc[-1])
# new_basic_df.to_csv('/workspace/111資料/db_loaded/20240228_main_basic.csv', encoding='utf-8-sig')
# new_basic_df = pd.read_csv('/workspace/111資料/db_loaded/20240228_main_basic.csv')
# print(new_basic_df.iloc[-1])


Unnamed: 0                                                         24969
UID                                                                24969
JID                                            TPSM,111,台非,72,20220526,1
court_type                                                          最高法院
jud_date                                                        20220526
basic_info_20240120    最高法院刑事判決111年度台非字第72號\n上訴人最高檢察署檢察總長\n被告康惠嵐\n\n\...
syllabus                                               \n原判決撤銷。\n本件免訴。\n
jud_full               最高法院刑事判決　　　　　　　　　　 111年度台非字第72號\r\n上　訴　人　最高檢察署...
jud_url                https://judgment.judicial.gov.tw/FJUD/data.asp...
Name: 24969, dtype: object


In [67]:
# Get case_num, basic_info, case_type
def get_re_multi_matched_list(input_text_list):
    """Extract (case number, party info, case type) with a cascade of regexes.

    The patterns are tried in order against each text (``re.search`` with
    ``re.DOTALL``); the first pattern that matches wins and the remaining
    ones are skipped for that text.

    Args:
        input_text_list: sequence of ``[df_id, text]`` entries.

    Returns:
        ``[matched_indices, empty_list]`` where

        * ``matched_indices[k]`` lists, for pattern ``k``, one entry
          ``[position, df_id, groups, group_lengths]`` per text it matched
          (``position`` is the index into ``input_text_list``, ``groups``
          the captured strings, ``group_lengths`` their lengths);
        * ``empty_list`` holds ``[position, df_id, text]`` for every text
          that no pattern matched.
    """
    patterns = [

        r'(.+?號)\n(上訴人.+?)\n.+?因(.+?)等?案件',
        r'(.+?號)\n(.+?)上列.+因(.+?)等?案件',
        r'(.+?號)\n(.+?)上列?被告因?(.+)等?',
        r'(.+?號)\n(.+?)上?下?列?開?.+?違反(.+)等?案?件?',
        r'(.+?號)\n(.+?)因(.+)等?案?件?',

    ]
    # Compile once instead of handing the raw pattern to re.search per text.
    compiled_patterns = [re.compile(pattern, re.DOTALL) for pattern in patterns]

    matched_indices = [[] for _ in patterns]
    empty_list = []
    print('matched_indices', matched_indices)

    for input_list_order, entry in enumerate(tqdm(input_text_list)):
        df_id = entry[0]
        input_text = entry[1]

        for rule_index, compiled in enumerate(compiled_patterns):
            search_matched = compiled.search(input_text)
            if search_matched:
                groups = search_matched.groups()
                matched_indices[rule_index].append(
                    [input_list_order, df_id, groups, [len(group) for group in groups]]
                )
                break
        else:
            # No pattern matched.  Using for/else also keeps this correct
            # when the pattern list is empty.
            empty_list.append([input_list_order, df_id, input_text])

    return [matched_indices, empty_list]

# Pair every row label of nona_df with its basic-info text.
input_texts = [
    [row_label, text]
    for row_label, text in nona_df['basic_info_20240120'].items()
]
result_list = get_re_multi_matched_list(input_texts)

# result_list is [matches per rule, unmatched rows].
per_rule_matches, unmatched_rows = result_list
matched_length = list(map(len, per_rule_matches))
print('matched:', matched_length)
print('empty:', len(unmatched_rows))



matched_indices [[], [], [], [], []]


100%|██████████| 21445/21445 [00:02<00:00, 7482.55it/s] 

matched: [3527, 17884, 27, 4, 3]
empty: 0





In [None]:
# Check empty list
count = 0
matched_list, empty_list = [], []
# Candidate pattern tried on the rows that no rule matched.
retry_pattern = re.compile(r'(.+?號)\n(.+?)因(.+)等罪案件', re.DOTALL)
for text in result_list[1]:
    print(text)
    # input_text = text[1].replace('\n', '')
    # matched = re.search(r'(.+號)\n(.+?)\n違反(.+)件', input_text, re.DOTALL)
    # matched = re.search(r'(.+?號)\n(.+?)因(.+)等?案?件?', input_text, re.DOTALL)
    # text is [position, df_id, raw text]; the raw text is what gets searched.
    matched = retry_pattern.search(text[2])
    print(matched)
    if matched is None:
        empty_list.append(text)
    else:
        matched_list.append(matched)
    # count+=1
    # if count>50:
    #     break
for shown, length_label in (
    (matched_list, 'matched_list length: '),
    (empty_list, 'empty_list length: '),
):
    print(shown)
    print(length_label, len(shown))

[]
matched_list length:  0
[]
empty_list length:  0


In [None]:
# Show result according to rule_index and group_index
# For every rule and every capture group, rank that rule's matches by the
# length of the group (longest first) and print the longest length.  Very
# long captures point at texts where a pattern captured too much.
group_length = 3
rule_length = len(result_list[0])

sorted_result = []
for rule_index in range(rule_length):
    rule_matches = result_list[0][rule_index]
    ranked_per_group = []
    for group_index in range(group_length):
        # Each match entry stores its group lengths at position 3.
        ranked = sorted(rule_matches, key=lambda x: x[3][group_index], reverse=True)
        ranked_per_group.append(ranked)
        if ranked:
            print(f'rule_index:{rule_index}, group_index:{group_index}, {ranked[0][3][group_index]}')
        else:
            # A rule without any match has no longest entry to report.
            print(f'rule_index:{rule_index}, group_index:{group_index}, no matches')
    sorted_result.append(ranked_per_group)

rule_index:0, group_index:0, 189
rule_index:0, group_index:1, 36
rule_index:0, group_index:2, 23
rule_index:1, group_index:0, 67
rule_index:1, group_index:1, 660
rule_index:1, group_index:2, 1230
rule_index:2, group_index:0, 26
rule_index:2, group_index:1, 302
rule_index:2, group_index:2, 15
rule_index:3, group_index:0, 25
rule_index:3, group_index:1, 1
rule_index:3, group_index:2, 11
rule_index:4, group_index:0, 27
rule_index:4, group_index:1, 34
rule_index:4, group_index:2, 6


In [69]:
# Store new csv according to original csv copy
# Work on a copy so main_basic_df itself stays untouched.
output_basic_df = main_basic_df.copy()
for rule_list in result_list[0]:
    print('rule_list:', len(rule_list))
    for data in rule_list:
        # data is [position, df_id, groups, group_lengths]; df_id is the
        # row label the values are written to.
        df_index = data[1]
        matched_groups = data[2]
        new_data_dict = {
            # Newlines inside the case number / party info become spaces.
            'case_num': matched_groups[0].replace('\n', ' '),
            'basic_info': matched_groups[1].replace('\n', ' '),
            'case_type': matched_groups[2],
        }
        for key in new_data_dict:
            output_basic_df.loc[df_index, key] = new_data_dict[key]
# output_basic_df.to_csv('/workspace/111資料/db_loaded/20240228_main_basic.csv', encoding='utf-8-sig')

rule_list: 3527
rule_list: 17884
rule_list: 27
rule_list: 4
rule_list: 3


In [84]:
# Check output data
main_basic_df = pd.read_csv('/workspace/111資料/db_loaded/20240228_main_basic.csv')
import random
row_total = len(main_basic_df)
print(row_total)
# Print ten randomly chosen rows, each followed by a separator line.
for _ in range(10):
    picked_position = random.choice(range(row_total))
    print(main_basic_df.iloc[picked_position])
    print('----------------------')

24970
Unnamed: 0.1                                                         127
Unnamed: 0                                                           127
UID                                                                  127
JID                                           TPSM,111,台上,125,20220119,1
court_type                                                          最高法院
jud_date                                                        20220119
basic_info_20240120    最高法院刑事判決111年度台上字第125號\n上訴人林益德\n選任辯護人洪千惠律師\n上列上...
syllabus                                                       \n上訴駁回。\n
jud_full               最高法院刑事判決　　　　　　　　　　111年度台上字第125號\r\n上　訴　人　林益德\r...
jud_url                https://judgment.judicial.gov.tw/FJUD/data.asp...
case_num                                           最高法院刑事判決111年度台上字第125號
basic_info                                                        上訴人林益德
case_type                                                     違反毒品危害防制條例
Name: 127, dtype: object
--------------------

# 20240229 CDB data numbers

In [100]:
import pandas as pd
import faiss
# Current tables: main table (20240228), opinion table (20240120) and the
# 20240225 sub / fee tables.
main_basic_df = pd.read_csv('/workspace/111資料/db_loaded/20240228_main_basic.csv')
opinion_df = pd.read_csv('/workspace/111資料/db_loaded/20240120_category_opinion.csv')
sub_df = pd.read_csv('/workspace/111資料/db_loaded/20240225_category_sub.csv')
fee_df = pd.read_csv('/workspace/111資料/db_loaded/20240225_category_fee.csv')
# The 20240120 fee / sub tables from the '含高院、最高法院' folder, loaded for
# size comparison in the next cell.
old_fee_df = pd.read_csv('/workspace/111資料/db_loaded/20240120 含高院、最高法院/20240120_category_fee.csv')
old_sub_df = pd.read_csv('/workspace/111資料/db_loaded/20240120 含高院、最高法院/20240120_category_sub.csv')

# FAISS indexes for the opinion, fee and sub categories.
opinion_flat = faiss.read_index('/workspace/111資料/db_loaded/0114_op_sentence_district_TARGET_embedding.bin')
fee_flat = faiss.read_index('/workspace/111資料/db_loaded/20240225_embedding_fee.bin')
sub_flat = faiss.read_index('/workspace/111資料/db_loaded/20240225_embedding_sub.bin')

In [96]:
# Row counts of the tables and vector counts of the indexes, printed in a
# fixed order.
for count_label, value in (
    ('main_basic_df: ', len(main_basic_df)),
    ('opinion_df: ', len(opinion_df)),
    ('fee_df: ', len(fee_df)),
    ('sub_df: ', len(sub_df)),
    ('opinion_flat', opinion_flat.ntotal),
    ('fee_flat', fee_flat.ntotal),
    ('sub_flat', sub_flat.ntotal),
    ('old_fee_df: ', len(old_fee_df)),
    ('old_sub_df: ', len(old_sub_df)),
):
    print(count_label, value)

separator = '-------------'
print(separator)
print('main_basic_df data 0: ', main_basic_df.iloc[0])
print(separator)


main_basic_df:  24970
opinion_df:  113341
fee_df:  1988
sub_df:  470
opinion_flat 113341
fee_flat 1988
sub_flat 470
old_fee_df:  164183
old_sub_df:  80323
-------------
main_basic_df data 0:  Unnamed: 0.1                                                           0
Unnamed: 0                                                             0
UID                                                                    0
JID                                          IPCM,109,刑智上重訴,4,20220127,7
court_type                                                     智慧財產及商業法院
jud_date                                                        20220127
basic_info_20240120                                                  NaN
syllabus               \n原判決撤銷。\n何建廷、王永銘、聯華電子股份有限公司犯如附表一所示之罪，\n各處附表一所...
jud_full               智慧財產及商業法院刑事判決\r\n                             ...
jud_url                https://judgment.judicial.gov.tw/FJUD/data.asp...
case_num                                                             NaN
basic

In [111]:
# Reload the 20240228 main table from disk, replacing the in-memory copy.
main_basic_df = pd.read_csv('/workspace/111資料/db_loaded/20240228_main_basic.csv')


In [113]:
# main_basic_df.sort_values(by='jud_date', inplace=True)
# print(main_basic_df.tail(10))
# print(main_basic_df.head(10))
# print(len(main_basic_df[main_basic_df['jud_date']>20220531]))
# Count the rows whose jud_date is at most 20220103; jud_date is compared as
# a plain integer (the printed rows show values such as 20220126).
print(len(main_basic_df[main_basic_df['jud_date']<=20220103]))



66
