## 데이터 불러오기, 전처리

In [9]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import nltk

In [10]:
# 데이터 불러오기
df=pd.read_csv('filtered_df.csv')

# 주제묶음 데이터
subject=pd.read_csv('주제별묶음.csv')

# label 데이터
label_df=pd.read_csv('combined_file.csv')

In [11]:
# df
subject.rename(columns={'Unnamed: 0': '법령명한글'}, inplace=True)
df = pd.merge(df, subject, on='법령명한글', how='left')


# label_df
label_df = pd.merge(label_df, subject, left_on='label', right_on='법령명한글', how='left')
label_df.drop(columns=['법령명한글'], inplace=True)
label_df.head(2)


# category 
categories = ['가정법률', '교통/운전', '국가 및 지자체', '국방/보훈', '근로/노동', '금융/금전', '무역/출입국', '문화/여가생활',
              '민·형사소송', '복지', '부동산/임대차', '사회안전/범죄', '소비자', '아동·청소년/교육', '정보통신/기술', '환경/에너지', '미포함']

def subject_labels(row):
    return ', '.join([category for category in categories if category in row and row[category] == 1])

df['subject'] = df.apply(subject_labels, axis=1)
label_df['subject'] = df.apply(subject_labels, axis=1)

# 최종 데이터
df=df[['Contents','법령명한글','법령명영문','keyword Contents','extract_keywords','subject']]
label_df=label_df[['sentence','label','subject']]

# df['subject'].str.contains('미포함').sum()/df.shape[0] -> 55% 가 미포함
# label_df['subject'].str.contains('미포함').sum()/label_df.shape[0] -> 53% 가 미포함

## tf-idf 전처리

In [12]:
# 전처리 과정
def preprocess_text(text):
    # lowercasing
    lowercased_text=text.lower()

    # cleaning
    import re
    cleaned_text = re.sub(r'\n', ' ', lowercased_text)  #  \n  제거
    cleaned_text = re.sub(r'[^0-9a-zA-Z\s]', '', cleaned_text)  # 영어,숫자 제외 제거
    cleaned_text = cleaned_text.strip()

    #tokenization
    from nltk.tokenize import word_tokenize
    tokenized_sentence= word_tokenize(cleaned_text)

    # stop words
    from nltk.corpus import stopwords
    stopwords=set(stopwords.words('english'))
    stopwords_remove= [word for word in tokenized_sentence if word not in stopwords]

    # stemming # 결과가 좋지 않아 적용 x
    #from nltk.stem import PorterStemmer
    #ps=PorterStemmer()
    #stemmed_text=[ps.stem(word) for word in stopwords_remove]

    return ' '.join(stopwords_remove)

def preprocessing(corpus):
    # tqdm 적용
    tqdm.pandas()
    # stemming 컬럼 추가
    corpus['Stemming'] = corpus['keyword Contents'].progress_apply(preprocess_text)
    return corpus

result=preprocessing(df)
# result

100%|██████████| 83450/83450 [01:02<00:00, 1327.93it/s]


In [13]:
law=df[['법령명한글','법령명영문','Contents','Stemming','extract_keywords','subject']]

## tf-idf 추천시스템

In [14]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer(min_df=1, analyzer='word', max_features=2000, stop_words='english') #featue 값:33023 -> max_feature=2000
tfidf_matrix = vectorizer.fit_transform(law['Stemming'])
# tfidf_matrix.shape # (83450, 33023) #featue 값: 22752(porterstemmer)->33023

# top 20 법령명 추출
def find_most_similar_content(input_keyword, top_n=20):
    input_vector = vectorizer.transform([input_keyword])
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    results = law.iloc[top_indices][['법령명한글','subject']]
    return results, input_vector.toarray(), cosine_similarities

# 결과 내 top 2 법령명 추출
def find_top_law_names(results, top_law_n=2):
    top_law_names = results['법령명한글'].value_counts().nlargest(top_law_n).index.tolist()
    return top_law_names


# 추출된 top2 내의 코사인 유사도
def find_final_similar_content(input_keyword, top_law_names, top_n=20):
    law_filtered = law[law['법령명한글'].isin(top_law_names)]
    tfidf_matrix_filtered = vectorizer.transform(law_filtered['Stemming'])
    input_vector = vectorizer.transform([input_keyword])
    cosine_similarities = cosine_similarity(input_vector, tfidf_matrix_filtered).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    final_results = law_filtered.iloc[top_indices]
    return final_results, cosine_similarities[top_indices]

# input sentence
input_keyword = input('situation explaining: ')

# result
initial_results, keyword_vector, cosine_similarities = find_most_similar_content(input_keyword)
top_law_names = find_top_law_names(initial_results)
final_results, final_cosine_similarities = find_final_similar_content(input_keyword, top_law_names)

# result print
# DataFrame
result_df = final_results.copy()
result_df['코사인 유사도'] = final_cosine_similarities
result_df['situation sentence'] = input_keyword
result_df[['situation sentence','법령명한글', 'Contents', '코사인 유사도','subject']]

Unnamed: 0,situation sentence,법령명한글,Contents,코사인 유사도,subject
38324,"involved in a car accident, and I need legal assistance to handle the situation",생활보호법,"Article 4 (Basic Principles of Assistance) (1) The basic principle of assistance under this Act shall be that assistance will be provided on the condition that the person to receive assistance makes his best effort to maintain and improve his standard of living by using his assets and abilities, and assistance provided under this Act shall supplement the efforts of the individual.\n (2) Assistance from the responsible supporter and assistance under other Acts and subordinate statutes shall be given prior to assistance under this Act: Provided, That where the assistance agency recognizes an urgent situation as determined by the Presidential Decree, it may provide assistance under this Act to persons receiving assistance from the responsible supporter or assistance under other Acts and subordinate statutes until the situation is alleviated.",0.377914,미포함
78001,"involved in a car accident, and I need legal assistance to handle the situation",해양사고의 조사 및 심판에 관한 법률,"(1) The judgment on an inquiry shall be made only through oral pleadings: Provided, That oral pleadings are not required for judgment in any of the following cases:\n 1. Where a person involved in a marine accident fails to make an appearance on the inquiry date without any justifiable ground;\n 2. Where a person involved in a marine accident files a written pleading with permission from the presiding judge;\n 3. Where the presiding judge deems it unnecessary to summon persons involved in a marine accident in order to discover the cause of the accident on any ground, such as where oral pleadings by persons involved in a marine accident are unnecessary because investigations conducted by investigators into the accident are sufficient;\n 4. Where a case is examined by summary inquiry under Article 41-3.\n (2) In cases falling under paragraph (1) 3, oral pleadings shall not be omitted contrary to an intention expressly manifested by a person involved in the relevant marine accident.\n [This Article Wholly Amended on Jun. 15, 2011]",0.372171,미포함
77988,"involved in a car accident, and I need legal assistance to handle the situation",해양사고의 조사 및 심판에 관한 법률,"(1) Where an investigator files a request for an inquiry pursuant to Article 38, he/she shall designate a person deemed relevant to the cause of the occurrence of a marine accident as a person involved in the marine accident.\n (2) Where an investigator designates a person involved in a marine accident pursuant to paragraph (1), he/she shall notify the details thereof to the person involved in the marine accident, as prescribed by Presidential Decree. <Amended on Feb. 18, 2020>\n [This Article Wholly Amended on Dec. 29, 2009]\n [Title Amended on Feb. 18, 2020]",0.358009,미포함
38328,"involved in a car accident, and I need legal assistance to handle the situation",생활보호법,"(1) The types of assistance under this Act shall be as follows:\n 1. Livelihood Assistance;\n 2. Medical Assistance;\n 3. Self-sufficiency Assistance;\n 4. Education Assistance;\n 5. Childbirth Assistance; and\n 6. Funeral Assistance.\n (2) Assistance under this Act shall be any one of the types listed in paragraph (1) above, or two or more types according to the needs of the person to receive assistance.\n (3) The medical assistance under paragraph (1) 2 shall be in accordance with the provisions of other Acts.",0.310081,미포함
77973,"involved in a car accident, and I need legal assistance to handle the situation",해양사고의 조사 및 심판에 관한 법률,"Article 30 (Selection of Tribunal-Appointed Inquiry Counsels) (1) Where a person involved in a marine accident is not represented by an inquiry counsel in any of the following cases, the competent Tribunal shall select an inquiry counsel (hereafter the same shall apply in this Article) ex officio, within the budgetary limits, from among persons registered pursuant to Article 28 (2):\n 1. Where the person involved in a marine accident is a minor;\n 2. Where the person involved in a marine accident is not less than 70 years of age;\n 3. Where the person involved in a marine accident has an impediment in hearing or speech;\n 4. Where the person involved in a marine accident is suspected to have a mental disorder.\n (2) Where a person involved in a marine accident cannot afford an inquiry counsel due to poverty or any other situation, the competent Tribunal may appoint an inquiry counsel for such person at his/her request, within the budgetary limits.\n (3) Where a Tribunal deems it necessary to protect the rights of a person involved in a marine accident, with consideration given to the age, intelligence, and educational background of the person, it may appoint an inquiry counsel for the person, within the budgetary limits. In such cases, the appointment of an inquiry counsel shall not be contrary to an intention expressly manifested by the person involved in a marine accident.\n (4) Necessary matters concerning the management of Tribunal-appointed inquiry counsels, including the selection of inquiry counsels under paragraphs (1) through (3), shall be prescribed by Ordinance of the Ministry of Oceans and Fisheries. <Amended on Mar. 23, 2013>\n [This Article Newly Inserted on Jun. 15, 2011]",0.30938,미포함
78002,"involved in a car accident, and I need legal assistance to handle the situation",해양사고의 조사 및 심판에 관한 법률,"Any presiding judge shall verify the identity of a person involved in a marine accident by questioning his/her name, resident registration number and address, and by questioning the type, etc. of licenses if the person involved in a marine accident is a ship officer or a pilot.\n [This Article Wholly Amended on Dec. 29, 2009]",0.303813,미포함
38345,"involved in a car accident, and I need legal assistance to handle the situation",생활보호법,"(1) Where a ward's income, financial situation, or ability to work changes, the assistance agency may alter the type and method of assistance on its own authority or in accordance with an application from the ward, his relative, or other related person.\n (2) Written notification with a clear statement of the reasons for the alteration of assistance under paragraph (1) shall be given to the ward.",0.292804,미포함
38346,"involved in a car accident, and I need legal assistance to handle the situation",생활보호법,"(1) Where a ward falls under any one of the following subparagraphs, the assistance agency may suspend all or part of the assistance:\n 1. Where all or part of the assistance provided to a ward becomes unnecessary; and\n 2. Where the ward refuses all or part of the assistance.\n (2) Deleted. <by Act No. 5836, Feb. 8, 1999>\n (3) The provisions of shall apply mutatis mutandis to the cases in paragraph (1) above. <Amended by Act No. 5836, Feb. 8, 1999>",0.291114,미포함
38349,"involved in a car accident, and I need legal assistance to handle the situation",생활보호법,"(1) Where the head of an assistance facility is commissioned by the assistance agency to provide assistance to a person to receive assistance, he may not refuse in the absence of reasonable grounds.\n (2) The head of an assistance facility shall provide assistance at a level of or above the minimum standard of living as determined by the Minister of Health and Welfare. <Amended by Act No. 5360, Aug. 22, 1997>\n (3) The head of an assistance facility shall not discriminate on the basis of gender, creed, or social status in providing assistance.\n (4) The head of an assistance agency may not force persons to participate in religious activities.",0.282268,미포함
38342,"involved in a car accident, and I need legal assistance to handle the situation",생활보호법,"(1) Where necessary for the determination or provision of assistance, the assistance agency may have a related civil servant investigate the financial condition or health of a ward or person to receive assistance, or have the ward or person to receive assistance undergo a medical examination at a medical institution designated by the assistance agency.\n (2) The civil servant conducting the investigation under the provisions of paragraph (1) shall carry a certificate indicating his authority and present it to the person concerned.\n (3) Where the ward or person to receive assistance refuses, disrupts, or avoids the investigation under the provisions of paragraph (1), or does not follow the doctor’s instructions after a medical examination, the assistance agency may dismiss the application for assistance or cancel a decision to provide assistance to a person to receive assistance, and may change the type or method of assistance, halt or suspend assistance to ward. In this case, the provisions of shall apply mutatis mutandis.",0.282046,미포함


## 평가지표

In [19]:
# 평가 함수 정의
# label데이터의 문장들과 추천시스템 결과와의 매칭
# 정답 label 과 일치하는 경우 1 로 하여, 일치컬럼/전체컬럼의 비율

def evaluate_accuracy(label_df, top_n=20):
    correct_predictions = 0
    total_predictions = label_df.shape[0]
    ox_list=[]

    for index, row in label_df.iterrows():
        input_keyword = row['sentence']
        true_subjects = row['subject'].split(', ')
        
        results, input_vector, cosine_similarities = find_most_similar_content(str(input_keyword))
        predicted_subjects = results['subject'].unique()
        
        if any(true_subject in predicted_subjects for true_subject in true_subjects):
            correct_predictions += 1
            ox_list.append(1)
        else:
            ox_list.append(0)

    # check 컬럼을 생성하여 1,0 값 넣기
    label_df['check']=ox_list
    accuracy = correct_predictions / total_predictions
    return accuracy

# 일치여부 계산
accuracy = evaluate_accuracy(label_df)
print(f'Accuracy: {accuracy:.2f}') # 0.66 # 일치여부가 66% 정도

# label_df.to_csv('final.csv',index=False)

Accuracy: 0.66


### mAP 계산

In [24]:
# ap 값 계산
def calculate_ap(checks):
    relevant_documents = 0
    precision_sum = 0.0
    
    for i, check in enumerate(checks):
        if check == 1:
            relevant_documents += 1
            precision = relevant_documents / (i + 1)
            precision_sum += precision
    
    if relevant_documents == 0:
        return 0.0
    
    return precision_sum / relevant_documents
ap=calculate_ap(label_df['check'])

print(f'mAP: {ap:.4f}')

mAP: 0.6731


In [26]:
subject_ap = {}
all_subjects = set(subject for subjects in label_df['subject'] for subject in subjects.split(', '))

for subject in all_subjects:
    subject_checks = label_df[label_df['subject'].str.contains(subject)]['check'].tolist()
    ap = calculate_ap(subject_checks)
    subject_ap[subject] = ap

# 결과 출력
for subject, ap in subject_ap.items():
    print(f'Subject: {subject}, AP: {ap:.4f}')

Subject: , AP: 0.6731
Subject: 정보통신/기술, AP: 0.2481
Subject: 교통/운전, AP: 0.4940
Subject: 소비자, AP: 0.2624
Subject: 무역/출입국, AP: 0.3944
Subject: 민·형사소송, AP: 0.4311
Subject: 문화/여가생활, AP: 0.3499
Subject: 금융/금전, AP: 0.3420
Subject: 아동·청소년/교육, AP: 0.4671
Subject: 부동산/임대차, AP: 0.4326
Subject: 미포함, AP: 0.9959
Subject: 국방/보훈, AP: 0.0000
Subject: 복지, AP: 0.4411
Subject: 환경/에너지, AP: 0.4227
Subject: 가정법률, AP: 0.4374
Subject: 근로/노동, AP: 0.5323
Subject: 국가 및 지자체, AP: 0.0192
Subject: 사회안전/범죄, AP: 0.3743
