<a href="https://colab.research.google.com/github/J-Jaehyun-SEO/Project_Jeohui/blob/main/(1)_Jeohui_data_merge%26preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### (0) Install

In [None]:
# 필요한 패키지 설치
!pip install --upgrade matplotlib kiwipiepy pandas tqdm nltk gensim scikit-learn flashtext konlpy xlsxwriter
!pip install numpy==1.23.5

# Nanum 폰트 설치
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm -rf ~/.cache/matplotlib

# 라이브러리 임포트
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from collections import Counter
import itertools
import nltk
from nltk import collocations
from flashtext import KeywordProcessor
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy.spatial.distance import pdist, squareform
from itertools import combinations
from operator import itemgetter
import networkx as nx

# NLTK 다운로드
nltk.download('punkt')

## (1) DATA LOADING

###데이터 1 - 조선, 동아 1954-1999

In [None]:
# Read the 1954-1999 Chosun/Donga CSV and discard the helper columns left
# over from an earlier sentence-splitting pass in a single chained expression.
old_news_df = pd.read_csv('/content/저희_조선동아_1954_1999_문장분리.csv').drop(
    columns=['index', 'sents', '저희_sents', '저희_str', '저희_문장_index']
)
old_news_df

In [None]:
import pandas as pd
import re
from kiwipiepy import Kiwi

# Kiwi morphological analyzer (used below for sentence splitting)
kiwi = Kiwi()

# Accumulates (sentence, original row index) tuples for sentences containing '저희'
sentences_with_jeohee = []

# Sentence-delimiter character classes.
# NOTE(review): only split_chars_2 is referenced in the code shown; split_chars looks unused.
split_chars = r'[!?。.．。!？]'
split_chars_2 = r'[.!?]'

# Move the current index into an 'index' column so each row carries an id.
# NOTE(review): this mutates old_news_df in place, so re-running the cell
# adds another index column instead of being idempotent.
old_news_df.reset_index(inplace=True)

# Remove whitespace between '저' and '희' in the 'text' column
old_news_df['text'] = old_news_df['text'].str.replace(r'저\s+희', '저희', regex=True)

# Collect every sentence that mentions '저희', tagged with the id of the article row.
for _, row in old_news_df.iterrows():
    # Normalize CJK / full-width sentence terminators to their ASCII forms so
    # that split_chars_2 can split on them. The exclamation-mark step used to
    # replace ASCII '!' with itself, so full-width '！' was never normalized.
    text = (
        row['text']
        .replace('。', '.')
        .replace('．', '.')
        .replace('！', '!')
        .replace('？', '?')
    )

    # Split on the unified terminators and keep only sentences containing '저희'
    sentences = re.split(split_chars_2, text)
    for sentence in sentences:
        if '저희' not in sentence:
            continue
        if sentence.count('저희') >= 3:
            # Three or more hits suggests several sentences glued together:
            # let Kiwi split the fragment and keep each part that has '저희'.
            for kiwi_sentence in kiwi.split_into_sents(sentence):
                if '저희' in kiwi_sentence.text:
                    sentences_with_jeohee.append((kiwi_sentence.text.strip(), row['index']))
        else:
            sentences_with_jeohee.append((sentence.strip(), row['index']))

# Build a frame with one row per extracted '저희' sentence
new_df_jeohee = pd.DataFrame(sentences_with_jeohee, columns=['jeohee_sentence', 'original_index'])

# Count how many times '저희' occurs in each sentence
new_df_jeohee['jeohee_count'] = new_df_jeohee['jeohee_sentence'].apply(lambda x: x.count('저희'))

# Placeholder columns for the preceding / following sentence
new_df_jeohee['previous_sentence'] = ''
new_df_jeohee['next_sentence'] = ''

# Attach the neighbouring sentences from the source article
for index, row in new_df_jeohee.iterrows():
    # Look up the article text by the stored row id
    original_text = old_news_df.loc[row['original_index'], 'text']
    jeohee_index = row['original_index']

    # NOTE(review): this guard looks always true, since the value is read from the 'index' column.
    if jeohee_index is not None:
        # The article text is re-split on ASCII terminators only; the
        # full-width normalization done during extraction is not repeated here.
        original_sentences = re.split(split_chars_2, original_text)

        # Locate the sentence by substring match; the first matching segment wins
        jeohee_sentence_index = None
        for i, s in enumerate(original_sentences):
            if row['jeohee_sentence'] in s:
                jeohee_sentence_index = i
                break

        if jeohee_sentence_index is not None:
            # Preceding sentence, when there is one
            if jeohee_sentence_index > 0:
                new_df_jeohee.loc[index, 'previous_sentence'] = original_sentences[jeohee_sentence_index - 1].strip()

            # Following sentence, when there is one
            if jeohee_sentence_index < len(original_sentences) - 1:
                new_df_jeohee.loc[index, 'next_sentence'] = original_sentences[jeohee_sentence_index + 1].strip()

# Fallback for rows that received neither a previous nor a next sentence
for index, row in new_df_jeohee.iterrows():
    if row['previous_sentence'] == '' and row['next_sentence'] == '':
        # Let Kiwi split the sentence itself; the first part becomes the
        # "previous" context and the remaining parts are joined as "next".
        kiwi_sentences = kiwi.split_into_sents(row['jeohee_sentence'])
        if len(kiwi_sentences) > 1:
            new_df_jeohee.loc[index, 'previous_sentence'] = kiwi_sentences[0].text.strip()
            new_df_jeohee.loc[index, 'next_sentence'] = '. '.join([s.text for s in kiwi_sentences[1:]]).strip()

# Bring the article-level columns onto every sentence row
new_df_jeohee = new_df_jeohee.merge(old_news_df, left_on='original_index', right_on='index', suffixes=('', '_original'))

# Drop the join keys; later cells can no longer refer to 'index' / 'original_index'
new_df_jeohee.drop(columns=['index', 'original_index'], inplace=True)

# Re-process rows whose sentence has two or more '저희' hits or is 200+ characters long
for index, row in new_df_jeohee[ (new_df_jeohee['jeohee_count'] >= 2) | (new_df_jeohee['jeohee_sentence'].str.len() >= 200) ].iterrows():
    sentence = row['jeohee_sentence']
    kiwi_sentences = kiwi.split_into_sents(sentence)
    new_sentences = [s.text for s in kiwi_sentences]

    # Rebuild the (previous, target, next) triple from the Kiwi split
    previous_sentence = ''
    jeohee_sentence = ''
    next_sentence = ''

    # Only the FIRST sub-sentence containing '저희' is kept; later ones are dropped
    for i, s in enumerate(new_sentences):
        if '저희' in s:
            jeohee_sentence = s.strip()
            if i > 0:
                previous_sentence = new_sentences[i-1].strip()
            if i < len(new_sentences) - 1:
                next_sentence = new_sentences[i+1].strip()
            break

    # Overwrites the earlier context; if no sub-sentence matched, all three become ''
    new_df_jeohee.at[index, 'previous_sentence'] = previous_sentence
    new_df_jeohee.at[index, 'jeohee_sentence'] = jeohee_sentence
    new_df_jeohee.at[index, 'next_sentence'] = next_sentence

# Refresh 'jeohee_count' after the sentences were rewritten
new_df_jeohee['jeohee_count'] = new_df_jeohee['jeohee_sentence'].apply(lambda x: x.count('저희'))

# Inspect the result
new_df_jeohee.head()


In [None]:
# Distribution of '저희' counts per sentence (kept for inspection)
jeohee_count_stats = new_df_jeohee['jeohee_count'].value_counts()
# Keep only sentences with exactly one '저희'
new_df_jeohee = new_df_jeohee[new_df_jeohee['jeohee_count'] == 1]
# NOTE(review): every remaining row has jeohee_count == 1, so this sort key is constant.
new_df_jeohee = new_df_jeohee.sort_values('jeohee_count',ascending=False)

In [None]:
# Punctuation normalization helper
def replace_punctuation(text):
    """Return ``text`` with CJK / full-width sentence terminators mapped to ASCII.

    Mapping: '。' and '．' become '.', '！' becomes '!', '？' becomes '?'.
    ASCII punctuation is left unchanged. The previous version replaced ASCII
    '!' and '?' with themselves, so full-width '！' was never converted.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    for wide_mark, ascii_mark in (('。', '.'), ('．', '.'), ('！', '!'), ('？', '?')):
        text = text.replace(wide_mark, ascii_mark)
    return text

# Add a punctuation-normalized 'context' column.
# The article columns are merged in only when the join key is still present.
# In a top-to-bottom run an earlier cell has already merged them and dropped
# 'index' / 'original_index', so merging unconditionally raised a KeyError.
if 'original_index' in new_df_jeohee.columns:
    new_df_jeohee = new_df_jeohee.merge(
        old_news_df, left_on='original_index', right_on='index', suffixes=('', '_original')
    )
    # Drop the join keys once the merge is done
    new_df_jeohee = new_df_jeohee.drop(columns=['index', 'original_index'])

# 'context' is the '저희' sentence with full-width punctuation normalized
new_df_jeohee['context'] = new_df_jeohee['jeohee_sentence'].apply(replace_punctuation)


저희 개별 문장 추출

In [None]:
# Function to extract sentences containing a specific keyword
def extract_sentences(text, word):
    """Return the '.'-delimited sentences of ``text`` that contain ``word``.

    Matching is case-insensitive. Each kept sentence is stripped, terminated
    with a single '.', and the sentences are joined with one space. The
    previous version joined already-terminated sentences with '. ', which
    produced doubled periods such as 'a.. b.'.

    Args:
        text: Text to scan; it is split on ASCII '.'.
        word: Keyword a sentence must contain to be kept.

    Returns:
        The kept sentences as one string, or '' when nothing matches.
    """
    needle = word.lower()
    matches = [part.strip() for part in text.split('.') if needle in part.lower()]
    return ' '.join(f'{sentence}.' for sentence in matches)

# NOTE(review): this cell needs `new_df` with a 'context' column. No earlier
# cell in this notebook assigns `new_df`; it is built in a later cell
# ("Training 데이터 편집 및 추출"), so a top-to-bottom run fails here.

# Extract the sentences of each context that contain '저희'
new_df['extracted_sentences'] = new_df['context'].apply(lambda x: extract_sentences(x, '저희'))
new_df

2개 나란히 배열된 중복 문장 확인

In [None]:
new_df['duplicates'] = new_df['extracted_sentences'].duplicated(keep=False)


연속해서 저희가 와서 두 문장이 중복되면, 문장을 분할해서 한 문장만 남기기.

In [None]:
# When two consecutive rows carry the same extracted text, give the first
# '.'-delimited sentence to the earlier row and the remainder to the later row.
# NOTE(review): `.loc[i]` / `.loc[i-1]` are label lookups, so this assumes
# new_df has a default 0..n-1 index — confirm.
for i in range(1, len(new_df)):
    if new_df.loc[i, 'duplicates'] and new_df.loc[i-1, 'extracted_sentences'] == new_df.loc[i, 'extracted_sentences']:
        split_sentences = new_df.loc[i, 'extracted_sentences'].split('.')
        if split_sentences:
            # Earlier row keeps only the first sentence
            new_df.loc[i-1, 'extracted_sentences'] = split_sentences[0] + '.'
            if len(split_sentences) > 1:
                # Later row keeps everything after the first sentence
                new_df.loc[i, 'extracted_sentences'] = '.'.join(split_sentences[1:]) + '.'

In [None]:
# 중복된 값 찾기
duplicates = new_df['extracted_sentences'].duplicated(keep=False)

# 중복된 값의 개수 세기
num_duplicates = duplicates.sum()

print(num_duplicates)

In [None]:
new_df_unique = new_df.drop_duplicates(subset=['extracted_sentences'], keep='first')
new_df_unique = new_df_unique[new_df_unique['extracted_sentences'] != '.']
new_df_unique
new_df=new_df_unique

###데이터 2- 조선, 동아 1990-2024(0525)



In [None]:
recent_news_df = pd.read_excel('/content/bigkinds/bigkinds_JOSEON_DONGA.xlsx')

In [None]:
#recent_news_df = recent_news_df.drop(columns=['URL'])
# Year = first four characters of 'Published Date' (assumes that column holds strings — TODO confirm)
recent_news_df['year'] = recent_news_df['Published Date'].str[:4]
# Article text = title and body joined by a space
recent_news_df['text'] = recent_news_df['Title'] + ' ' + recent_news_df['Body']
# Drop the source columns that are no longer needed
recent_news_df=recent_news_df.drop(columns=['URL', 'Category','Published Date','Title','Body'])
# Align the column name and publisher codes with the older corpus
recent_news_df.rename(columns={'Newspaper': 'publisher'}, inplace=True)
recent_news_df['publisher'] = recent_news_df['publisher'].replace({'동아일보': 'donga', '조선일보': 'chosun'})
# Force string dtype; missing values become the literal string 'nan'
recent_news_df['text'] = recent_news_df['text'].astype(str)

In [None]:
import re

# Collect every character in the corpus that is not a Latin letter, digit,
# Hangul, whitespace or a CJK ideograph/radical, to see which special symbols occur.
special_char_regex = r'[^a-zA-Z0-9ㄱ-ㅎㅏ-ㅣ가-힣\s\u2E80-\u2EFF\u31C0-\u31EF\u3200-\u32FF\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]'

special_chars = []
# The findall/extend step used to sit outside the loop body, so only the
# last row's text was ever scanned. Scan every row instead.
for text in recent_news_df['text']:
    special_chars.extend(re.findall(special_char_regex, text))

# De-duplicate
unique_special_chars = list(set(special_chars))

print(unique_special_chars)

In [None]:
new_df_jeohee_2 = new_df_jeohee
new_df_jeohee_2

####중복 값 검증 및 처리

In [None]:
# 중복된 값 찾기
duplicates = new_df_jeohee_2['jeohee_sentence'].duplicated(keep=False)

# 중복된 값의 개수 세기
num_duplicates = duplicates.sum()

print(num_duplicates)

중복되는 행들간에 값이 완전히 겹치면 하나만 남기고 제거하고, 총 몇개를 제거했는지 프린트



In [None]:
# 모든 열의 값이 완전히 겹치는 경우 제거
before_count = len(new_df_jeohee_2)
new_df_jeohee_2.drop_duplicates(subset=new_df_jeohee_2.columns, inplace=True)
after_count = len(new_df_jeohee_2)

# 제거된 행 개수 출력
print("제거된 행 개수:", before_count - after_count)

중에서 jeohee_sentence jeohee_count previous_sentence next_sentence 까지 겹치고 나머지가 다른 경우에는 중복 값을 제거

In [None]:
# 'jeohee_sentence', 'jeohee_count', 'previous_sentence', 'next_sentence' 컬럼 기준으로 중복 제거
before_count = len(new_df_jeohee_2)
new_df_jeohee_2.drop_duplicates(subset=['jeohee_sentence', 'jeohee_count', 'previous_sentence', 'next_sentence'], inplace=True)
after_count = len(new_df_jeohee_2)

# 제거된 행 개수 출력
print("제거된 행 개수:", before_count - after_count)

In [None]:
import pandas as pd

# Group the duplicated rows by 'jeohee_sentence' and list, per group, the
# distinct (sentence, count, previous, next) combinations.
# NOTE(review): `duplicate_sentences` is only assigned in a later cell
# ("중복행 검증"), so this cell fails on a fresh top-to-bottom run.
diff_dfs = []
for sentence, group in duplicate_sentences.groupby('jeohee_sentence'):
    if len(group) > 1:
        diff_df = group[['jeohee_sentence', 'jeohee_count', 'previous_sentence', 'next_sentence']].drop_duplicates()
        diff_df['duplicate_group'] = sentence
        diff_dfs.append(diff_df)

# Combine the per-group differences into one frame for display
result_df = pd.concat(diff_dfs)
result_df

전체 df 에서 위와 같이 중복을 검증해보고, 차이나는 글자수가 50자 이내인 경우, 중복되는 값중 더 글자 수가 작은 값을 제거하고 한 개만 남김

In [None]:
import pandas as pd

def remove_duplicates_by_char_count(df):
    """Drop repeated ``jeohee_sentence`` rows from ``df`` in place.

    One row is kept per distinct sentence: its first occurrence. The earlier
    implementation grouped rows by the exact sentence text and then compared
    sentence lengths inside each group against a 50-character threshold.
    Rows in such a group always have identical text, so the length difference
    was always 0 and every row after the first was removed. This version
    makes that behaviour explicit, drops all rows in a single pass instead of
    once per group, and no longer depends on the tie order of a sort.

    NOTE(review): the surrounding notebook text describes merging
    near-duplicates that differ by up to 50 characters; exact-text grouping
    cannot do that. Confirm whether fuzzy matching was intended.

    Args:
        df: DataFrame with a 'jeohee_sentence' column. It is modified in place.

    Returns:
        The same ``df`` object, after the duplicate rows were dropped.
    """
    before_count = len(df)

    sentences = df['jeohee_sentence']
    # groupby() skips NaN keys, so rows with a missing sentence were never
    # deduplicated; keep that behaviour by excluding them from the mask.
    is_repeat = sentences.duplicated(keep='first') & sentences.notna()
    df.drop(index=df.index[is_repeat], inplace=True)

    after_count = len(df)
    print("제거된 행 개수:", before_count - after_count)
    return df


# 함수 호출
new_df_jeohee_2 = remove_duplicates_by_char_count(new_df_jeohee_2)
new_df_jeohee_2


중복행 검증

In [None]:
# 중복된 값 찾기
duplicates = new_df_jeohee_2['jeohee_sentence'].duplicated(keep=False)

# 중복된 값만 필터링
duplicate_sentences = new_df_jeohee_2[duplicates]

# 중복된 값이 포함된 DataFrame 출력
duplicate_sentences

###두 데이터 통합해 처리

In [None]:
new_df_jeohee_1 = recent_news_df
new_df_jeohee_2 = old_news_df

In [None]:
new_df_jeohee_1

In [None]:
new_df_jeohee_2

In [None]:
# 두 데이터프레임 합치기

# 'year' 컬럼을 문자열로 변환
new_df_jeohee_1['year'] = new_df_jeohee_1['year'].astype(str)
new_df_jeohee_2['year'] = new_df_jeohee_2['year'].astype(str)

# 두 데이터프레임을 concat으로 병합
merged_df = pd.concat([new_df_jeohee_1, new_df_jeohee_2]).reset_index(drop=True)

merged_df

In [None]:
#저회 -> 저희
import re

# Pattern for the misspelling '저회', with optional whitespace in between
pattern = r"저\s*회"

# Rewrite the misspelling to '저희' in each sentence column.
# NOTE(review): this edits new_df_jeohee, not the merged_df built in the previous cell — confirm intent.
for column in ['previous_sentence', 'jeohee_sentence', 'next_sentence']:
    new_df_jeohee[column] = new_df_jeohee[column].str.replace(pattern, "저희", regex=True)

##전체 토크나이징 코드

In [None]:
import pandas as pd
import re
from konlpy.tag import Kkma
from kiwipiepy import Kiwi

# Kiwi morphological analyzer
kiwi = Kiwi()

# Matches a string made only of characters that are neither word characters nor whitespace
pattern = r"^[^\w\s]+$"

# Move the current index into a column so each row carries an id.
# NOTE(review): an earlier cell already did this in place on the same frame,
# so this is a second reset on top of the first.
old_news_df.reset_index(inplace=True)

# Remove whitespace between '저' and '희' in the 'text' column
old_news_df['text'] = old_news_df['text'].str.replace(r'저\s+희', '저희', regex=True)
# Replace newlines with '-' across the whole frame
old_news_df = old_news_df.replace('\n', '-', regex=True)

In [None]:
# Special-symbol normalization helper
_SPECIAL_CHAR_TABLE = str.maketrans({
    # opening quotation marks -> ASCII double quote
    '『': '"', '「': '"', '‘': '"', '“': '"',
    # closing quotation marks -> ASCII double quote
    '』': '"', '」': '"', '”': '"', '’': '"',
    # bullet-like symbols -> '*'
    '○': '*', '●': '*', '▼': '*', '◇': '*', '△': '*', '▲': '*',
    # separators that are removed outright
    '|': None, '｜': None, '-': None, '—': None,
})


def process_special_characters(text):
    """Normalize quotation marks, bullets and separator symbols in ``text``.

    * 『 「 ‘ “ and 』 」 ” ’ become the ASCII double quote.
    * ○ ● ▼ ◇ △ ▲ become '*'.
    * | ｜ - — are removed.
    * '…' is deliberately left untouched.

    The closing-quote entry used to be the two-character string ' ’'
    (space + quote). A closing ’ directly after a word was therefore never
    converted, and when it did match the preceding space was swallowed.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    return text.translate(_SPECIAL_CHAR_TABLE)

# Split every article into sentences with Kiwi, keeping the article row id
tokenized_texts = []
for _, row in old_news_df.iterrows():
    # Normalize quotes, bullets and separators first
    text = process_special_characters(row['text'])

    # Map CJK / full-width sentence terminators to ASCII. The exclamation-mark
    # step used to replace ASCII '!' with itself, so full-width '！' was never
    # normalized.
    text = (
        text.replace('。', '.')
        .replace('．', '.')
        .replace('！', '!')
        .replace('？', '?')
    )

    # Sentence segmentation
    tokenized_sentences = kiwi.split_into_sents(text)
    tokenized_texts.append((tokenized_sentences, row['index']))


In [None]:

# Accumulates (previous, target, next, article id) tuples
sentences_with_jeohee = []

# Pick each sentence containing '저희' together with its neighbours
for tokenized_sentences, idx in tokenized_texts:
    sentences = [sentence.text for sentence in tokenized_sentences]
    for i, sentence in enumerate(sentences):
        if '저희' in sentence:
            # A neighbour matching `pattern` (punctuation-only), or one that does not exist, is stored as ''
            previous_sentence = sentences[i-1] if i > 0 and not re.match(pattern, sentences[i-1].strip()) else ''
            next_sentence = sentences[i+1] if i < len(sentences) - 1 and not re.match(pattern, sentences[i+1].strip()) else ''
            sentences_with_jeohee.append((previous_sentence.strip(), sentence.strip(), next_sentence.strip(), idx))

# Build the sentence-level frame
new_df_jeohee = pd.DataFrame(sentences_with_jeohee, columns=['previous_sentence', 'jeohee_sentence', 'next_sentence', 'original_index'])

# Count occurrences of '저희' per sentence
new_df_jeohee['jeohee_count'] = new_df_jeohee['jeohee_sentence'].apply(lambda x: x.count('저희'))

# Bring the article-level columns onto every sentence row
new_df_jeohee = new_df_jeohee.merge(old_news_df, left_on='original_index', right_on='index', suffixes=('', '_original'))

# Drop the join keys
new_df_jeohee.drop(columns=['index', 'original_index'], inplace=True)

# Recompute the count (same expression as above; 'jeohee_sentence' is not modified in between)
new_df_jeohee['jeohee_count'] = new_df_jeohee['jeohee_sentence'].apply(lambda x: x.count('저희'))


####중복 값 검증 및 처리

In [None]:
import pandas as pd

def remove_duplicates_by_char_count(df):
    """Drop repeated ``jeohee_sentence`` rows from ``df`` in place.

    One row is kept per distinct sentence: its first occurrence. The earlier
    implementation grouped rows by the exact sentence text and then compared
    sentence lengths inside each group against a 50-character threshold.
    Rows in such a group always have identical text, so the length difference
    was always 0 and every row after the first was removed. This version
    makes that behaviour explicit, drops all rows in a single pass instead of
    once per group, and no longer depends on the tie order of a sort.

    NOTE(review): this function is defined twice in the notebook; the later
    definition shadows the earlier one. Consider keeping a single copy.

    Args:
        df: DataFrame with a 'jeohee_sentence' column. It is modified in place.

    Returns:
        The same ``df`` object, after the duplicate rows were dropped.
    """
    before_count = len(df)

    sentences = df['jeohee_sentence']
    # groupby() skips NaN keys, so rows with a missing sentence were never
    # deduplicated; keep that behaviour by excluding them from the mask.
    is_repeat = sentences.duplicated(keep='first') & sentences.notna()
    df.drop(index=df.index[is_repeat], inplace=True)

    after_count = len(df)
    print("제거된 행 개수:", before_count - after_count)
    return df


# 함수 호출
new_df_jeohee = remove_duplicates_by_char_count(new_df_jeohee)
new_df_jeohee


##학습데이터 추출(레이블링용)

In [None]:
import pandas as pd
import xlsxwriter

# Export `df` to an .xlsx file with every '저희' in the 'jeohee_sentence'
# column shown in red.
# NOTE(review): `df` is not assigned by any earlier cell in this notebook;
# the output file name suggests the sampled frame was meant — confirm.
df['jeohee_sentence'] = df['jeohee_sentence'].astype(str)
output_path = "//content/학습용데이터/recent_df_0707-추출(sampled_df_2100)백업_1차.xlsx"

# The context manager closes (and therefore saves) the workbook even if a write fails.
with xlsxwriter.Workbook(output_path) as workbook:
    worksheet = workbook.add_worksheet(name='Extracted Sentences')

    # Cell / fragment formats
    red_format = workbook.add_format({'font_color': 'red'})
    default_format = workbook.add_format({'font_color': 'black'})  # Default formatting

    # Header row
    for col_num, value in enumerate(df.columns):
        worksheet.write(0, col_num, value, default_format)

    # Data rows
    for idx, row in enumerate(df.itertuples(index=False), start=1):
        for col_idx, value in enumerate(row):
            needs_highlight = (
                df.columns[col_idx] == 'jeohee_sentence'
                and isinstance(value, str)
                and '저희' in value
            )
            if not needs_highlight:
                worksheet.write(idx, col_idx, value, default_format)
                continue

            # Build alternating (format, text) fragments. Empty text fragments
            # are skipped: write_rich_string() rejects them and would leave
            # the cell unwritten (e.g. a sentence that ends with '저희').
            rich_text = []
            fragment_count = 0
            for part_no, part in enumerate(value.split('저희')):
                if part_no > 0:
                    rich_text.extend([red_format, '저희'])
                    fragment_count += 1
                if part:
                    rich_text.extend([default_format, part])
                    fragment_count += 1

            if fragment_count >= 2:
                worksheet.write_rich_string(idx, col_idx, *rich_text)
            else:
                # A rich string needs at least two text fragments; a cell
                # that consists only of the keyword is written plainly in red.
                worksheet.write(idx, col_idx, value, red_format)

print(f"Filtered data saved to {output_path}")


In [None]:
import pandas as pd

# Share of rows per year
year_counts = new_df_jeohee['year'].value_counts(normalize=True)

# Draw about 2100 rows in total, proportionally to each year's share.
# Per-year quotas are rounded independently, so the total can differ slightly from 2100.
year_samples = []
for year, proportion in year_counts.items():
    year_rows = new_df_jeohee[new_df_jeohee['year'] == year]
    # Cap the quota at the rows available; sample() raises when n exceeds
    # the group size, which happens if the frame has fewer than 2100 rows.
    num_samples = min(round(proportion * 2100), len(year_rows))
    year_samples.append(year_rows.sample(n=num_samples, random_state=1))

# Concatenate once instead of growing a frame inside the loop
sampled_df = pd.concat(year_samples) if year_samples else new_df_jeohee.iloc[0:0]

# Reset the index and order by year
sampled_df_2100 = sampled_df.reset_index(drop=True).sort_values(by='year', ascending=True)


###Training 데이터 편집 및 추출

In [None]:
# For each article, emit one row per segment containing '저희', with a
# 'context' made of the previous, matching and next segment.
new_rows = []
for index, row in recent_news_df.iterrows():
    # Process only rows whose 'text' is not NaN
    if pd.notna(row['text']):
        text = str(row['text']).replace('\n', ' ')  # coerce to str and flatten newlines

        # Split after each of . ! ? 。 ． (zero-width lookbehind keeps the delimiter).
        # NOTE(review): the character class also contains a plain space, so the
        # text is split after every space as well — confirm this is intended.
        sentences = re.split(r'(?<=[\.!?。． ])', text)

        for i, sent in enumerate(sentences):
            if '저희' in sent:
                # Copy all article columns onto the new row
                new_row = row.to_dict()

                context_sentences = []
                if i > 0:
                    context_sentences.append(sentences[i - 1].strip())
                context_sentences.append(sent.strip())
                if i < len(sentences) - 1:
                    context_sentences.append(sentences[i + 1].strip())

                new_row['context'] = ' '.join(context_sentences)
                new_rows.append(new_row)

new_df = pd.DataFrame(new_rows)

# Show the result
print(new_df)

In [None]:
new_df = new_df.rename(columns={'Newspaper': 'publisher'})

##분류모델

In [None]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

# Paths to dataset files
train_path = '/content/data/저희 분류 모델/train_small.csv'
test_path = '/content/data/저희 분류 모델/test_small.csv'

# Load datasets and print their heads
train_dataset = pd.read_csv(train_path)
test_dataset = pd.read_csv(test_path)

print("Train Dataset Head:")
print(train_dataset.head())

print("\nTest Dataset Head:")
print(test_dataset.head())

# Set up GPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset Class
class ClassifyDataset(Dataset):
    """Torch dataset over a CSV with a 'sentence' column and a '학습용 분류' label column."""

    def __init__(self, csv_file):
        """Load ``csv_file``, drop incomplete and repeated rows, and set up the tokenizer.

        Raises:
            KeyError: if the '학습용 분류' column is absent.
        """
        frame = pd.read_csv(csv_file)
        frame = frame.dropna()
        self.dataset = frame.drop_duplicates(subset=['sentence'])

        # Debug aid: show which columns were loaded
        print("Columns in dataset:", self.dataset.columns)

        # Fail early when the label column is missing
        has_label_column = '학습용 분류' in self.dataset.columns
        if not has_label_column:
            raise KeyError("The '학습용 분류' column is missing in the dataset")

        self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
        print(self.dataset.describe())

    def __len__(self):
        """Number of rows kept after cleaning."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
        record = self.dataset.iloc[idx]
        sentence = record['sentence']
        label = record['학습용 분류']
        encoded = self.tokenizer(
            sentence,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=256,
            add_special_tokens=True,
        )
        # Drop the leading batch axis added by return_tensors='pt'
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)
        return input_ids, attention_mask, torch.tensor(label, dtype=torch.long)

# Build the train / test datasets from the CSV paths
train_dataset = ClassifyDataset(train_path)
test_dataset = ClassifyDataset(test_path)

# Initialize Model
model = ElectraForSequenceClassification.from_pretrained("beomi/KcELECTRA-base", num_labels=2).to(device)

# Fixing the embedding size mismatch
model.resize_token_embeddings(50135)

# Load previously fine-tuned weights if available
model_path = "/content/data/저희 분류 모델/jh_model.pt"
try:
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime
    state_dict = torch.load(model_path, map_location=device)
except FileNotFoundError:
    # No checkpoint yet: continue from the base pre-trained weights
    print(f"No checkpoint found at {model_path}; using base weights")
except RuntimeError as e:
    print(f"Error loading state_dict: {e}")
else:
    # Copy only tensors whose name and shape match the current model,
    # so unexpected or mismatched keys are ignored.
    model_state_dict = model.state_dict()
    for key, tensor in state_dict.items():
        if key in model_state_dict and model_state_dict[key].size() == tensor.size():
            model_state_dict[key] = tensor
    model.load_state_dict(model_state_dict)

In [None]:
# Training Setup
epochs = 5
batch_size = 16
optimizer = AdamW(model.parameters(), lr=5e-6)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training Loop: per-epoch mean batch loss and training accuracy are recorded
losses = []
accuracies = []
for epoch in range(epochs):
    model.train()
    total_loss = total = correct = 0
    for input_ids, attention_masks, y_batch in tqdm(train_loader):
        input_ids, attention_masks, y_batch = input_ids.to(device), attention_masks.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks)
        loss = torch.nn.functional.cross_entropy(outputs.logits, y_batch)
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Running totals for the epoch summary
        total_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == y_batch).sum().item()
        total += y_batch.size(0)

    accuracy = correct / total
    losses.append(total_loss / len(train_loader))
    accuracies.append(accuracy)
    print(f"Epoch {epoch+1}: Loss {losses[-1]:.4f}, Accuracy {accuracies[-1]:.4f}")

# Save Model
torch.save(model.state_dict(), "/content/data/저희 분류 모델/updated_jh_model.pt")

In [None]:
# Evaluate Model
model.eval()
test_correct = test_total = 0
for input_ids, attention_masks, y_batch in tqdm(test_loader):
    input_ids, attention_masks, y_batch = input_ids.to(device), attention_masks.to(device), y_batch.to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
        predictions = torch.argmax(outputs.logits, dim=1)
        test_correct += (predictions == y_batch).sum().item()
        test_total += y_batch.size(0)

# Print Test Accuracy
test_accuracy = test_correct / test_total
print(f"Test Accuracy: {test_accuracy:.4f}")

# Function to classify sentences
import functools


@functools.lru_cache(maxsize=1)
def _classify_sentence_tokenizer():
    """Load the KcELECTRA tokenizer once and reuse it across calls."""
    return AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")


def classify_sentence(sentence):
    """Return the predicted class index for ``sentence``.

    The sentence is tokenized (truncated / padded to 256 tokens), run through
    the global ``model`` on ``device`` without gradients, and the argmax of
    the logits is returned. The tokenizer is cached; it used to be re-loaded
    with ``from_pretrained`` on every call.

    Args:
        sentence: Text to classify.

    Returns:
        The index of the highest logit (a NumPy integer).
    """
    tokenizer = _classify_sentence_tokenizer()
    model.eval()
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding='max_length', max_length=256, add_special_tokens=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits.squeeze(0).detach().cpu().numpy()
    return np.argmax(logits)

# Example usage
print(classify_sentence("여기 예문을 넣으세요"))

In [None]:
print(classify_sentence("다 저희 잘못입니다"))

In [None]:
print(classify_sentence("저희 놈들은 죽어 마땅해요!"))

In [None]:
print(classify_sentence("저희편인지 우리편인지 구분도 못하는 놈들은 죽어 마땅해요!"))

In [None]:
# GPU 사용
device = torch.device("cuda")

In [None]:
news_df = new_df_jeohee

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, ElectraForSequenceClassification
import matplotlib.pyplot as plt
import seaborn as sns

# 모델과 토크나이저 로드
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
model = ElectraForSequenceClassification.from_pretrained("beomi/KcELECTRA-base", num_labels=2).to(device)

# 가중치 로드 (필요할 경우)
model_path = "/content/data/저희 분류 모델/updated_jh_model.pt"
state_dict = torch.load(model_path, map_location=device)

# 모델의 임베딩 레이어 크기 조정
model.resize_token_embeddings(50135)

# 가중치 로드, strict=False allows ignoring non-matching keys
model.load_state_dict(state_dict, strict=False)

# Sentence classification helper
def classify_sentence(sentence):
    """Return the argmax class index the global ``model`` predicts for ``sentence``.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=256,
        add_special_tokens=True,
    )
    # Move every encoded tensor to the model's device
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        result = model(**on_device)
    scores = result.logits.squeeze(0).detach().cpu().numpy()
    return np.argmax(scores)

def extract_and_classify(df):
    """Classify every '저희' sentence found in the 'Title' and 'Summary' columns.

    Each non-null cell is split on '. '; every piece containing '저희' is
    passed to ``classify_sentence``.

    Args:
        df: DataFrame with 'Title', 'Summary', 'Newspaper' and 'Published Date' columns.

    Returns:
        DataFrame with one row per classified sentence and the columns
        Sentence, Label, Column, Index, Newspaper, Published Date.
    """
    records = []
    for row_label, record in df.iterrows():
        for field in ('Title', 'Summary'):
            cell = record[field]
            if not pd.notna(cell):
                continue
            for piece in cell.split('. '):
                if '저희' not in piece:
                    continue
                predicted = classify_sentence(piece)
                records.append({
                    "Sentence": piece,
                    "Label": predicted,
                    "Column": field,
                    "Index": row_label,
                    "Newspaper": record["Newspaper"],
                    "Published Date": record["Published Date"],
                })
    return pd.DataFrame(records)


In [None]:
# 결과 추출 및 분류
results_df = extract_and_classify(news_df)

# 연도 추출 (if results_df is not empty)
results_df['Published Date'] = results_df['Published Date'].astype(str)
results_df['Published Year'] = results_df['Published Date'].apply(lambda x: x.split('.')[0])

# 결과 출력
results_df

In [None]:

# 통계 계산
statistics_df = results_df.groupby(["Newspaper", "Published Year", "Label"]).size().reset_index(name='Count')

# 통계 출력
statistics_df

# Dataset 만들어서 불러오기

In [None]:
class ClassifyDataset(Dataset):
  """Torch dataset over a CSV whose first column is the text and second column is the label.

  NOTE(review): this redefines a class of the same name from an earlier cell,
  which selects columns by name ('sentence', '학습용 분류') rather than by position.
  """

  def __init__(self, csv_file):
    """Load ``csv_file``, drop rows with NaN and repeated 'sentence' values, and set up the tokenizer."""
    # Some values are NaN; drop those rows
    self.dataset = pd.read_csv(csv_file).dropna(axis=0)
    # Remove duplicated sentences
    self.dataset.drop_duplicates(subset=['sentence'], inplace=True)

    self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base") #monologg/koelectra-small-v2-discriminator

    print(self.dataset.describe())

  def __len__(self):
    """Number of rows kept after cleaning."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
    row = self.dataset.iloc[idx, 0:3].values  # row idx, columns 0..2
    text = row[0]
    y = row[1]

    inputs = self.tokenizer(
        text,
        return_tensors='pt',  # return PyTorch tensors
        truncation=True,  # keep at most max_length tokens
        max_length=256,
        # padding='max_length' replaces the deprecated pad_to_max_length=True
        # and matches the other ClassifyDataset definition in this notebook
        padding='max_length',
        add_special_tokens=True  # add the model's special tokens around the sentence
        )

    input_ids = inputs['input_ids'][0]  # model input, batch axis removed
    attention_mask = inputs['attention_mask'][0]  # 1 for real tokens, 0 for padding

    return input_ids, attention_mask, y

In [None]:
# Build the datasets from the CSV paths defined in the model cell above.
# `train` / `test` were never assigned anywhere in this notebook, while
# ClassifyDataset expects a CSV file path, so use train_path / test_path.
train_dataset = ClassifyDataset(train_path)
test_dataset = ClassifyDataset(test_path)

# Create Model

In [None]:
# TextClassificationPipeline was used without ever being imported
from transformers import TextClassificationPipeline

# Base KcELECTRA classifier and tokenizer
model = ElectraForSequenceClassification.from_pretrained("beomi/KcELECTRA-base").to(device)
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
# Pipeline device: GPU 0 when `device` is CUDA, otherwise CPU (-1).
# A hard-coded device=0 failed on CPU-only runtimes.
sentiment_classifier = TextClassificationPipeline(
    tokenizer=tokenizer,
    model=model,
    device=0 if device.type == "cuda" else -1,
)

In [None]:
model.load_state_dict(torch.load("/content/data/저희 분류 모델/jh_model.pt"))

# Learn

In [None]:
epochs = 5
batch_size = 16

In [None]:
optimizer = AdamW(model.parameters(), lr=5e-6)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [None]:
# Per-epoch summed loss and training accuracy
losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()  # enable training mode

  # One pass over the training set; tqdm shows progress
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()

    # Labels as int64 on the model's device
    y_batch = y_batch.to(device=device, dtype=torch.long)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    # `F` is only imported in a later cell, so the fully qualified function
    # is used here to keep this cell runnable from a fresh kernel.
    loss = torch.nn.functional.cross_entropy(y_pred, y_batch)

    loss.backward()
    optimizer.step()  # update weights and biases

    total_loss += loss.item()

    # Predicted class = index of the largest logit
    _, predicted = torch.max(y_pred, 1)
    # .item() keeps the counter a plain int instead of a device tensor
    correct += (predicted == y_batch).sum().item()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct / total)

  losses.append(total_loss)
  accuracies.append(correct / total)
  print("Train Loss:", total_loss, "Accuracy:", correct / total)

In [None]:
losses, accuracies

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F

# Assuming model and datasets are already defined
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Hyper-parameters
epochs = 5
batch_size = 16

optimizer = AdamW(model.parameters(), lr=5e-6)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Per-epoch mean batch loss and training accuracy
losses = []
accuracies = []

for i in range(epochs):
    model.train()
    total_loss, correct, total, batches = 0.0, 0, 0, 0

    for input_ids_batch, attention_masks_batch, y_batch in train_loader:
        optimizer.zero_grad()

        # Labels as int64 on the model's device
        y_batch = y_batch.type(torch.LongTensor).to(device)
        # Index 0 of the model output holds the logits
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        # Running totals for progress and epoch summaries
        total_loss += loss.item()
        _, predicted = torch.max(y_pred, 1)
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)
        batches += 1

        # Progress report every 100 batches
        if batches % 100 == 0:
            print(f"Batch {batches}: Loss: {total_loss / batches:.2f}, Accuracy: {correct / total * 100:.2f}%")

    losses.append(total_loss / batches)
    accuracies.append(correct / total)
    print(f"Epoch {i+1}: Train Loss: {losses[-1]:.2f}, Accuracy: {accuracies[-1] * 100:.2f}%")

losses, accuracies


In [None]:
# 모델 저장하기
torch.save(model.state_dict(), "2nd_jh_model.pt")

In [None]:
from transformers import ElectraForSequenceClassification, ElectraConfig

# Rebuild the architecture the checkpoint was saved from. The model trained
# above comes from "beomi/KcELECTRA-base"; a config taken from
# 'google/electra-small-discriminator' has different layer sizes, so
# load_state_dict() would fail with size mismatches.
config = ElectraConfig.from_pretrained('beomi/KcELECTRA-base', num_labels=2)
config.vocab_size = 50135  # This needs to match the saved model's vocab size

model = ElectraForSequenceClassification(config)

# Load the weights on CPU; the evaluation cell moves the model to `device`.
# NOTE(review): the save cell above writes "2nd_jh_model.pt" to the working
# directory, while this path points under /content/data — confirm the file
# is copied there.
model.load_state_dict(
    torch.load("/content/data/저희 분류 모델/2nd_jh_model.pt", map_location="cpu")
)


테스트 데이터셋 정확도 확인하기

In [None]:
import torch
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)  # move the model to the selected device

model.eval()  # evaluation mode

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    # Move the inputs to the selected device
    input_ids_batch = input_ids_batch.to(device)
    attention_masks_batch = attention_masks_batch.to(device)
    # Convert dtype and device in one step. `.type(torch.LongTensor)` after
    # `.to(device)` moved the labels back to the CPU, so comparing them with
    # GPU predictions failed with a device mismatch.
    y_batch = y_batch.to(device=device, dtype=torch.long)

    with torch.no_grad():
        y_pred = model(input_ids_batch, attention_mask=attention_masks_batch)[0]
        _, predicted = torch.max(y_pred, 1)

        # Count correct predictions
        test_correct += (predicted == y_batch).sum().item()
        test_total += y_batch.size(0)

# Compute and print the accuracy
accuracy = test_correct / test_total
print("Accuracy:", accuracy)


In [None]:
# Classify one sentence at a time
import functools


@functools.lru_cache(maxsize=1)
def _sentences_predict_tokenizer():
    """Load the KcELECTRA tokenizer once and reuse it across calls."""
    return AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")


def sentences_predict(sent):
    """Return the predicted class index for ``sent``.

    The sentence is tokenized (truncated to 256 tokens, no padding) and run
    through the global ``model`` on ``device`` without gradients. The
    tokenizer is cached; it used to be re-loaded on every call.

    Args:
        sent: Text to classify.

    Returns:
        The index of the highest logit (a NumPy integer).
    """
    tokenizer = _sentences_predict_tokenizer()
    model.eval()
    tokenized_sent = tokenizer(
            sent,
            return_tensors="pt",
            truncation=True,
            add_special_tokens=True,
            max_length=256
    )
    tokenized_sent = tokenized_sent.to(device)

    with torch.no_grad():  # no gradient tracking during inference
        outputs = model(
            input_ids=tokenized_sent['input_ids'],
            attention_mask=tokenized_sent['attention_mask'],
            token_type_ids=tokenized_sent['token_type_ids']
            )

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    result = np.argmax(logits)
    return result

# 조선 동아 데이터 분류

In [None]:
cd_data= pd.read_pickle('./저희_조선동아_1954_1999.pkl')
cd_data

In [None]:
score = [] # one pipeline result (label + score) per text

total_len = len(cd_data)

# Run the classification pipeline on every text and report progress per row.
# NOTE(review): no truncation argument is passed; texts longer than the
# model's maximum input length may fail — confirm.
for cnt, review in enumerate(cd_data['text']):
  pred = sentiment_classifier(review)
  score.append(pred)
  print(cnt, '개 문장 분류 완료')

In [None]:
len(score) #전체 데이터 개수와 같은지 확인

In [None]:
# Attach the pipeline output to the frame.
# 'predicted' is the last character of the pipeline label parsed as an int
# (1: positive / 0: negative, per the original note); 'score' is the
# pipeline's confidence for that label.
if len(score) != len(cd_data):
    raise ValueError(
        f"Expected one prediction per row: {len(score)} predictions for {len(cd_data)} rows"
    )

# Whole-column assignment replaces `cd_data[col][i] = ...`. That chained
# indexing triggers SettingWithCopyWarning (and may not write through), and
# it stored float scores into a column initialised as int.
cd_data['predicted'] = [int(pred[0].get('label')[-1]) for pred in score]
cd_data['score'] = [float(pred[0].get('score')) for pred in score]

cd_data

In [None]:
cd_data.to_csv('조선동아_저희 모델 분류 결과_10words.csv', index=False)
#test_pd.to_excel('저희 모델 분류 결과.xlsx', index=False)

In [None]:
df=cd_data.groupby(['year', 'publisher', 'predicted']).count()
df

In [None]:
df=df[['text']]
df

In [None]:
df.to_excel('./조선동아_저희 모델 분류 결과 추세.xlsx')

### 시각화

#### 전체

In [None]:
import os
import platform

# Pick a Korean-capable font per operating system.
# `os.name == 'posix'` is true on Linux (including Colab) as well as macOS,
# so it selected AppleGothic on systems that do not ship it. Distinguish
# the platforms explicitly; elsewhere use the Nanum font installed by the
# setup cell (`fonts-nanum`).
_system_name = platform.system()
if _system_name == 'Darwin':
    plt.rc("font", family="AppleGothic")
elif _system_name == 'Windows':
    plt.rc("font", family="Malgun Gothic")
else:
    plt.rc("font", family="NanumGothic")

In [None]:
df=cd_data.groupby(['year', 'predicted']).count()
df

In [None]:
df.reset_index(inplace=True)
df

In [None]:
df.set_index(['year'], inplace=True)
df

In [None]:
# Per-year counts for each predicted label, written onto every row of that year.
# `df` is indexed by year with one row per (year, predicted) pair; the
# 'text' column holds the row count of each pair.
text_counts = df['text']
label_1_counts = text_counts[df['predicted'] == 1]
label_0_counts = text_counts[df['predicted'] == 0]

# reindex() aligns the per-year counts back onto the (repeated) year index
# and fills years with no rows for a label with 0. The old loop used chained
# assignment, raised on years without label 1, and hid lookup errors for
# label 0 behind a bare `except`.
df['겸양의 저희'] = label_1_counts.reindex(df.index, fill_value=0).to_numpy()
df['지칭의 저희'] = label_0_counts.reindex(df.index, fill_value=0).to_numpy()
df

In [None]:
df= df[['겸양의 저희', '지칭의 저희']]

In [None]:
df.reset_index(inplace=True)

In [None]:
df= df.drop_duplicates('year')

In [None]:
df.set_index('year', inplace=True)
df.plot()