## Imports

In [None]:
# !pip install numpy==1.23.5
# !pip install -U pip setuptools wheel
# !pip install -U 'spacy[cuda-autodetect]'
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md

In [None]:
#!pip install numpy requests nlpaug
#!pip install torch>=1.6.0 transformers>=4.11.3 sentencepiece

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import unicodedata
import re
from tqdm import tqdm
import os

import itertools
import spacy
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk import ne_chunk
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist
import nlpaug.augmenter.word as naw

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', 30)

# 1. Dataset Preparation

## Load the dataset

In [None]:
# Read the train and test splits from the ./data directory (relative to the working directory).
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# 1. Data Preprocess

## Statistics before Augmentation

In [None]:
# Mean size of the train 'facts' text: characters per row, then whitespace-separated words per row.
avg_char_before_augmentation = train['facts'].map(lambda fact: len(str(fact))).mean()
print(f'Average train facts character length (before augmentation): {avg_char_before_augmentation:.0f}')

avg_word_before_augmentation = train['facts'].map(lambda fact: len(str(fact).split())).mean()
print(f'Average train facts word length (before augmentation): {avg_word_before_augmentation:.0f}')

In [None]:
# Mean size of the test 'facts' text: characters per row, then whitespace-separated words per row.
avg_char_before_augmentation = test['facts'].map(lambda fact: len(str(fact))).mean()
print(f'Average test facts character length (before augmentation): {avg_char_before_augmentation:.0f}')

avg_word_before_augmentation = test['facts'].map(lambda fact: len(str(fact).split())).mean()
print(f'Average test facts word length (before augmentation): {avg_word_before_augmentation:.0f}')

## Will 통일

In [None]:
# Text columns that every cleaning step below is applied to.
columns_to_apply = [
    'first_party',
    'second_party',
    'facts',
]

In [None]:
# Rewrite the capitalised whole word "Will" as "Willn" in every text column of both splits.
# NOTE(review): presumably this keeps the name "Will" apart from the stop word "will"
# once the text is lower-cased later on — confirm.
for frame in (train, test):
    for column in columns_to_apply:
        frame[column] = frame[column].apply(lambda text: re.sub(r'\bWill\b', 'Willn', text))

## United States 통일

In [None]:
# Spellings of "United States" (and the U.S. Code) mapped to one padded token.
# The keys are applied with ``replace(..., regex=True)``, so each key is a
# regular expression: literal periods are escaped (an unescaped "U.S." also
# matches words such as "USSR"), and the bare "US " form is anchored with a
# word boundary so it no longer fires inside words like "VIRUS ".
replace_dict = {
    r'U\. S\. C\.': ' USC ',
    r'U\.S\.C\.': ' USC ',
    r'U\.S\.A\.': ' USA ',
    r'U\. S\.': ' USA ',
    r'U\.S\.': ' USA ',
    r'\bUS ': ' USA ',
    'United States of America': ' USA ',
    'United States': ' USA ',
    'united states': ' USA '
}

In [None]:
# Run the regex substitutions from replace_dict over every column of both
# frames (all columns, not only the text columns).
for frame in (train, test):
    for col in frame.columns:
        frame[col] = frame[col].replace(replace_dict, regex=True)

In [None]:
def remove_extra_whitespaces_func(text):
    """Collapse every run of whitespace to a single space and trim both ends."""
    collapsed = re.sub(r'^\s*|\s\s*', ' ', text)
    return collapsed.strip()

In [None]:
# Normalise spacing in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_extra_whitespaces_func)
    )

## 한글자 + '.' 제거

In [None]:
def remove_Large_and_Large(text):
    """Strip the periods from dotted capital-letter sequences.

    Periods (with or without following whitespace) that sit between a capital
    letter and a following run of capitals ending in a period are deleted, so
    "J. K." and "A.B.C." become "JK." and "ABC.". Afterwards every remaining
    "<capital>." is rewritten as the capital followed by a space.
    """
    # Periods followed by whitespace between dotted capitals: "J. K." -> "JK."
    text = re.sub(r'(?<=[A-Z])\.+\s+(?=[A-Z]+\.)', '', text)
    # Periods directly between dotted capitals: "A.B.C." -> "ABC."
    text = re.sub(r'(?<=[A-Z])\.+(?=[A-Z]+\.)', '', text)
    # Any capital still followed by a period loses it in favour of a space.
    return re.sub(r'([A-Z])\.', lambda m: m.group(1) + ' ', text)

In [None]:
# Remove periods from capital-letter initials in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_Large_and_Large)
    )

In [None]:
# Normalise spacing in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_extra_whitespaces_func)
    )

## & 처리

In [None]:
def remove_And(text):
    """Rewrite ampersands.

    An "&" that joins two capital letters (with or without surrounding
    whitespace) is replaced by the letter "n", e.g. "AT & T" -> "ATnT".
    Every other "&" becomes " and ".
    """
    spaced_between_capitals = r'(?<=[A-Z])\s+\&\s+(?=[A-Z])'
    tight_between_capitals = r'(?<=[A-Z])\&(?=[A-Z])'

    text = re.sub(spaced_between_capitals, 'n', text)
    text = re.sub(tight_between_capitals, 'n', text)
    return text.replace('&', ' and ')

In [None]:
# Rewrite ampersands in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_And)
    )

In [None]:
# Normalise spacing in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_extra_whitespaces_func)
    )

## 대문자 한글자 제거

In [None]:
def remove_One_Large(text):
    """Blank out stand-alone single capital letters.

    The text is padded with one space on each side so that a letter at the
    very start or end is also surrounded by whitespace. The padding is part
    of the returned string.
    """
    padded = ' ' + text + ' '
    lone_capital = re.compile(r'(?<=\s)[A-Z](?=\s)')
    return lone_capital.sub(' ', padded)

In [None]:
# Drop stand-alone capital letters from the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_One_Large)
    )

In [None]:
# Normalise spacing in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_extra_whitespaces_func)
    )

## Co. Bd. Mt. 처리

In [None]:
# Regex patterns for dotted abbreviations; the period is escaped so it matches literally.
Hon_rep = [
    r'Co\.',
    r'CO\.',
    r'Bd\.',
    r'Mt\.',
]

In [None]:
def replace_values_in_column(column, pattern, replacement):
    """Return a list in which regex ``pattern`` is replaced by ``replacement`` in each item.

    Iteration over ``column`` is wrapped in ``tqdm`` to show progress.
    """
    return [re.sub(pattern, replacement, item) for item in tqdm(column)]

# Replace every match of ``pattern`` in one column with ``replacement``.
def replace_values_in_train(train, column_name, pattern, replacement):
    """Overwrite ``train[column_name]`` with its substituted values and return the frame."""
    updated_values = replace_values_in_column(train[column_name], pattern, replacement)
    train[column_name] = updated_values
    return train

def replace_values_in_test(test, column_name, pattern, replacement):
    """Overwrite ``test[column_name]`` with its substituted values and return the frame."""
    updated_values = replace_values_in_column(test[column_name], pattern, replacement)
    test[column_name] = updated_values
    return test

In [None]:
# Expand each abbreviation in Hon_rep in every text column of the train frame.
# NOTE(review): 'Bd.' is expanded to ' Building '; in case captions it often
# stands for "Board" — confirm the intended expansion.
for pattern, replacement in zip(Hon_rep, [' Company ', ' Company ', ' Building ', ' Mount ']):
    for column_name in ('first_party', 'second_party', 'facts'):
        train = replace_values_in_train(train, column_name, pattern, replacement)

In [None]:
# Expand each abbreviation in Hon_rep in every text column of the test frame.
# NOTE(review): 'Bd.' is expanded to ' Building '; in case captions it often
# stands for "Board" — confirm the intended expansion.
for pattern, replacement in zip(Hon_rep, [' Company ', ' Company ', ' Building ', ' Mount ']):
    for column_name in ('first_party', 'second_party', 'facts'):
        test = replace_values_in_test(test, column_name, pattern, replacement)

In [None]:
# Normalise spacing in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_extra_whitespaces_func)
    )

## INC 처리

In [None]:
def remove_dot_after_inc1(text):
    """Drop the period of "Inc." when whitespace and then a non-capital character follow.

    Only the period belonging to "Inc." is removed. The token after it is
    left untouched, so periods inside it (e.g. the decimal point in
    "Inc. 1.5") are preserved. "Inc." followed by a capitalised word or by
    the end of the text is left as is.
    """
    # The lookahead checks what follows without consuming it, so the
    # substitution can only ever remove the one period after "Inc".
    return re.sub(r'Inc\.(?=\s+[^A-Z\s])', 'Inc', text)

def remove_dot_after_inc2(text):
    """Drop the period of "Inc." when a non-capital, non-space character follows directly.

    Only the period belonging to "Inc." is removed (e.g. "Inc.," -> "Inc,").
    Periods later in the same token are preserved.
    """
    # The lookahead checks the next character without consuming it, so the
    # substitution can only ever remove the one period after "Inc".
    return re.sub(r'Inc\.(?=[^A-Z\s])', 'Inc', text)

def remove_dot_after_ltd1(text):
    """Drop the period of "Ltd." when whitespace and then a non-capital character follow.

    Only the period belonging to "Ltd." is removed. The token after it is
    left untouched, so periods inside it (e.g. the decimal point in
    "Ltd. 1.5") are preserved. "Ltd." followed by a capitalised word or by
    the end of the text is left as is.
    """
    # The lookahead checks what follows without consuming it, so the
    # substitution can only ever remove the one period after "Ltd".
    return re.sub(r'Ltd\.(?=\s+[^A-Z\s])', 'Ltd', text)

def remove_dot_after_ltd2(text):
    """Drop the period of "Ltd." when a non-capital, non-space character follows directly.

    Only the period belonging to "Ltd." is removed (e.g. "Ltd.," -> "Ltd,").
    Periods later in the same token are preserved.
    """
    # The lookahead checks the next character without consuming it, so the
    # substitution can only ever remove the one period after "Ltd".
    return re.sub(r'Ltd\.(?=[^A-Z\s])', 'Ltd', text)

In [None]:
# Run the four "Inc."/"Ltd." period removers, in order, over the text columns
# of train and then of test.
for frame in (train, test):
    for strip_dot in (
        remove_dot_after_inc1,
        remove_dot_after_inc2,
        remove_dot_after_ltd1,
        remove_dot_after_ltd2,
    ):
        frame[columns_to_apply] = frame[columns_to_apply].apply(
            lambda column: column.apply(strip_dot)
        )

In [None]:
# Normalise spacing in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_extra_whitespaces_func)
    )

# 2. Data Augmentation

## Extracted the nouns from the facts

In [None]:
# Ask spaCy to use a GPU if one is available, then load the small English pipeline
# (used below for noun-chunk extraction).
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

In [None]:
# Placeholders for the per-row noun-chunk lists; both names are reassigned
# when the chunks are extracted in the next cell.
noun_str_train = []
noun_str_test = []

In [None]:
# One list of noun-chunk strings per row of 'facts', for train and for test.
noun_str_train = [
    [chunk.text for chunk in nlp(fact).noun_chunks]
    for fact in train['facts']
]
noun_str_test = [
    [chunk.text for chunk in nlp(fact).noun_chunks]
    for fact in test['facts']
]

## Augmented the data using contextual word embeddings

In [None]:
# Ten stacked copies of every label-0 row, ordered by the numeric part of the
# ID so that the ten copies of each case sit next to each other.
train_0 = pd.concat([train[train['first_party_winner'] == 0].copy()] * 10, ignore_index=True)
train_0 = (
    train_0.assign(number=train_0['ID'].str.replace('TRAIN_', '').astype(int))
    .sort_values('number')
    .reset_index(drop=True)
    .drop('number', axis=1)
)

In [None]:
# Ten stacked copies of every label-1 row, ordered by the numeric part of the
# ID so that the ten copies of each case sit next to each other.
train_1 = pd.concat([train[train['first_party_winner'] == 1].copy()] * 10, ignore_index=True)
train_1 = (
    train_1.assign(number=train_1['ID'].str.replace('TRAIN_', '').astype(int))
    .sort_values('number')
    .reset_index(drop=True)
    .drop('number', axis=1)
)

In [None]:
# Will hold, per train row, the de-duplicated noun chunks (filled in the next cell).
noun_str_train_2 = []

In [None]:
# Append one de-duplicated chunk list per row; going through set() means the
# order of chunks inside each list is not preserved.
noun_str_train_2.extend(list(set(chunks)) for chunks in noun_str_train)

In [None]:
# Accumulators for the original + augmented facts of label-0 and label-1 rows.
final_facts_train_0 = []
final_facts_train_1 = []

In [None]:
# Index labels of the train rows for each value of first_party_winner.
indices_train_0 = train[train['first_party_winner'] == 0].index.tolist()
indices_train_1 = train[train['first_party_winner'] == 1].index.tolist()

In [None]:
# For every label-0 row: keep the original facts, then append nine
# WordNet-synonym augmentations of it.
for i in tqdm(indices_train_0):
    # The row's own noun chunks are handed to the augmenter as stopwords.
    # NOTE(review): noun_str_train_2 is a list indexed with the frame's index
    # label, which assumes a 0..n-1 index — confirm.
    aug = naw.SynonymAug(aug_src='wordnet', stopwords=noun_str_train_2[i])
    # Wrapped in a list so it can be flattened together with the augment() results later.
    final_facts_train_0.append([train['facts'][i]])
    for j in range(9):
        # NOTE(review): the positional 1 and 8 are presumably n and num_thread —
        # confirm against the installed nlpaug version.
        augmented_train_0 = aug.augment(train['facts'][i],1,8)
        final_facts_train_0.append(augmented_train_0)

In [None]:
# For every label-1 row: keep the original facts, then append nine
# WordNet-synonym augmentations of it.
for i in tqdm(indices_train_1):
    # The row's own noun chunks are handed to the augmenter as stopwords.
    # NOTE(review): noun_str_train_2 is a list indexed with the frame's index
    # label, which assumes a 0..n-1 index — confirm.
    aug = naw.SynonymAug(aug_src='wordnet', stopwords=noun_str_train_2[i])
    # Wrapped in a list so it can be flattened together with the augment() results later.
    final_facts_train_1.append([train['facts'][i]])
    for j in range(9):
        # NOTE(review): the positional 1 and 8 are presumably n and num_thread —
        # confirm against the installed nlpaug version.
        augmented_train_1 = aug.augment(train['facts'][i],1,8)
        final_facts_train_1.append(augmented_train_1)

In [None]:
# Flatten the per-append sub-sequences into one flat list of texts per label.
final_facts_train_0 = list(itertools.chain.from_iterable(final_facts_train_0))
final_facts_train_1 = list(itertools.chain.from_iterable(final_facts_train_1))

In [None]:
# Overwrite 'facts' positionally with the original + augmented texts.
# NOTE(review): this relies on the list order matching the row order of
# train_0 / train_1 (ten consecutive rows per case) — confirm.
train_0['facts'] = final_facts_train_0
train_1['facts'] = final_facts_train_1

In [None]:
# Stack the two label groups back into a single frame with a fresh 0..n-1 index.
train = pd.concat([train_0, train_1], ignore_index=True)

In [None]:
# Restore case order after stacking the two label groups, then renumber.
# The numeric part of the ID is the sort key: sorting the ID strings only
# gives numeric order while every ID has the same width. A stable sort keeps
# the rows of one case in their existing order (original text first, then its
# augmentations).
train['number'] = train['ID'].str.replace('TRAIN_', '').astype(int)
train = train.sort_values('number', kind='mergesort').reset_index(drop=True)
train = train.drop('number', axis=1)
# Give every row (originals and augmentations alike) a new sequential ID.
train['ID'] = train.index.map(lambda x: f'TRAIN_{x:04}')

## et al. 제거

In [None]:
# Latin legal abbreviations ("et al.", "ex rel.", ...) to blank out.
# The keys are applied with ``replace(..., regex=True)``, so each key is a
# regular expression. Every pattern is anchored on word boundaries and its
# periods are escaped and optional; without that, plain text is corrupted
# (e.g. "at al" matches inside "that all", "et al" inside "get all",
# "etc" inside "sketch", and "ex re" eats the start of "ex rel").
# Each pattern covers the four spellings "xx. yy.", "xx. yy", "xx yy." and "xx yy".
_LATIN_ABBREVIATIONS = (
    ('et', 'al'),
    ('at', 'al'),
    ('et', 'ux'),
    ('et', 'ex'),
    ('ex', 're'),
    ('et', 're'),
    ('et', 'seq'),
    ('et', 'vir'),
    ('ex', 'rel'),
)

replace_dict = {
    rf'\b{first}\.? {second}\b\.?': ' '
    for first, second in _LATIN_ABBREVIATIONS
}
# "etc" as a whole word only; a following period is left in place.
replace_dict[r'\betc\b'] = ' '

In [None]:
# Run the regex substitutions from replace_dict over every column of both
# frames (all columns, not only the text columns).
for frame in (train, test):
    for col in frame.columns:
        frame[col] = frame[col].replace(replace_dict, regex=True)

In [None]:
# Normalise spacing in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_extra_whitespaces_func)
    )

## 영어 호칭

In [None]:
# Titles and similar dotted abbreviations to strip. They are removed with
# str.replace, so each entry is matched as a literal substring.
Honor = [
    r'Mr.', r'Mrs.', r'Miss.', r'Dr.', r'Rev.', r'Prof.', r'Capt.',
    r'Sgt.', r'St.', r'Sr.', r'Jr.', r'Ms.', r'No.',
]

In [None]:
def _strip_honorifics(text):
    """Remove every entry of ``Honor`` from ``text`` and tidy the spacing.

    For each title in turn: pad the text with a space on both sides, replace
    the title (literal substring match) with a space, collapse whitespace
    runs to one space and trim both ends.
    """
    for title in Honor:
        text = (" " + text + " ").replace(title, ' ')
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r"^\s+|\s+$", "", text)
    return text


# Each column is rebuilt in one pass instead of reading and writing single
# cells through .loc for every title, which is much faster and does not
# depend on the frame having a 0..n-1 index.
for frame in (train, test):
    for column in ('facts', 'first_party', 'second_party'):
        frame[column] = [_strip_honorifics(value) for value in tqdm(frame[column])]

In [None]:
# Normalise spacing in the text columns of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_extra_whitespaces_func)
    )

## Sentence Segmentation

In [None]:
# Ask spaCy to use a GPU if one is available, then switch `nlp` to the medium
# English pipeline (used below for sentence splitting).
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_md")

In [None]:
# Empty lists for per-split sentences.
# NOTE(review): no later use of these two names is visible in this notebook — confirm they are still needed.
sentence_train = []
sentence_test = []

In [None]:
# Split each facts text into sentences with spaCy and re-join them with a
# ' [SEP]' marker appended to every sentence (including the last one).
train['facts'] = train['facts'].apply(lambda text: ' '.join([str(sent) + ' [SEP]' for sent in nlp(text).sents]))
test['facts'] = test['facts'].apply(lambda text: ' '.join([str(sent) + ' [SEP]' for sent in nlp(text).sents]))

# 3. Preprocess Dataset

## Statistics before Preprocessing

In [None]:
# Mean size of the train 'facts' text: characters per row, then whitespace-separated words per row.
avg_char_before_preprocessing = train['facts'].map(lambda fact: len(str(fact))).mean()
print(f'Average train facts character length (before preprocessing): {avg_char_before_preprocessing:.0f}')

avg_word_before_preprocessing = train['facts'].map(lambda fact: len(str(fact).split())).mean()
print(f'Average train facts word length (before preprocessing): {avg_word_before_preprocessing:.0f}')

In [None]:
# Mean size of the test 'facts' text: characters per row, then whitespace-separated words per row.
avg_char_before_preprocessing = test['facts'].map(lambda fact: len(str(fact))).mean()
print(f'Average test facts character length (before preprocessing): {avg_char_before_preprocessing:.0f}')

avg_word_before_preprocessing = test['facts'].map(lambda fact: len(str(fact).split())).mean()
print(f'Average test facts word length (before preprocessing): {avg_word_before_preprocessing:.0f}')

## Text Preprocessing Functions

In [None]:
def remove_html_tags_func(text):
    """Parse ``text`` with BeautifulSoup's 'html.parser' and return its text content."""
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()


def remove_url_func(text):
    """Delete http(s):// and www. URLs, each up to the next whitespace character."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)


def remove_accented_chars_func(text):
    """Return ``text`` reduced to ASCII.

    The text is NFKD-normalised, encoded to ASCII with unencodable characters
    dropped, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def remove_punctuation_func(text):
    """Replace every character that is not an ASCII letter or digit with a space."""
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    return non_alphanumeric.sub(' ', text)


def remove_irr_char_func(text):
    """Replace every character that is not an ASCII letter (digits included) with a space."""
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', text)


def remove_extra_whitespaces_func(text):
    """Collapse every run of whitespace to a single space and trim both ends."""
    collapsed = re.sub(r'^\s*|\s\s*', ' ', text)
    return collapsed.strip()

def remove_english_stopwords_func(text):
    """Drop English stop words from a sequence of tokens.

    Args:
        text: iterable of tokens (e.g. the output of ``word_tokenize``).

    Returns:
        The remaining tokens joined with single spaces. Tokens are compared
        in lower case but kept in their original form.
    """
    # Load the stop-word list once per call and use a set for membership
    # tests, instead of reloading the list for every token.
    stop_words = set(stopwords.words("english"))
    return ' '.join(token for token in text if token.lower() not in stop_words)

def norm_lemm_v_a_func(text):
    """Lemmatize ``text`` twice with WordNet: first as verbs, then as adjectives.

    The text is tokenized, lemmatized with pos='v' and re-joined; the result
    is tokenized again, lemmatized with pos='a' and re-joined with spaces.
    """
    lemmatizer = WordNetLemmatizer()
    as_verbs = ' '.join(lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(text))
    return ' '.join(lemmatizer.lemmatize(token, pos='a') for token in word_tokenize(as_verbs))

def remove_single_char_func(text, threshold=1):
    """Tokenize ``text`` and keep only tokens longer than ``threshold`` characters.

    The surviving tokens are returned joined with single spaces.
    """
    kept = (token for token in word_tokenize(text) if len(token) > threshold)
    return ' '.join(kept)

## Text Cleaning

In [None]:
train[columns_to_apply] = train[columns_to_apply].apply(lambda x: x.str.lower())
train[columns_to_apply] = train[columns_to_apply].apply(lambda x: x.apply(remove_html_tags_func))
train[columns_to_apply] = train[columns_to_apply].apply(lambda x: x.apply(remove_url_func))
train[columns_to_apply] = train[columns_to_apply].apply(lambda x: x.apply(remove_accented_chars_func))
train[columns_to_apply] = train[columns_to_apply].apply(lambda x: x.apply(remove_punctuation_func))
train[columns_to_apply] = train[columns_to_apply].apply(lambda x: x.apply(remove_irr_char_func))
train[columns_to_apply] = train[columns_to_apply].apply(lambda x: x.apply(remove_extra_whitespaces_func))
train['facts'] = train['facts'].apply(lambda x: x.replace('cls', 'CLS').replace('sep', 'SEP'))

In [None]:
test[columns_to_apply] = test[columns_to_apply].apply(lambda x: x.str.lower())
test[columns_to_apply] = test[columns_to_apply].apply(lambda x: x.apply(remove_html_tags_func))
test[columns_to_apply] = test[columns_to_apply].apply(lambda x: x.apply(remove_url_func))
test[columns_to_apply] = test[columns_to_apply].apply(lambda x: x.apply(remove_accented_chars_func))
test[columns_to_apply] = test[columns_to_apply].apply(lambda x: x.apply(remove_punctuation_func))
test[columns_to_apply] = test[columns_to_apply].apply(lambda x: x.apply(remove_irr_char_func))
test[columns_to_apply] = test[columns_to_apply].apply(lambda x: x.apply(remove_extra_whitespaces_func))
test['facts'] = test['facts'].apply(lambda x: x.replace('cls', 'CLS').replace('sep', 'SEP'))

## Tokenization

In [None]:
# Turn every text cell of both splits into a list of NLTK word tokens.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(word_tokenize)
    )

## Stop Words

In [None]:
# Remove English stop words from every text cell of both splits; each cell is
# passed through remove_english_stopwords_func, which returns a string.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_english_stopwords_func)
    )

## Normalization

In [None]:
# Lemmatize (verbs, then adjectives) every text cell of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(norm_lemm_v_a_func)
    )

## Removing Single Characters

In [None]:
# Drop one-character tokens from every text cell of both splits.
for frame in (train, test):
    frame[columns_to_apply] = frame[columns_to_apply].apply(
        lambda column: column.apply(remove_single_char_func)
    )

## Statistics after Preprocessing

In [None]:
# Mean size of the train 'facts' text: characters per row, then whitespace-separated words per row.
avg_char_after_preprocessing = train['facts'].map(lambda fact: len(str(fact))).mean()
print(f'Average train facts character length (after preprocessing): {avg_char_after_preprocessing:.0f}')

avg_word_after_preprocessing = train['facts'].map(lambda fact: len(str(fact).split())).mean()
print(f'Average train facts word length (after preprocessing): {avg_word_after_preprocessing:.0f}')

In [None]:
# Mean size of the test 'facts' text: characters per row, then whitespace-separated words per row.
avg_char_after_preprocessing = test['facts'].map(lambda fact: len(str(fact))).mean()
print(f'Average test facts character length (after preprocessing): {avg_char_after_preprocessing:.0f}')

avg_word_after_preprocessing = test['facts'].map(lambda fact: len(str(fact).split())).mean()
print(f'Average test facts word length (after preprocessing): {avg_word_after_preprocessing:.0f}')

# 4. Text Exploration

## Text Exploration Functions

In [None]:
def most_common_word_func(text, n_words=25):
    """Return the ``n_words`` most frequent tokens of ``text``.

    The result is a DataFrame with columns 'Word' and 'Frequency', sorted by
    descending frequency.
    """
    fdist = FreqDist(word_tokenize(text))
    counts = pd.DataFrame({'Word': fdist.keys(),
                           'Frequency': fdist.values()})
    ranked = counts.sort_values(by='Frequency', ascending=False)
    return ranked.head(n_words)

def label_func(winner):
    """Map the winner flag to a label: 1 -> 'first_party', 0 -> 'second_party'.

    Any other value yields ``None``.
    """
    if winner == 1:
        return 'first_party'
    if winner == 0:
        return 'second_party'
    return None
    
def least_common_word_func(text, n_words=25):
    """Return the ``n_words`` least frequent tokens of ``text``.

    The result is a DataFrame with columns 'Word' and 'Frequency'; it is the
    tail of the table sorted by descending frequency.
    """
    fdist = FreqDist(word_tokenize(text))
    counts = pd.DataFrame({'Word': fdist.keys(),
                           'Frequency': fdist.values()})
    ranked = counts.sort_values(by='Frequency', ascending=False)
    return ranked.tail(n_words)

In [None]:
# Combined corpus for exploration: train without its last column, stacked on test.
# NOTE(review): dropping the last column presumably removes the label so the
# columns line up with test — confirm the column order.
df_copy = pd.concat([train.iloc[:,:-1],test]).reset_index(drop=True)
# Independent copies so the exploration below does not modify train / test.
train_copy = train.copy()
test_copy = test.copy()

## Most common Words

In [None]:
# Join every facts text into one string and count its most frequent words.
text_corpus = df_copy['facts'].str.cat(sep=' ')

df_most_common_words_text_corpus = most_common_word_func(text_corpus)

# Show the top ten after leaving out the special tokens.
df_most_common_words_text_corpus[~df_most_common_words_text_corpus['Word'].isin(['SEP', 'CLS'])].head(10)

In [None]:
# Bar chart of the most common words, with the special tokens left out
# (so fewer than 25 bars may be drawn).
plt.figure(figsize=(11,7))
words_without_special_tokens = df_most_common_words_text_corpus[
    ~df_most_common_words_text_corpus['Word'].isin(['SEP', 'CLS'])
]
plt.bar(words_without_special_tokens['Word'], words_without_special_tokens['Frequency'])

plt.xticks(rotation = 45)

plt.xlabel('Most common Words')
plt.ylabel("Frequency")
plt.title("Frequency distribution of the 25 most common words")

plt.show()

## For parts of the DF

In [None]:
# Human-readable label derived from the numeric winner flag.
train_copy['Label'] = train_copy['first_party_winner'].apply(label_func)

In [None]:
# Move the last column (the 'Label' column just added) to the front.
cols = list(train_copy.columns)
cols = [cols[-1]] + cols[:-1]
train_copy = train_copy[cols]

In [None]:
# Facts texts split by which party won.
first_party = train_copy[(train_copy["Label"] == 'first_party')]['facts']
second_party = train_copy[(train_copy["Label"] == 'second_party')]['facts']

In [None]:
# One long string per label.
text_corpus_first_party = first_party.str.cat(sep=' ')
text_corpus_second_party = second_party.str.cat(sep=' ')

In [None]:
# Most frequent words per label.
df_most_common_words_text_corpus_first_party = most_common_word_func(text_corpus_first_party)
df_most_common_words_text_corpus_second_party = most_common_word_func(text_corpus_second_party)

In [None]:
# One bar chart per label of its most common words, special tokens left out.
splited_data = [
    frame[~frame['Word'].isin(['SEP', 'CLS'])]
    for frame in (
        df_most_common_words_text_corpus_first_party,
        df_most_common_words_text_corpus_second_party,
    )
]

color_list = ['green', 'red']
title_list = ['First party', 'Second party']


for word_table, bar_color, panel_title in zip(splited_data, color_list, title_list):
    plt.figure(figsize=(11,7))
    plt.bar(word_table['Word'],
            word_table['Frequency'],
            color=bar_color)
    plt.xticks(rotation = 45)
    plt.xlabel('Most common Words')
    plt.ylabel("Frequency")
    plt.title("Frequency distribution of the 25 most common words")
    plt.suptitle(panel_title, fontsize=15)
    plt.show()

# 5. Tokenize

## Data resampling

In [None]:
# Working copy of train; its parties are swapped below to create mirrored examples.
train_02 = train.copy()

In [None]:
# Mirror every case: exchange the two party columns and flip the label.
# Both columns are copied before being reassigned so the swap cannot be
# affected by a right-hand Series sharing memory with the frame.
first_party_original = train_02['first_party'].copy()
train_02['first_party'] = train_02['second_party'].copy()
train_02['second_party'] = first_party_original
train_02['first_party_winner'] = 1 - train_02['first_party_winner']

In [None]:
# Stack the original rows on top of the mirrored rows.
train_02 = pd.concat([train, train_02], ignore_index=True)

In [None]:
# Numeric part of the ID, used as the sort key below.
train_02['number'] = train_02['ID'].str.replace('TRAIN_', '').astype(int)

In [None]:
# Put each row next to its mirrored twin, ordered by label within the pair.
train_02 = train_02.sort_values(['number', 'first_party_winner']).reset_index(drop=True)

In [None]:
# The helper column is no longer needed; the result becomes the new train frame.
train = train_02.drop('number', axis=1)

In [None]:
# Renumber the IDs sequentially, zero-padded to at least four digits.
train['ID'] = 'TRAIN_' + train.index.map(lambda x: f'{x:04}')

## Special token

In [None]:
# Wrap the whole word SEP in brackets again in every text column of both splits.
for frame in (train, test):
    for column in columns_to_apply:
        frame[column] = frame[column].apply(lambda text: re.sub(r'\bSEP\b', '[SEP]', text))

In [None]:
# Final input text: "[CLS] <first_party> [SEP] <second_party> [SEP] <facts>".
train['facts'] = '[CLS] ' + train['first_party'] + ' [SEP] ' + train['second_party'] + ' [SEP] ' + train['facts']
test['facts'] = '[CLS] ' + test['first_party'] + ' [SEP] ' + test['second_party'] + ' [SEP] ' + test['facts']

## To_csv

In [None]:
# Write both processed frames to the working directory, without the index column.
train.to_csv('train_last_3.csv', index = False)
test.to_csv('test_last_3.csv', index = False)