In [None]:
import pandas as pd
import re
import string
import nltk
import spacy

# Book title
전처리 이후 train_data['Book-Title'].nunique() >>> 205946

In [None]:
# Download the NLTK resources requested by this notebook.
# NOTE(review): "wordnet" is not referenced by any cell shown here (lemmatization
# goes through spaCy) — confirm it is still needed.
for nltk_resource in ("stopwords", "wordnet"):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords

# spaCy pipeline used by `lemmatize` to obtain token lemmas.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Read the raw training table, then take a separate two-column working frame
# for the title-cleaning steps so their intermediate columns stay off train_data.
train_data = pd.read_csv("./data/train.csv")
df = train_data[["Book-ID", "Book-Title"]].copy()

In [None]:
def to_lowercase(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

def check_special_chars(text):
    r"""Return the set of distinct ASCII punctuation characters present in ``text``.

    ``string.punctuation`` is run through ``re.escape`` before it is placed in
    the character class. Interpolated raw, its ``\]`` pair is parsed as an
    escaped ``]``, which means a literal backslash is never matched.

    Args:
        text: The string to scan.

    Returns:
        A ``set`` of the punctuation characters found (empty if there are none).
    """
    punctuation_class = "[{}]".format(re.escape(string.punctuation))
    return set(re.findall(punctuation_class, text))

def remove_special_chars(text):
    r"""Return ``text`` with every ASCII punctuation character removed.

    ``string.punctuation`` is run through ``re.escape`` before it is placed in
    the character class. Interpolated raw, its ``\]`` pair is parsed as an
    escaped ``]``, which means a literal backslash is never matched and would
    be left in the output.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without punctuation; whitespace and all other characters are
        left untouched.
    """
    punctuation_class = "[{}]".format(re.escape(string.punctuation))
    return re.sub(punctuation_class, "", text)

# Lazily-filled cache of the NLTK English stop-word set, so the corpus is read
# once instead of on every call (this function is applied row by row).
_ENGLISH_STOP_WORDS = None


def remove_stopwords(text, stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Args:
        text: The string to filter. It is split on whitespace and each token is
            compared to the stop-word collection exactly as written (no case
            folding is done here).
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used and cached after first use.

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOP_WORDS
    return " ".join(word for word in text.split() if word not in stop_words)

def lemmatize(text):
    """Return the lemma of every token in ``text``, joined by single spaces.

    The text is processed with the module-level spaCy pipeline ``nlp``.
    """
    lemmas = [token.lemma_ for token in nlp(text)]
    return " ".join(lemmas)

# Run the title-cleaning stages in order. Each entry is
# (output column, input column, transform); every stage writes its own column
# so the intermediate results remain available for inspection.
title_cleaning_steps = [
    ("Book-Title_lower", "Book-Title", to_lowercase),
    ("Book-Title_special_chars", "Book-Title_lower", check_special_chars),
    ("Book-Title_no_special_chars", "Book-Title_lower", remove_special_chars),
    ("Book-Title_no_stopwords", "Book-Title_no_special_chars", remove_stopwords),
    ("Book-Title_lemmatized", "Book-Title_no_stopwords", lemmatize),
]
for output_col, input_col, transform in title_cleaning_steps:
    df[output_col] = df[input_col].apply(transform)

In [None]:
# Copy the lemmatized titles back onto train_data under the original column name.
# The source frame is `df`, which is where the cleaning cell stores
# 'Book-Title_lemmatized'; the previously referenced `bookTitle_prePro` is not
# defined in any cell shown here and raised NameError on a fresh kernel.
# rename() returns a new frame, so no in-place edit is made on a column slice.
bookTitle_prePro_selected = df[["Book-Title_lemmatized"]].rename(
    columns={'Book-Title_lemmatized': 'Book-Title'}
)
# Overwrite train_data's 'Book-Title' values with the cleaned ones.
train_data.update(bookTitle_prePro_selected)

word2vec 임베딩으로 변환. 다만 이 작업은 전략적으로 고려했을 때, Book Node ID로 Feature matrix 정렬이 되고 난 다음이 유리함.
* 따라서 여기서 중단!

# Location

In [None]:
import pickle

# Keep only the last ', '-separated component of each location (presumably the
# country — confirm against the data). Non-string entries such as NaN are passed
# through unchanged instead of raising AttributeError on `.split`.
train_data['Location'] = train_data['Location'].apply(
    lambda x: x.split(', ')[-1].strip() if isinstance(x, str) else x
)

# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
# Only load this mapping file if it comes from a trusted source.
with open("country_name_mapping.pkl", "rb") as f:
    country_name_mapping = pickle.load(f)

# Replace each value by its entry in the mapping; values without an entry are kept as-is.
train_data['Location'] = train_data['Location'].apply(lambda x: country_name_mapping.get(x, x))

# Age, Year-Of-Publication

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the two numeric columns: fit the scaler on them, then write the
# transformed values back into the same columns.
features_to_scale = ['Age', 'Year-Of-Publication']
scaler = MinMaxScaler()
scaler.fit(train_data[features_to_scale])
train_data[features_to_scale] = scaler.transform(train_data[features_to_scale])

# Book-Author
len(train_data["Book-Author"].unique().tolist()) >>> 92,635     
지나치게 sparse

# Publisher

In [None]:
# Normalise publisher names: strip every character that is not a letter, digit
# or whitespace, lowercase the result, and trim surrounding whitespace.
# regex=True is passed explicitly because pandas 2.0 changed the default of
# Series.str.replace to a literal match, under which this pattern would never
# match and punctuation would be left in. The pattern is a raw string so that
# `\s` is not treated as an (invalid) string escape.
train_data['Publisher'] = (
    train_data['Publisher']
    .str.replace(r'[^a-zA-Z0-9\s]+', '', regex=True)
    .str.lower()
    .str.strip()
)

In [None]:
# Number of distinct values in the 'Publisher' column at this point in the notebook.
train_data['Publisher'].nunique()

In [None]:
# Row count per publisher value. Despite the name, this holds every publisher,
# not only the most frequent ones; the following cell looks at the first 50.
top_publishers = train_data['Publisher'].value_counts()

In [None]:
top_publishers.head(50)  # check for similar-looking publisher names

In [None]:
# Build a map that merges publishers sharing at least one word into one name.
# The difflib library could also be used, but its time cost is too high.

def create_publisher_map(publisher_list):
    """Map each word to the shortest publisher name that contains it.

    Every publisher name is split on whitespace. For each word, the map keeps
    the publisher with the fewest characters among those containing that word;
    on a length tie the name seen first is kept.

    Args:
        publisher_list: Iterable of publisher names. Entries that are not
            strings (e.g. NaN coming from ``Series.unique()``) are skipped
            instead of raising ``AttributeError``.

    Returns:
        dict mapping word -> representative publisher name.
    """
    publisher_map = {}
    for publisher in publisher_list:
        if not isinstance(publisher, str):
            continue  # missing values carry no words to index
        for word in publisher.split():
            current = publisher_map.get(word)
            # Strict '<' keeps the earliest name when lengths are equal.
            if current is None or len(publisher) < len(current):
                publisher_map[word] = publisher
    return publisher_map

def map_publisher(publisher, publisher_map):
    """Return the representative name for ``publisher`` according to ``publisher_map``.

    The publisher name is split on whitespace and its words are tried in order;
    the mapping of the first word found in ``publisher_map`` is returned.

    Args:
        publisher: Publisher name. Non-string values (e.g. NaN) are returned
            unchanged instead of raising ``AttributeError``.
        publisher_map: dict mapping word -> representative publisher name.

    Returns:
        The mapped publisher name, or ``publisher`` itself when none of its
        words is in the map.
    """
    if not isinstance(publisher, str):
        return publisher
    matches = (publisher_map[word] for word in publisher.split() if word in publisher_map)
    return next(matches, publisher)

In [None]:
# Build the word -> publisher lookup from the distinct publisher names, then
# relabel every row through it into a new 'Publisher_cleaned' column.
unique_publishers = train_data['Publisher'].unique()
publisher_map = create_publisher_map(unique_publishers)
train_data['Publisher_cleaned'] = train_data['Publisher'].apply(
    map_publisher, args=(publisher_map,)
)

In [None]:
# Number of distinct publisher labels after the word-based merge.
train_data['Publisher_cleaned'].nunique()

In [None]:
# Collapse every publisher label that occurs at most twice into a single
# 'Other' bucket; all remaining labels are left unchanged.
publisher_counts = train_data['Publisher_cleaned'].value_counts()
rare_publishers = publisher_counts.loc[publisher_counts <= 2].index
is_rare = train_data['Publisher_cleaned'].isin(rare_publishers)
train_data['Publisher_cleaned'] = train_data['Publisher_cleaned'].mask(is_rare, 'Other')

In [None]:
# Number of distinct publisher labels after rare ones were replaced by 'Other'.
train_data['Publisher_cleaned'].nunique()

In [None]:
train_data['Publisher_cleaned'].value_counts().head(50)  # check the cleaned-up result.

In [None]:
# Replace the raw 'Publisher' column with the consolidated one, keeping the
# original column name.
train_data = (
    train_data
    .drop(columns=["Publisher"])
    .rename(columns={'Publisher_cleaned': 'Publisher'})
)

# Done!

In [None]:
# Display the preprocessed frame for a final visual check.
train_data

In [None]:
# Write the preprocessed table to disk.
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra unnamed first column — confirm that downstream
# readers expect it.
train_data.to_csv("prepro_train_data.csv")