In [1]:
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from entities import Title

In [2]:
def match_mangas(source_df, target_df, target_df_name_columns: list, target_df_name: str):
    """Match every title in ``source_df`` against the titles in ``target_df``.

    A ``Title`` is built for each source row from its ``name``,
    ``original_name`` and ``eng_name`` columns, and for each target row from
    the columns listed in ``target_df_name_columns``. A target title is a
    candidate for a source title when ``target_title == source_title``.

    The result is written into ``source_df`` **in place** (the same object is
    also returned), using these columns:

    * ``{target_df_name}_id`` -- index label of the matched target row, or
      ``None`` when nothing matched. With several candidates, the one with the
      highest ``source_title.strong_equal_names_n(candidate)`` score is used;
      when that score is tied, the first tied candidate is used.
    * ``{target_df_name}_id_1``, ``{target_df_name}_id_2``, ... -- the
      remaining tied candidates. These columns only appear if a tie occurs.
    * ``n_duplicates_{target_df_name}`` -- number of candidates sharing the
      best score when there is a tie, otherwise ``0``.

    Args:
        source_df: frame with ``name``, ``original_name`` and ``eng_name``
            columns. Rows are addressed by index label, so labels are assumed
            to be unique.
        target_df: frame holding the titles to match against.
        target_df_name_columns: names of the ``target_df`` columns that hold
            the alternative names of one title.
        target_df_name: short name of the target, used to build the output
            column names.

    Returns:
        dict with
        ``'q_matches'`` -- number of candidates found for each source row, in
        row order;
        ``'source_df'`` -- the annotated ``source_df``;
        ``'matched'`` -- one ``[source_title, candidate, ...]`` list for every
        source title that had at least one candidate.
    """
    id_column = f'{target_df_name}_id'
    duplicates_column = f'n_duplicates_{target_df_name}'

    # Source titles, paired with the index label of the row they came from.
    source_ids = source_df.index.tolist()
    gtitles = [
        Title([ru_name, an_name, en_name], meta={'index': source_id})
        for source_id, ru_name, an_name, en_name in zip(
            source_ids,
            source_df['name'].tolist(),
            source_df['original_name'].tolist(),
            source_df['eng_name'].tolist(),
        )
    ]

    # Target titles: one record of alternative names per row.
    target_names = target_df[target_df_name_columns].to_records(index=False)
    mtitles = [
        Title(list(names), meta={'index': target_id})
        for target_id, names in zip(target_df.index.tolist(), target_names)
    ]

    print('source titles n:', len(gtitles))
    print('target titles n:', len(mtitles))

    matched = []
    q_matches = []
    # Plain column assignment: `.loc[<new label>] = 0` would append a ROW.
    source_df[id_column] = None
    source_df[duplicates_column] = 0

    for source_id, gtitle in tqdm(zip(source_ids, gtitles), total=len(gtitles)):
        candidates = [mtitle for mtitle in mtitles if mtitle == gtitle]
        q_matches.append(len(candidates))

        if not candidates:
            continue
        matched.append([gtitle, *candidates])

        if len(candidates) == 1:
            # Exactly one counterpart: take it. The row is addressed by its
            # index label, not by its position in the loop.
            source_df.loc[source_id, id_column] = candidates[0].get_index()
            continue

        # Several candidates: keep only those with the best similarity score.
        scores = [gtitle.strong_equal_names_n(mtitle) for mtitle in candidates]
        best_score = max(scores)
        top_candidates = [
            mtitle for score, mtitle in zip(scores, candidates) if score == best_score
        ]

        source_df.loc[source_id, id_column] = top_candidates[0].get_index()
        if len(top_candidates) > 1:
            # Tie: the remaining top candidates go to numbered side columns.
            # Only tied candidates advance the counter, so the first one always
            # lands in the main id column regardless of its position.
            for rank, mtitle in enumerate(top_candidates[1:], start=1):
                source_df.loc[source_id, f'{id_column}_{rank}'] = mtitle.get_index()
            source_df.loc[source_id, duplicates_column] = len(top_candidates)

    return {'q_matches': q_matches,
            'source_df': source_df,
            'matched': matched}

# Match remanga

In [3]:
# Load the remanga catalogue. Its row index is what match_mangas later stores
# in the `remanga_id` column of the frames matched against it.
remanga = pd.read_csv('./data/raw/remanga_catalog_full.csv')
# Preview the first two rows.
remanga.head(2)

Unnamed: 0,rus_name,en_name,issue_year,avg_rating,total_views,total_votes,type,dir,n_chapters
0,Поднятие уровня в одиночку,Solo Leveling,2018.0,9.6,9546754,1210457,Манхва,solo-leveling,151.0
1,Начало после конца,The Beginning After the End,2018.0,9.6,6130139,1172388,Западный комикс,the_beginning_after_the_end,107.0


## With readmanga

In [4]:
# Load the readmanga catalogue (semicolon-separated file).
gmanga = pd.read_csv('./data/raw/manga.csv', sep=';')
# Preview the first two rows.
gmanga.head(2)

Unnamed: 0,id,name,eng_name,original_name,another_names,chapters_count
0,1,Ван Пис,One Piece,One Piece,Budak Getah / Большой Куш / ワンピース / Один Кусок,1028
1,2,Наруто,Naruto,Naruto,ナルト,704


In [None]:
# Match readmanga titles against remanga using remanga's Russian and English
# names, then continue with the annotated frame returned under 'source_df'.
res_gmanga_rm = match_mangas(
    source_df=gmanga,
    target_df=remanga,
    target_df_name_columns=['rus_name', 'en_name'],
    target_df_name='remanga',
)
gmanga = res_gmanga_rm['source_df']

source titles n: 20825
target titles n: 16899


HBox(children=(FloatProgress(value=0.0, max=20825.0), HTML(value='')))

In [None]:
# Save the readmanga frame with its remanga ids. index=False matches the other
# export cells: this file is re-read later without `index_col`, so a written
# index would come back as an extra unnamed column.
gmanga.to_csv('data/gmanga_matched_remanga_ids.csv', sep=';', index=False)

In [None]:
# Distribution of the number of remanga candidates found per readmanga title.
# `q_matches` only exists inside the result dict returned by match_mangas.
_ = plt.hist(res_gmanga_rm['q_matches'])

## With mintmanga

In [None]:
# Load the mintmanga catalogue (semicolon-separated file).
gmint = pd.read_csv('./data/raw/mint.csv', sep=';')
# Show the frame's dimensions and preview its first rows.
print(gmint.shape)
gmint.head()

In [None]:
# Match mintmanga titles against remanga using remanga's Russian and English
# names. The remanga frame has an `en_name` column (there is no `rn_name`).
res_gmint_rm = match_mangas(gmint, remanga, ['rus_name', 'en_name'], 'remanga')
gmint = res_gmint_rm['source_df']

In [None]:
# Save the mintmanga frame with its remanga ids (semicolon-separated, no index column).
gmint.to_csv('data/gmint_matched_remanga_ids.csv', sep=';', index=False)

In [None]:
# Distribution of the number of remanga candidates found per mintmanga title.
# `q_matches` only exists inside the result dict returned by match_mangas.
_ = plt.hist(res_gmint_rm['q_matches'])

# Match mangalib

In [None]:
# Load the mangalib title catalogue. Its row index is what match_mangas later
# stores in the `mangalib_id` column of the frames matched against it.
mangalib = pd.read_csv('./data/raw/mangalib_catalog_titles.csv')
# Show the frame's dimensions and preview its first rows.
print(mangalib.shape)
mangalib.head()

## With readmanga

In [None]:
# Reload the readmanga frame that was saved after the remanga matching step.
gmanga = pd.read_csv('data/gmanga_matched_remanga_ids.csv', sep=';')
print(gmanga.shape)

In [None]:
# create manga titles
ru_names = gmanga.name.tolist()
an_names = gmanga.original_name.tolist()
en_names = gmanga.eng_name.tolist()
ids = gmanga.index.tolist()
gtitles = [Title([ru_names[i], an_names[i], en_names[i]], meta={'index': ids[i]}) for i in range(len(ru_names))]

ru_names = mangalib.name.tolist()
ids = mangalib.index.tolist()
mtitles = [Title([ru_names[i],], meta={'index': ids[i]}) for i in range(len(ru_names))]

len(gtitles), len(mtitles)

In [None]:
# Match readmanga titles against mangalib using mangalib's single `name`
# column, then continue with the annotated frame returned under 'source_df'.
res_gmanga_mangalib = match_mangas(
    source_df=gmanga,
    target_df=mangalib,
    target_df_name_columns=['name'],
    target_df_name='mangalib',
)
gmanga = res_gmanga_mangalib['source_df']

In [None]:
# Save the readmanga frame, now carrying both remanga and mangalib ids
# (semicolon-separated, no index column).
gmanga.to_csv('./data/gmanga_matched_remanga_mangalib_ids.csv', sep=';', index=False)

In [None]:
# Distribution of the number of mangalib candidates found per readmanga title.
# `q_matches` only exists inside the result dict returned by match_mangas.
_ = plt.hist(res_gmanga_mangalib['q_matches'])

## With mintmanga

In [None]:
# Reload the mintmanga frame that was saved after the remanga matching step.
gmint = pd.read_csv('data/gmint_matched_remanga_ids.csv', sep=';')
# Preview the first two rows.
gmint.head(2)

In [None]:
# Match mintmanga titles against mangalib using mangalib's single `name`
# column, then continue with the annotated frame returned under 'source_df'.
res_gmint_mangalib = match_mangas(
    source_df=gmint,
    target_df=mangalib,
    target_df_name_columns=['name'],
    target_df_name='mangalib',
)
gmint = res_gmint_mangalib['source_df']

In [None]:
# Save the mintmanga frame, now carrying both remanga and mangalib ids
# (semicolon-separated, no index column).
gmint.to_csv('data/gmint_matched_remanga_mangalib_ids.csv', sep=';', index=False)

In [None]:
# Distribution of the number of mangalib candidates found per mintmanga title.
# Assigning to `_` keeps the return value of plt.hist out of the cell output.
_ = plt.hist(res_gmint_mangalib['q_matches'])