In [None]:
#imports
import config
import requests
import json
import csv
import pprint
import pandas as pd
import time
import ast
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import regex as re

# import unicodedata.category as cat
import unicodedata
import sys

# shared pretty-printer for readable dumps of nested structures when needed
pp = pprint.PrettyPrinter(indent=1)

In [None]:
# Helpers for stripping punctuation and decorative symbols from Japanese titles.

# Translation table (code point -> None) that deletes every character whose
# Unicode general category starts with 'P' (punctuation). Built once because
# it scans the whole code space.
tbl = dict.fromkeys(
    i for i in range(sys.maxunicode + 1)  # +1: range() excludes its stop value
    if unicodedata.category(chr(i)).startswith('P')
)

# Whitespace (ASCII and ideographic) and decorative symbols. None of these are
# in a punctuation category, so `tbl` does not cover them.
_DECORATIVE_CHARS = str.maketrans('', '', " \u3000♡♥☆★△→×")

# ！-～ : fullwidth forms U+FF01-U+FF5E (fullwidth letters and digits included)
# 、-〿 : CJK symbols and punctuation U+3001-U+303F
_FULLWIDTH_AND_CJK_SYMBOLS = re.compile(r'[！-～、-〿]')


def remove_punctuation(text):
    """Return ``text`` lower-cased with spaces, symbols and punctuation removed.

    Steps, in order:
      1. delete spaces and the decorative symbols in ``_DECORATIVE_CHARS``;
      2. lower-case the remainder;
      3. delete fullwidth forms (U+FF01-U+FF5E) and CJK symbols/punctuation
         (U+3001-U+303F);
      4. delete every remaining Unicode punctuation character via ``tbl``.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string (possibly empty).

    Raises
    ------
    AttributeError
        If ``text`` is not a string (e.g. ``None`` or a float NaN).
    """
    # Spaces are removed before lower() on purpose: the original did the same,
    # and lower-casing can depend on neighbouring characters.
    text = text.translate(_DECORATIVE_CHARS)
    text = text.lower()
    text = _FULLWIDTH_AND_CJK_SYMBOLS.sub("", text)
    return text.translate(tbl)

In [None]:
# Load the raw export ('anilist_raw.csv', relative to the working directory)
# into the DataFrame that the following cells reshape.
anilist = pd.read_csv('anilist_raw.csv')

In [None]:
def _two_digit(value, missing='0'):
    """Format a date component (month or day) as a zero-padded string.

    Parameters
    ----------
    value : int or None
        The component to format.
    missing : str
        Returned when ``value`` cannot be compared with an int (e.g. ``None``).

    Returns
    -------
    str
        ``'05'`` for 5, ``'12'`` for 12, ``missing`` for an absent value.
    """
    try:
        return '0' + str(value) if value < 10 else str(value)
    except TypeError:
        return missing


# The CSV stores these columns as Python-literal strings; parse each one once.
# ast.literal_eval raises on a malformed or missing cell rather than skipping it.
start_dates = anilist['startDate'].map(ast.literal_eval)
titles = anilist['title'].map(ast.literal_eval)

# making the 'synonyms' column lists in the actual dataframe
# ('startDate' and 'title' are only read here; the stored columns stay as text)
anilist['synonyms'] = anilist['synonyms'].map(ast.literal_eval)

# changing format of date data
years = [start['year'] for start in start_dates]
# NOTE(review): a missing month/day becomes the single character '0', so such
# dates look like '2020-0-0'. Kept as-is to leave existing output unchanged;
# confirm whether '00' was intended.
months = [_two_digit(start.get('month')) for start in start_dates]
days = [_two_digit(start.get('day')) for start in start_dates]

# adding new start date column
anilist['start_date'] = [
    f"{year}-{month}-{day}" for year, month, day in zip(years, months, days)
]

# adding column for year only
# float64 (NaN for a missing year) matches what the previous row-by-row
# assignment produced, so the exported files keep the same values.
anilist['year'] = pd.Series(years, index=anilist.index, dtype='float64')

# adding column for month only
anilist['month'] = months

# adding new columns for the name
anilist['name'] = titles.map(lambda title: title['romaji'])
anilist['original_name'] = titles.map(lambda title: title['native'])
anilist['common_name'] = titles.map(lambda title: title['english'])

# removing rows that are not from Japan
# astype(bool) keeps the mask boolean even when the frame is empty.
is_japanese = anilist['countryOfOrigin'].map(lambda country: 'JP' in country).astype(bool)
anilist = anilist.loc[is_japanese].copy()


In [None]:
# cleaning the japanese names

# Punctuation-free version of each show's native title. Rows without a string
# title (None / NaN) get None instead of raising inside remove_punctuation.
anilist['cleaned_name'] = [
    remove_punctuation(name) if isinstance(name, str) else None
    for name in anilist['original_name']
]

# cleaning the synonyms and updating the synonym column:
# every original synonym is kept and its cleaned form is appended after them.
# Re-running this cell appends the cleaned forms again.
anilist['synonyms'] = anilist['synonyms'].map(
    lambda synonyms: synonyms + [remove_punctuation(synonym) for synonym in synonyms]
)
    

In [None]:
# Switch the remaining camelCase column names to snake_case
snake_case_names = {
    "countryOfOrigin": "origin_country",
    "averageScore": "average_score",
    'meanScore': 'mean_score',
    'isAdult': 'is_adult',
    'id': 'anilist_id',
}
anilist = anilist.rename(columns=snake_case_names)

# Remove the raw columns that are no longer needed
anilist = anilist.drop(columns=["title", "startDate"])

In [None]:
# Preview the first rows of the reshaped frame
anilist.head()

In [None]:
# Final column layout. reindex keeps only the names listed here (any other
# column is dropped) and fills a listed-but-missing column with NaN.
reordered_columns = [
    'anilist_id',
    'name',
    'original_name',
    'common_name',
    'synonyms',
    'description',
    'origin_country',
    'average_score',
    'mean_score',
    'genres',
    'popularity',
    'favourites',
    'source',
    'is_adult',
    'start_date',
    'year',
    'month',
    'cleaned_name',
]
anilist = anilist.reindex(reordered_columns, axis='columns')

In [None]:
# Write the cleaned table to the working directory in both Excel and CSV form.
# No `index` argument is passed, so pandas' default applies and the DataFrame
# index is written as the first column of each file.
anilist.to_excel('anilist_cleaned.xlsx')
anilist.to_csv('anilist_cleaned.csv')