In [4]:
import os
import requests
import time
import json
import pandas as pd
import numpy as np
import shutil
import zipfile
import random
import csv
import json
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import ast

**Base path of the target drive**

In [5]:
# Root folder (a Colab-style Google Drive path) under which every
# intermediate and output file of this notebook is stored.
BASE_PATH = "/content/drive/MyDrive/dataDir"

# One folder per entity type.
USER_PATH = f"{BASE_PATH}/users"
MANGA_PATH = f"{BASE_PATH}/manga"
ANIME_PATH = f"{BASE_PATH}/anime"

# Review and score folders live inside the matching entity folder.
ANIME_REV_PATH = f"{ANIME_PATH}/anime_rev"
MANGA_REV_PATH = f"{MANGA_PATH}/manga_rev"
ANIME_SCORES_PATH = f"{ANIME_PATH}/anime_scores"
MANGA_SCORES_PATH = f"{MANGA_PATH}/manga_scores"

# Folders for the database import files.
MONGO_PATH = f"{BASE_PATH}/mongo_csv"
NEO_PATH = f"{BASE_PATH}/neo4j_csv"

#CLUBS

Getting users from the same clubs yields clients with similar tastes, creating an interesting and connected database

**Get clubs from the first 8 pages of the website, keeping only those with more than 30 members**

In [None]:
def get_number(string):
    """Parse an integer from text that may be padded with whitespace and
    use commas as thousands separators (e.g. ``" 1,234 "`` -> ``1234``).

    Raises ValueError if what remains after stripping is not an integer.
    """
    digits = string.strip().replace(",", "")
    return int(digits)


# Ids of the clubs kept so far and the running sum of their member counts.
clubs_id = set()
possibles_users = 0
page = 1

# Visits pages 1..8 of the club listing (range(1, 9) yields 8 iterations).
for i in range(1, 9):
    print(f"\r{page}", end="")

    time.sleep(3)  # Wait 3 seconds per page
    data = requests.get(f"https://myanimelist.net/clubs.php?p={page}")
    soup = BeautifulSoup(data.text, "html.parser")
    for row in soup.find_all("tr", {"class": "table-data"}):
        members = get_number(row.find("td", {"class": "ac"}).text)
        # The club id is the value after the last "=" in the link's href.
        href = row.find("a", {"class": "fw-b"}).get("href")
        club_id = get_number(href.split("=")[-1])
        # Only save groups with more than 30 members, each one once.
        if club_id in clubs_id or members <= 30:
            continue
        possibles_users += members
        clubs_id.add(club_id)

    page += 1
    if possibles_users > 1000000:  # Threshold to stop
        break

# Persist the selected ids, one per line.
with open(f"{BASE_PATH}/clubs.txt", "w") as file:
    file.writelines(f"{club}\n" for club in clubs_id)

***Creating and opening file to retrieve usernames***

In [None]:
# Make sure both state files exist (empty) so the loading cell can open them
# on a first run; existing files are left untouched.
for state_name in ("users_list.txt", "_revised_clubs.txt"):
    state_path = f"{BASE_PATH}/{state_name}"
    if not os.path.exists(state_path):
        with open(state_path, "w", encoding="UTF-8") as file:
            pass

In [None]:
# Club ids written by the club-scraping cell; kept as stripped strings.
with open(f"{BASE_PATH}/clubs.txt") as file:
    clubs_id = [line.strip() for line in file]

# Usernames collected so far.
with open(f"{BASE_PATH}/users_list.txt", encoding="UTF-8") as file:
    users = {line.strip() for line in file}

# Ids of the clubs whose members were already fetched (parsed as ints,
# unlike clubs_id above).
with open(f"{BASE_PATH}/_revised_clubs.txt", encoding="UTF-8") as file:
    revised_clubs = {int(line.strip()) for line in file}

# Displayed as the cell output.
len(users), len(revised_clubs), len(clubs_id)

##USERS

**Retrieving usernames in clubs with JikanAPI**

In [None]:
# For every club not yet processed, page through its member list on Jikan and
# append each new username to users_list.txt. A club is recorded in
# _revised_clubs.txt once its pages are exhausted, which makes the cell
# resumable after an interruption.
for i, club_id in enumerate(clubs_id):
    # clubs_id may hold strings (as read from clubs.txt) while revised_clubs
    # holds ints; normalise so finished clubs are really skipped on resume.
    club_id = int(club_id)
    if club_id in revised_clubs:
        continue

    page = 1
    while True:
        print(f"\r{i+1}/{len(clubs_id)} --> {str(page).zfill(2)}", end="")
        link = f"https://api.jikan.moe/v4/clubs/{club_id}/members"

        try:
            time.sleep(0.5)
            data = requests.get(link, f"page={page}", timeout=15)
        except requests.exceptions.RequestException:
            # Network problem: wait 2 min and try the same page again.
            # KeyboardInterrupt is not caught here, so the cell can be stopped.
            time.sleep(120)
            continue

        if data.status_code == 429:
            # Rate limited: back off and retry instead of treating the club
            # as finished (which would silently lose its members).
            time.sleep(120)
            continue

        if data.status_code != 200:
            break

        members = json.loads(data.text)["data"]
        if not members:
            # An empty page means there is nothing left to read.
            break

        with open(f"{BASE_PATH}/users_list.txt", "a", encoding="UTF-8") as file:
            for member in members:
                user = member["username"]
                if user not in users and user != "":
                    file.write(f"{user}\n")
                    users.add(user)
        page += 1

    revised_clubs.add(club_id)
    with open(f"{BASE_PATH}/_revised_clubs.txt", "a", encoding="UTF-8") as file:
        file.write(f"{club_id}\n")

***Shuffling usernames and assigning them new ids, creating users.csv***

In [None]:
# Build users.csv: every distinct username gets a new sequential id in random
# order.
# NOTE: re-running this cell reshuffles the users and therefore changes the
# ids; anything already stored per user_id would no longer match.
with open(f"{BASE_PATH}/users_list.txt", encoding="UTF-8") as file:
    # Drop blank lines explicitly. The previous `list(set(...))[1:]` removed
    # an arbitrary username because a set has no defined order.
    # Sorting first makes the shuffle reproducible when `random` is seeded.
    users = sorted({line.strip() for line in file} - {""})
random.shuffle(users)

with open(f"{BASE_PATH}/users.csv", "w", encoding="UTF-8") as file:
    file.write("user_id,username\n")
    for i, user in enumerate(users):
        file.write(f"{i},{user}\n")

#MANGA

**Using MAL API**




## **For each Username retrieving his manga**
gotten data: manga_id, manga_title, status, chapters_read, score

In [None]:
# Load (user_id, username) pairs from users.csv, skipping the header row.
with open(f"{BASE_PATH}/users.csv", "r", encoding="UTF-8") as file:
    file.readline()
    users = []
    for line in file:
        fields = line.strip().split(",")
        users.append((int(fields[0]), fields[1]))

# Resume point: the last user id recorded in the manga progress file,
# or -1 when the file does not exist yet.
last_revised_users = -1
manga_progress_file = f"{BASE_PATH}/_manga_revised_users.txt"
if os.path.exists(manga_progress_file):
    with open(manga_progress_file, "r", encoding="UTF-8") as file:
        last_revised_users = int(file.readlines()[-1])

# Displayed as the cell output.
len(users), last_revised_users

In [None]:
# Download every user's manga list from the MAL API (300 entries per request)
# and store it as users/user_<id>/user_<id>_manga_list.csv. Each processed
# user id is appended to _manga_revised_users.txt so the cell can resume.
header = {
    # The client id is taken from the MAL_CLIENT_ID environment variable so it
    # does not have to be saved in the notebook; it falls back to the empty
    # placeholder used before.
    'X-MAL-CLIENT-ID': os.environ.get('MAL_CLIENT_ID', ''),
}
for i, (user_id, username) in enumerate(users):
    if user_id <= last_revised_users:
        continue

    now = datetime.now()
    print(f'\r{str(now).split(".")[0]} --> {i+1}/{len(users)}', end="")
    offset = 0
    all_manga = []

    while True:
        link = f"https://api.myanimelist.net/v2/users/{username}/mangalist?offset={offset}&fields=list_status&limit=300"
        try:
            time.sleep(0.4)
            data = requests.get(link, headers=header, timeout=15)
        except requests.exceptions.RequestException:
            # Network problem: wait 2 min and retry the same offset.
            # KeyboardInterrupt is not caught, so the cell can be stopped.
            time.sleep(120)
            continue

        # Any non-200 answer ends the download for this user; entries
        # collected so far are still written below.
        if data.status_code != 200:
            break

        payload = json.loads(data.text)["data"]
        for manga in payload:
            node = manga["node"]
            list_status = manga["list_status"]
            all_manga.append((
                node["id"],
                node["title"].replace(',', '..'),  # keep the CSV one-comma-per-column
                list_status["status"],
                list_status["num_chapters_read"],
                list_status["score"],
            ))

        offset += 300
        if len(payload) < 300:  # a short page is the last page
            break

    if all_manga:
        user_dir = f"{USER_PATH}/user_{user_id}"
        os.makedirs(user_dir, exist_ok=True)
        # Explicit UTF-8: titles are not ASCII-only and pandas reads these
        # files as UTF-8 by default.
        with open(f"{user_dir}/user_{user_id}_manga_list.csv", "w", encoding="UTF-8") as f1:
            f1.write("manga_id,manga_title,status,read_chapters,score\n")
            for manga_id, manga_title, status, read_chapters, score in all_manga:
                f1.write(
                    f"{manga_id},{manga_title},{status},{read_chapters},{score}\n"
                )

    last_revised_users = user_id
    with open(f"{BASE_PATH}/_manga_revised_users.txt", "a", encoding="UTF-8") as file:
        file.write(f"{user_id}\n")

**Saving users for whom I have actually retrieved the lists**

In [None]:
# Reload (user_id, username) pairs from users.csv, skipping the header row.
with open(f"{BASE_PATH}/users.csv", "r", encoding="UTF-8") as file:
    file.readline()
    users = []
    for line in file:
        fields = line.strip().split(",")
        users.append((int(fields[0]), fields[1]))

# Last user id recorded in the manga progress file (-1 if there is none).
last_revised_users = -1
manga_progress_file = f"{BASE_PATH}/_manga_revised_users.txt"
if os.path.exists(manga_progress_file):
    with open(manga_progress_file, "r", encoding="UTF-8") as f1:
        last_revised_users = int(f1.readlines()[-1])

# Write (without a header row) every user whose id is not beyond that point.
with open(f'{BASE_PATH}/manga_users.csv', 'w') as f2:
    for user_id, username in users:
        if user_id > last_revised_users:
            continue
        f2.write(f'{user_id},{username}\n')
        print(user_id)

##Creating manga list based on users lists content

In [None]:
# Collect the distinct manga ids (first CSV column, kept as strings) found in
# every user's *manga_list.csv file.
unique_manga = set()  # a set only keeps unique values
folders = os.listdir(USER_PATH)
for i, user_dir in enumerate(folders):
    folder_content = os.listdir(f"{USER_PATH}/{user_dir}")
    for j, user_file in enumerate(folder_content):
        if "manga_list.csv" in user_file:
            print(f"\r{i + 1}/{len(folders)} -> file {j+1}", end="")
            with open(f"{USER_PATH}/{user_dir}/{user_file}", "r") as file:
                file.readline()  # skip the header row
                unique_manga.update(
                    line.strip().split(",")[0] for line in file
                )

print("         ")
print(len(unique_manga))

In [None]:
# Create the progress file on first use, then load the ids (as ints) of the
# manga whose details were already requested.
revised_manga_file = f"{BASE_PATH}/_revised_manga.txt"
if not os.path.exists(revised_manga_file):
    open(revised_manga_file, "w", encoding="UTF-8").close()

with open(revised_manga_file, encoding="UTF-8") as file:
    revised_manga = {int(line.strip()) for line in file}

# Displayed as the cell output.
len(unique_manga), len(revised_manga)

In [None]:
def get_manga_detail(text_data, field):
  """Extract one field of a MAL manga-details response as a CSV-safe value.

  Args:
    text_data: the response body as JSON text, or the already parsed dict
      (accepted so callers do not have to re-parse the body for every field).
    field: name of the top-level field to extract.

  Returns:
    - "genres": genre names joined with "-".
    - "authors": one "<first_name>+<last_name>" entry per author, joined with
      "-"; commas inside names become "....".
    - "title" / "synopsis": the text with commas replaced by "...."; in the
      synopsis every line break ("\\r\\n", "\\r" or "\\n") becomes "__".
    - any other field: the raw value.
    - "" when the field is missing, or when title/synopsis is null.
  """
  if isinstance(text_data, (str, bytes, bytearray)):
    content = json.loads(text_data)
  else:
    content = text_data

  if field not in content:
    return ""
  value = content[field]

  if field == "genres":
    return "-".join(genre['name'] for genre in value)

  if field == "authors":
    author_list = []
    for author in value:
      node = author['node']
      name = ""
      if "first_name" in node:
        name = node['first_name'] or ""
      if "last_name" in node:
        name = name + "+" + (node['last_name'] or "")
      # Escape commas like every other free-text column, otherwise a name
      # containing one would shift the CSV columns.
      author_list.append(name.replace(",", "...."))
    return "-".join(author_list)

  if field == "title" or field == "synopsis":
    if value is None:
      return ""
    comma_removed_field = value.replace(",", "....")
    if field == "synopsis":
      # A bare "\r" is also a line terminator for text-mode readers and for
      # pandas, so normalise it before flattening the line breaks.
      comma_removed_field = (
          comma_removed_field.replace("\r\n", "\n")
          .replace("\r", "\n")
          .replace("\n", "__")
      )
    return comma_removed_field

  return value



Getting manga details for each manga found in users lists

In [None]:
# Request the details of every manga found in the user lists and append one
# row per manga to manga/manga_list.csv. Each processed id is appended to
# _revised_manga.txt so the cell can resume.
header = {
    # Client id from the MAL_CLIENT_ID environment variable; falls back to the
    # empty placeholder used before.
    'X-MAL-CLIENT-ID': os.environ.get('MAL_CLIENT_ID', ''),
}
# get_fields is what is requested from the API; used_fields are the matching
# keys read back from the response (same order as the CSV columns).
get_fields = ["title", "status", "media_type", "start_date", "num_chapters", "authors{first_name, last_name}", "genres", "synopsis", "rank", "mean"]
used_fields = ["title", "status", "media_type", "start_date", "num_chapters", "authors", "genres", "synopsis", "rank", "mean"]
get_query = f"?fields={','.join(get_fields)}"

# The output folder is not created anywhere else before this cell writes to it.
os.makedirs(MANGA_PATH, exist_ok=True)
manga_list_file = f"{MANGA_PATH}/manga_list.csv"

skipped = 0
for i, manga_id in enumerate(unique_manga):
    if int(manga_id) in revised_manga:
        skipped += 1
        continue

    now = datetime.now()
    print(f'\r{str(now).split(".")[0]} --> {i+1}/{len(unique_manga)} ({manga_id}), skip = {skipped}', end="")
    all_details = []

    # Loops only to retry after a network error; a received answer ends it.
    while True:
        link = f"https://api.myanimelist.net/v2/manga/{manga_id}{get_query}"
        try:
            time.sleep(0.5)
            data = requests.get(link, headers=header, timeout=5)
        except requests.exceptions.RequestException:
            # Network problem: wait 1.5 min and try again.
            time.sleep(90)
            continue

        if data.status_code != 200:
            break

        for field in used_fields:
            all_details.append(get_manga_detail(data.text, field))
        break

    if all_details:
        manga_row = ",".join(str(det) for det in all_details)
        # Explicit UTF-8: titles and synopses are not ASCII-only and the file
        # is read back as UTF-8 later in the notebook.
        if not os.path.exists(manga_list_file):
            with open(manga_list_file, "w", encoding="UTF-8") as f1:
                f1.write("manga_id,manga_title,status,media_type,start_date,num_chapters,authors,genres,synopsis,rank,mean\n")
        with open(manga_list_file, "a", encoding="UTF-8") as f2:
            f2.write(f"{manga_id},{manga_row}\n")

    # revised_manga holds ints, so store the id in the same type.
    revised_manga.add(int(manga_id))
    with open(f"{BASE_PATH}/_revised_manga.txt", "a", encoding="UTF-8") as file:
        file.write(f"{manga_id}\n")

In [None]:
# Sanity check: count exact duplicate lines in manga_list.csv.
with open(f"{MANGA_PATH}/manga_list.csv", "r") as f:
    righe = f.readlines()

# Going through a set removes duplicates but does not preserve line order
# (the header line is just one more element).
righe_senza_duplicati = list(set(righe))
# Displayed as the cell output: (total lines, distinct lines, duplicates).
len(righe), len(righe_senza_duplicati), len(righe) - len(righe_senza_duplicati)

## Write the unique lines to a new file
# NOTE(review): the disabled code below would overwrite _revised_manga.txt
# (the progress file of manga ids) with CSV rows; the target path looks wrong.
# Confirm before re-enabling.
#with open(f"{BASE_PATH}/_revised_manga.txt", "w") as f:
#    f.writelines(righe_senza_duplicati)

#ANIME

## **For each Username retrieving his anime lists**
gotten data: anime_id, anime_title, status, episodes_watched, score

In [None]:
# Load (user_id, username) pairs from users.csv, skipping the header row.
with open(f"{BASE_PATH}/users.csv", "r", encoding="UTF-8") as file:
    file.readline()
    users = []
    for line in file:
        fields = line.strip().split(",")
        users.append((int(fields[0]), fields[1]))

# Resume point of the anime download (-1 when no progress file exists).
last_revised_users = -1
anime_progress_file = f"{BASE_PATH}/_anime_revised_users.txt"
if os.path.exists(anime_progress_file):
    with open(anime_progress_file, "r", encoding="UTF-8") as file:
        last_revised_users = int(file.readlines()[-1])

# How far the manga download got (-1 when no progress file exists).
last_revised_manga_users = -1
manga_progress_file = f"{BASE_PATH}/_manga_revised_users.txt"
if os.path.exists(manga_progress_file):
    with open(manga_progress_file, "r", encoding="UTF-8") as file:
        last_revised_manga_users = int(file.readlines()[-1])

# Displayed as the cell output.
len(users), last_revised_users, last_revised_manga_users

In [None]:
# Download every user's anime list from the MAL API (300 entries per request)
# and store it as users/user_<id>/user_<id>_anime_list.csv. Each processed
# user id is appended to _anime_revised_users.txt so the cell can resume.
header = {
    # The client id is taken from the MAL_CLIENT_ID environment variable so it
    # does not have to be saved in the notebook; it falls back to the empty
    # placeholder used before.
    'X-MAL-CLIENT-ID': os.environ.get('MAL_CLIENT_ID', ''),
}
for i, (user_id, username) in enumerate(users):
    if user_id <= last_revised_users:
        continue
    # Stop once 1000 ids past the point reached by the manga download.
    if user_id > (last_revised_manga_users + 1000):
        break

    now = datetime.now()
    print(f'\r{str(now).split(".")[0]} --> {i+1}/{len(users)}', end="")
    offset = 0
    all_anime = []

    while True:
        link = f"https://api.myanimelist.net/v2/users/{username}/animelist?offset={offset}&fields=list_status&limit=300"
        try:
            time.sleep(0.4)
            data = requests.get(link, headers=header, timeout=15)
        except requests.exceptions.RequestException:
            # Network problem: wait 1.5 min and retry the same offset.
            # KeyboardInterrupt is not caught, so the cell can be stopped.
            time.sleep(90)
            continue

        # Any non-200 answer ends the download for this user; entries
        # collected so far are still written below.
        if data.status_code != 200:
            break

        payload = json.loads(data.text)["data"]
        for anime in payload:
            node = anime["node"]
            list_status = anime["list_status"]
            all_anime.append((
                node["id"],
                node["title"].replace(',', '..'),  # keep the CSV one-comma-per-column
                list_status["status"],
                list_status["num_episodes_watched"],
                list_status["score"],
            ))

        offset += 300
        if len(payload) < 300:  # a short page is the last page
            break

    if all_anime:
        user_dir = f"{USER_PATH}/user_{user_id}"
        os.makedirs(user_dir, exist_ok=True)
        # Explicit UTF-8: titles are not ASCII-only and pandas reads CSV
        # files as UTF-8 by default.
        with open(f"{user_dir}/user_{user_id}_anime_list.csv", "w", encoding="UTF-8") as f1:
            f1.write("anime_id,anime_title,status,watched_episodes,score\n")
            for anime_id, anime_title, status, watched_episodes, score in all_anime:
                f1.write(
                    f"{anime_id},{anime_title},{status},{watched_episodes},{score}\n"
                )

    last_revised_users = user_id
    with open(f"{BASE_PATH}/_anime_revised_users.txt", "a", encoding="UTF-8") as file:
        file.write(f"{user_id}\n")

In [None]:
# Reload (user_id, username) pairs from users.csv, skipping the header row.
with open(f"{BASE_PATH}/users.csv", "r", encoding="UTF-8") as file:
    file.readline()
    users = []
    for line in file:
        fields = line.strip().split(",")
        users.append((int(fields[0]), fields[1]))

# Last user id recorded in the anime progress file (-1 if there is none).
last_revised_users = -1
anime_progress_file = f"{BASE_PATH}/_anime_revised_users.txt"
if os.path.exists(anime_progress_file):
    with open(anime_progress_file, "r", encoding="UTF-8") as f1:
        last_revised_users = int(f1.readlines()[-1])

# Write (without a header row) every user whose id is not beyond that point.
with open(f'{BASE_PATH}/anime_users.csv', 'w') as f2:
    for user_id, username in users:
        if user_id > last_revised_users:
            continue
        f2.write(f'{user_id},{username}\n')
        print(user_id)

##Creating anime list based on users lists content

In [None]:
# Collect the distinct anime ids (first CSV column, kept as strings) found in
# every user's *anime_list.csv file.
unique_anime = set()  # a set only keeps unique values
folders = os.listdir(USER_PATH)
for i, user_dir in enumerate(folders):
    folder_content = os.listdir(f"{USER_PATH}/{user_dir}")
    for j, user_file in enumerate(folder_content):
        if "anime_list.csv" in user_file:
            print(f"\r{i + 1}/{len(folders)} -> file {j+1}", end="")
            with open(f"{USER_PATH}/{user_dir}/{user_file}", "r") as file:
                file.readline()  # skip the header row
                unique_anime.update(
                    line.strip().split(",")[0] for line in file
                )

print("         ")
print(len(unique_anime))

In [None]:
# Create the progress file on first use, then load the ids (as ints) of the
# anime whose details were already requested.
revised_anime_file = f"{BASE_PATH}/_revised_anime.txt"
if not os.path.exists(revised_anime_file):
    open(revised_anime_file, "w", encoding="UTF-8").close()

with open(revised_anime_file, encoding="UTF-8") as file:
    revised_anime = {int(line.strip()) for line in file}

# Displayed as the cell output.
len(unique_anime), len(revised_anime)

In [None]:
def get_anime_detail(text_data, field):
  """Extract one field of a MAL anime-details response as a CSV-safe value.

  Args:
    text_data: the response body as JSON text, or the already parsed dict
      (accepted so callers do not have to re-parse the body for every field).
    field: name of the top-level field to extract.

  Returns:
    - "genres": genre names joined with "-".
    - "studios": studio names (commas replaced by "....") joined with "-".
    - "title" / "synopsis": the text with commas replaced by "...."; in the
      synopsis every line break ("\\r\\n", "\\r" or "\\n") becomes "__".
    - any other field: the raw value.
    - "" when the field is missing, or when title/synopsis is null.
  """
  if isinstance(text_data, (str, bytes, bytearray)):
    content = json.loads(text_data)
  else:
    content = text_data

  if field not in content:
    return ""
  value = content[field]

  if field == "genres":
    return "-".join(genre['name'] for genre in value)

  if field == "studios":
    return "-".join(studio['name'].replace(',', '....') for studio in value)

  if field == "title" or field == "synopsis":
    if value is None:
      return ""
    comma_removed_field = value.replace(",", "....")
    if field == "synopsis":
      # A bare "\r" is also a line terminator for text-mode readers and for
      # pandas, so normalise it before flattening the line breaks.
      comma_removed_field = (
          comma_removed_field.replace("\r\n", "\n")
          .replace("\r", "\n")
          .replace("\n", "__")
      )
    return comma_removed_field

  return value



In [None]:
# Request the details of every anime found in the user lists and append one
# row per anime to anime/anime_list.csv. Each processed id is appended to
# _revised_anime.txt so the cell can resume.
header = {
    # Client id from the MAL_CLIENT_ID environment variable; falls back to the
    # empty placeholder used before.
    'X-MAL-CLIENT-ID': os.environ.get('MAL_CLIENT_ID', ''),
}
# Requested fields; also the order of the CSV columns after anime_id.
get_fields = ["title", "status", "media_type", "start_date", "end_date", "num_episodes", "average_episode_duration", "studios", "source", "genres", "synopsis", "rank", "mean"]
get_query = f"?fields={','.join(get_fields)}"

# The output folder is not created anywhere else before this cell writes to it.
os.makedirs(ANIME_PATH, exist_ok=True)
anime_list_file = f"{ANIME_PATH}/anime_list.csv"

skipped = 0
for i, anime_id in enumerate(unique_anime):
    if int(anime_id) in revised_anime:
        skipped += 1
        continue

    now = datetime.now()
    print(f'\r{str(now).split(".")[0]} --> {i+1}/{len(unique_anime)} ({anime_id}), skip = {skipped}', end="")
    all_details = []

    # Loops only to retry after a network error; a received answer ends it.
    while True:
        link = f"https://api.myanimelist.net/v2/anime/{anime_id}{get_query}"
        try:
            time.sleep(0.5)
            data = requests.get(link, headers=header, timeout=5)
        except requests.exceptions.RequestException:
            # Network problem: wait 1.5 min and try again.
            time.sleep(90)
            continue

        if data.status_code != 200:
            break

        for field in get_fields:
            all_details.append(get_anime_detail(data.text, field))
        break

    if all_details:
        anime_row = ",".join(str(det) for det in all_details)
        # Explicit UTF-8: titles and synopses are not ASCII-only and the file
        # is read back as UTF-8 later in the notebook.
        if not os.path.exists(anime_list_file):
            with open(anime_list_file, "w", encoding="UTF-8") as f1:
                # NOTE(review): " end_date" and " genres" carry a leading
                # space in this header, so a CSV reader will see them as
                # column names with that space. Left unchanged because later
                # cells may depend on it; confirm before normalising.
                f1.write("anime_id,anime_title,status,media_type,start_date, end_date,num_episodes,average_ep_duration_sec,studios,source, genres,synopsis,rank,mean\n")
        with open(anime_list_file, "a", encoding="UTF-8") as f2:
            f2.write(f"{anime_id},{anime_row}\n")

    # revised_anime holds ints, so store the id in the same type.
    revised_anime.add(int(anime_id))
    with open(f"{BASE_PATH}/_revised_anime.txt", "a", encoding="UTF-8") as file:
        file.write(f"{anime_id}\n")

#JOINING DATA

creating a list joining users used for manga and anime retrieval

In [None]:
# manga_users.csv and anime_users.csv have no header row: every line is
# "<user_id>,<username>".
with open(f"{BASE_PATH}/manga_users.csv", "r", encoding="UTF-8") as file:
    manga_users = []
    for line in file:
        fields = line.strip().split(",")
        manga_users.append((int(fields[0]), fields[1]))
with open(f"{BASE_PATH}/anime_users.csv", "r", encoding="UTF-8") as file:
    anime_users = []
    for line in file:
        fields = line.strip().split(",")
        anime_users.append((int(fields[0]), fields[1]))

# Create the progress file on first use, then load the usernames whose
# profile was already requested.
revised_join_file = f"{BASE_PATH}/_revised_join_users.txt"
if not os.path.exists(revised_join_file):
    open(revised_join_file, "w", encoding="UTF-8").close()

with open(revised_join_file, encoding="UTF-8") as file:
    revised_join_users = {line.strip() for line in file}

print(len(revised_join_users), len(manga_users), len(anime_users))

Adding the users whose anime list was retrieved to the joined users list

In [None]:
def get_user_data(text_data, field):
  """Return one field from the "data" object of a user-profile JSON response.

  A non-null "location" has its commas replaced by "-" so it fits in one CSV
  column; every other value (including null, returned as None) is passed
  through unchanged. A field that is absent yields "".
  """
  profile = json.loads(text_data)["data"]
  if field not in profile:
    return ""
  value = profile[field]
  if field == "location" and value is not None:
    return value.replace(",", "-")
  return value

retrieving user information

In [None]:
# For every user that has a stored anime list, request the public profile
# from Jikan and append birthday, location and join date to joined_users.csv.
# Processed usernames are appended to _revised_join_users.txt so the cell can
# resume.
user_field = ["birthday", "location", "joined"]
skipped = 0
for i, (user_id, username) in enumerate(anime_users):
    if username in revised_join_users:
        skipped += 1
        continue
    # Users without a stored anime list are ignored (and not marked as done).
    if not os.path.exists(f"{USER_PATH}/user_{user_id}/user_{user_id}_anime_list.csv"):
        continue

    now = datetime.now()
    print(f'\r{str(now).split(".")[0]} --> {i+1}/{len(anime_users)} ({username}), skip = {skipped}', end="")
    all_user_data = []

    # Loops only to retry after a network error or a rate-limit answer.
    while True:
        link = f"https://api.jikan.moe/v4/users/{username}"
        try:
            time.sleep(0.5)
            data = requests.get(link, timeout=5)
        except requests.exceptions.RequestException:
            # Network problem: wait 1.5 min and try again.
            # KeyboardInterrupt is not caught, so the cell can be stopped.
            time.sleep(90)
            continue

        if data.status_code == 429:
            # Rate limited: back off and retry instead of permanently marking
            # the user as processed without any data.
            time.sleep(90)
            continue

        if data.status_code != 200:
            break

        for field in user_field:
            all_user_data.append(get_user_data(data.text, field))
        break

    if all_user_data:
        user_row = ",".join(str(det) for det in all_user_data)
        joined_file = f"{BASE_PATH}/joined_users.csv"
        # Explicit UTF-8: locations are free text and the file is read back
        # as UTF-8 later in the notebook.
        if not os.path.exists(joined_file):
            with open(joined_file, "w", encoding="UTF-8") as f1:
                f1.write("user_id,username,birthday,location,joined_at\n")
        with open(joined_file, "a", encoding="UTF-8") as f2:
            f2.write(f"{user_id},{username},{user_row}\n")

    revised_join_users.add(username)
    with open(f"{BASE_PATH}/_revised_join_users.txt", "a", encoding="UTF-8") as file:
        file.write(f"{username}\n")


#REVIEWS

In [None]:
def prepare_text(rev):
  """Flatten a review so it fits in one CSV cell on one line.

  Commas become "....", every "\\n" becomes "|" (a blank line therefore shows
  up as "||") and every "\\r" becomes a space.
  """
  # The former separate replace of "\n\n" by "||" could never match, because
  # all "\n" were already gone by then; the result is the same without it.
  return rev.replace(",", "....").replace("\n", "|").replace("\r", " ")


## ANIME REVIEWS

Recovering all users and anime retrieved

In [None]:
# Output folder for the per-anime review files.
anime_rev_dir = f"{ANIME_PATH}/anime_rev"
if not os.path.exists(anime_rev_dir):
    os.makedirs(anime_rev_dir, exist_ok=True)

# Usernames (second column) of the saved users: only their reviews are kept.
with open(f"{BASE_PATH}/joined_users.csv", "r", encoding="UTF-8") as file:
    file.readline()  # skip the header row
    all_users = [line.strip().split(",")[1] for line in file]

# (anime_id, anime_title) of every saved anime: only these are queried.
with open(f"{ANIME_PATH}/anime_list.csv", "r", encoding="UTF-8") as file:
    file.readline()  # skip the header row
    all_anime = []
    for line in file:
        fields = line.strip().split(",")
        all_anime.append((int(fields[0]), fields[1]))

# Create the progress file on first use, then load the ids already processed.
anime_rev_progress = f"{BASE_PATH}/_revised_anime_rev.txt"
if not os.path.exists(anime_rev_progress):
    open(anime_rev_progress, "w", encoding="UTF-8").close()

with open(anime_rev_progress, encoding="UTF-8") as file:
    revised_anime_rev = {int(line.strip()) for line in file}

print(len(revised_anime_rev), len(all_anime), len(all_users))

Scraping anime review for anime in our db and user registered

In [None]:
# For every saved anime, page through its reviews on Jikan and keep those
# written by saved users, one reviews_<anime_id>.csv per anime. Processed ids
# are appended to _revised_anime_rev.txt so the cell can resume.

# Set for O(1) membership tests; all_users is a list of usernames.
known_users = set(all_users)

skipped = 0
for i, (anime_id, anime_title) in enumerate(all_anime):
    if anime_id in revised_anime_rev:
        skipped += 1
        continue
    all_anime_reviews = []

    page = 1
    while True:
        print(f"\r{i+1}/{len(all_anime)} --> {str(page).zfill(2)}, anime: {anime_title}  (skipped = {skipped})", end="")
        link = f"https://api.jikan.moe/v4/anime/{anime_id}/reviews"

        try:
            time.sleep(0.6)
            data = requests.get(link, f"page={page}", timeout=15)
        except requests.exceptions.RequestException:
            # Network problem: wait 1.5 min and retry the same page.
            # KeyboardInterrupt is not caught, so the cell can be stopped.
            time.sleep(90)
            continue

        if data.status_code == 429:
            # Rate limited: back off and retry instead of marking the anime
            # as processed with missing reviews.
            time.sleep(90)
            continue

        if data.status_code != 200:
            break

        payload = json.loads(data.text)["data"]
        for review in payload:
            reviewer = review["user"]["username"]
            if reviewer in known_users:
                all_anime_reviews.append((reviewer, review["score"], prepare_text(review["review"]), review["date"]))
        # The loop treats a page with fewer than 20 reviews as the last one.
        if len(payload) < 20:
            break
        page += 1

    if all_anime_reviews:
        # Explicit UTF-8: review text is free text and pandas reads CSV
        # files as UTF-8 by default.
        with open(f"{ANIME_REV_PATH}/reviews_{anime_id}.csv", "w", encoding="UTF-8") as f1:
            f1.write("username,score,review,date\n")
            for user, score, review, date in all_anime_reviews:
                f1.write(
                    f"{user},{score},{review},{date}\n"
                )

    revised_anime_rev.add(anime_id)
    with open(f"{BASE_PATH}/_revised_anime_rev.txt", "a", encoding="UTF-8") as file:
        file.write(f"{anime_id}\n")

##MANGA REVIEWS

Recovering all users and manga scraped

In [None]:
# Output folder for the per-manga review files.
manga_rev_dir = f"{MANGA_PATH}/manga_rev"
if not os.path.exists(manga_rev_dir):
    os.makedirs(manga_rev_dir, exist_ok=True)

# Usernames (second column) of the saved users: only their reviews are kept.
with open(f"{BASE_PATH}/joined_users.csv", "r", encoding="UTF-8") as file:
    file.readline()  # skip the header row
    all_users = [line.strip().split(",")[1] for line in file]

# (manga_id, manga_title) of every saved manga: only these are queried.
with open(f"{MANGA_PATH}/manga_list.csv", "r", encoding="UTF-8") as file:
    file.readline()  # skip the header row
    all_manga = []
    for line in file:
        fields = line.strip().split(",")
        all_manga.append((int(fields[0]), fields[1]))

# Create the progress file on first use, then load the ids already processed.
manga_rev_progress = f"{BASE_PATH}/_revised_manga_rev.txt"
if not os.path.exists(manga_rev_progress):
    open(manga_rev_progress, "w", encoding="UTF-8").close()

with open(manga_rev_progress, encoding="UTF-8") as file:
    revised_manga_rev = {int(line.strip()) for line in file}

print(len(revised_manga_rev), len(all_manga), len(all_users))

Scraping manga reviews for the manga in our db, written by registered users

In [None]:
# For every saved manga, page through its reviews on Jikan and keep those
# written by saved users, one reviews_<manga_id>.csv per manga. Processed ids
# are appended to _revised_manga_rev.txt so the cell can resume.

# Set for O(1) membership tests; all_users is a list of usernames.
known_users = set(all_users)

skipped = 0
for i, (manga_id, manga_title) in enumerate(all_manga):
    if manga_id in revised_manga_rev:
        skipped += 1
        continue
    all_manga_reviews = []

    page = 1
    while True:
        print(f"\r{i+1}/{len(all_manga)} --> {str(page).zfill(2)}, manga: {manga_title}  (skipped = {skipped})", end="")
        link = f"https://api.jikan.moe/v4/manga/{manga_id}/reviews"

        try:
            time.sleep(0.6)
            data = requests.get(link, f"page={page}", timeout=15)
        except requests.exceptions.RequestException:
            # Network problem: wait 1.5 min and retry the same page.
            # KeyboardInterrupt is not caught, so the cell can be stopped.
            time.sleep(90)
            continue

        if data.status_code == 429:
            # Rate limited: back off and retry instead of marking the manga
            # as processed with missing reviews.
            time.sleep(90)
            continue

        if data.status_code != 200:
            break

        payload = json.loads(data.text)["data"]
        for review in payload:
            reviewer = review["user"]["username"]
            if reviewer in known_users:
                all_manga_reviews.append((reviewer, review["score"], prepare_text(review["review"]), review["date"]))
        # The loop treats a page with fewer than 20 reviews as the last one.
        if len(payload) < 20:
            break
        page += 1

    if all_manga_reviews:
        # Explicit UTF-8: review text is free text and pandas reads CSV
        # files as UTF-8 by default.
        with open(f"{MANGA_REV_PATH}/reviews_{manga_id}.csv", "w", encoding="UTF-8") as f1:
            f1.write("username,score,review,date\n")
            for user, score, review, date in all_manga_reviews:
                f1.write(
                    f"{user},{score},{review},{date}\n"
                )

    revised_manga_rev.add(manga_id)
    with open(f"{BASE_PATH}/_revised_manga_rev.txt", "a", encoding="UTF-8") as file:
        file.write(f"{manga_id}\n")

#CREATING MONGO DB IMPORT FILES


Combining data previously retrieved in a format accepted by mongo import

In [4]:
def get_userId(username, users):
    """Return, as an int, the user_id of the first row of `users` whose
    " username" column equals `username`.

    NOTE(review): the lookup column name starts with a space (" username")
    while "user_id" does not; presumably this mirrors the header of the file
    `users` is loaded from. Confirm against the caller.
    Raises IndexError when no row matches.
    """
    matching_ids = users.loc[users[" username"] == username, "user_id"]
    return int(matching_ids.values[0])

def get_creation(username, users):
    """Return the join date of `username` as a datetime without timezone.

    The value is taken from the "joined_at" column of the first row of
    `users` whose " username" column matches, parsed as an ISO 8601 string;
    any UTC offset is dropped without converting the clock time.
    """
    joined_at = users.loc[users[" username"] == username, "joined_at"].values[0]
    return datetime.fromisoformat(joined_at).replace(tzinfo=None)
def gen_rand_date(start_date, end_date):
    """Return a random moment between the two datetimes (both ends included)
    as an ISO 8601 string, at whole-second steps from `start_date`.

    Raises ValueError (from random.randint) when `end_date` is earlier than
    `start_date`.
    """
    # Whole seconds available between the two dates.
    span_seconds = int((end_date - start_date).total_seconds())
    # Random offset inside that span, added to the start date.
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return (start_date + offset).isoformat()


##For MANGA Collection

Creating a file with all scores from users with no review, for each manga.
To improve performance this file can be created when retrieving user lists and skipping the next two blocks

In [None]:
# Output folder for the per-manga score files.
manga_scores_dir = f"{MANGA_PATH}/manga_scores"
if not os.path.exists(manga_scores_dir):
    os.makedirs(manga_scores_dir, exist_ok=True)

# (user_id, username) of every saved user.
with open(f"{BASE_PATH}/joined_users.csv", "r", encoding="UTF-8") as file:
    file.readline()  # skip the header row
    all_users = []
    for line in file:
        fields = line.strip().split(",")
        all_users.append((int(fields[0]), fields[1]))

# (manga_id, manga_title) of every saved manga.
with open(f"{MANGA_PATH}/manga_list.csv", "r", encoding="UTF-8") as file:
    file.readline()  # skip the header row
    all_manga = []
    for line in file:
        fields = line.strip().split(",")
        all_manga.append((int(fields[0]), fields[1]))

# Create the progress file on first use, then load the ids already processed.
manga_score_progress = f"{BASE_PATH}/_revised_manga_score.txt"
if not os.path.exists(manga_score_progress):
    open(manga_score_progress, "w", encoding="UTF-8").close()

with open(manga_score_progress, encoding="UTF-8") as file:
    revised_manga_score = {int(line.strip()) for line in file}

# Displayed as the cell output.
len(revised_manga_score), len(all_users), len(all_manga)

In [None]:
# Build one manga_<id>.csv per manga with the list entry (status, chapters
# read, score) of every saved user that has the manga in their list.
#
# Each user's CSV is read once and indexed by manga id. The earlier version
# re-read every user file for every manga (manga x users file reads). The
# files produced are the same: rows in all_users order, first matching row
# per user.

# Ids that still need a score file; only their entries are kept in memory.
pending_ids = {
    manga_id for manga_id, _ in all_manga if manga_id not in revised_manga_score
}

# manga_id -> [(user_id, username, status, read_chapters, score), ...]
scores_by_manga = {}
if pending_ids:
    for j, (user_id, username) in enumerate(all_users):
        print(f"\r users:{j+1}/{len(all_users)}", end="")

        user_file = f"{USER_PATH}/user_{user_id}/user_{user_id}_manga_list.csv"
        if not os.path.exists(user_file):
            continue

        # Columns: manga_id, manga_title, status, read_chapters, score
        user_data = pd.read_csv(user_file)
        seen = set()
        for entry_id, status, read_chapters, score in zip(
            user_data["manga_id"],
            user_data["status"],
            user_data["read_chapters"],
            user_data["score"],
        ):
            # Keep only the first row per manga (a repeated row is an anomaly).
            if entry_id not in pending_ids or entry_id in seen:
                continue
            seen.add(entry_id)
            scores_by_manga.setdefault(entry_id, []).append(
                (user_id, username, status, read_chapters, score)
            )

skipped = 0
for i, (manga_id, manga_title) in enumerate(all_manga):
    if manga_id in revised_manga_score:
        skipped += 1
        continue

    print(f"\r manga:{i+1}/{len(all_manga)}, skipped: {skipped}", end="")

    scores = [
        {
            "user_id": user_id,
            "username": username,
            "status": status,
            "read_chapters": read_chapters,
            "score": int(score),
        }
        for user_id, username, status, read_chapters, score
        in scores_by_manga.get(manga_id, [])
    ]

    if scores:
        scores_df = pd.DataFrame(scores)
        scores_df.to_csv(f"{MANGA_SCORES_PATH}/manga_{manga_id}.csv", index=False)

    revised_manga_score.add(manga_id)
    with open(f"{BASE_PATH}/_revised_manga_score.txt", "a", encoding="UTF-8") as file:
        file.write(f"{manga_id}\n")




**Creating the file for mongo import**

"_id"
"name"
"status"
"chapters"
"sumScores"
"numScores"
"genres"
"type"
"authors"
"synopsis"
"reviews"

In [None]:
def process_manga_files_with_pandas(manga_file, reviews_path, scores_path, output_file, users):
    """Build the CSV used to import the manga collection.

    The manga list is read from ``manga_file``. For each manga the entries found
    in ``reviews_<id>.csv`` (under ``reviews_path``) and ``manga_<id>.csv``
    (under ``scores_path``) are gathered into a single list, stored as a JSON
    string in a ``reviews`` column. ``numScores`` and ``sumScores`` count and sum
    the scores greater than zero. The resulting frame is written to
    ``output_file``.

    ``users`` is forwarded to the ``get_userId`` and ``get_creation`` helpers,
    which are defined elsewhere in the notebook.
    """
    manga_df = pd.read_csv(manga_file)
    for column, initial in (("numScores", 0), ("sumScores", 0), ("reviews", None)):
        manga_df[column] = initial
    total = len(manga_df)

    for index, manga_row in manga_df.iterrows():
        manga_id = manga_row["manga_id"]
        print(f"\r{index+1}/{total}, manga: {manga_id}", end="")

        # Per-manga input files: written reviews and plain list scores.
        reviews_file = os.path.join(reviews_path, f"reviews_{manga_id}.csv")
        scores_file = os.path.join(scores_path, f"manga_{manga_id}.csv")

        entries = []
        rated = 0
        score_total = 0

        # Reviews with a comment (only if the file exists). Every review is kept,
        # but only scores greater than zero feed the counters.
        if os.path.exists(reviews_file):
            for _, review_row in pd.read_csv(reviews_file).iterrows():
                entry = {
                    "userId": get_userId(review_row["username"], users),
                    "username": review_row["username"],
                    "comment": review_row.get("review", ""),
                    "score": int(review_row["score"]),
                    "timestamp": review_row["date"]
                }
                entries.append(entry)
                if entry["score"] > 0:
                    rated += 1
                    score_total += entry["score"]

        # Scores without a comment (only if the file exists). Zero scores are dropped;
        # the timestamp is generated between the user's creation date and now.
        if os.path.exists(scores_file):
            for _, score_row in pd.read_csv(scores_file).iterrows():
                value = int(score_row["score"])
                if value <= 0:
                    continue
                entries.append({
                    "userId": score_row["user_id"],
                    "username": score_row["username"],
                    "score": int(score_row["score"]),
                    "timestamp": gen_rand_date(get_creation(score_row["username"], users), datetime.now())
                })
                rated += 1
                score_total += value

        manga_df.at[index, "numScores"] = rated
        manga_df.at[index, "sumScores"] = score_total
        # The reviews are stored as a serialized JSON string.
        manga_df.at[index, "reviews"] = json.dumps(entries, ensure_ascii=False)

    manga_df.to_csv(output_file, index=False)
    print(f"File CSV creato: {output_file}")

# File locations
manga_file = f"{MANGA_PATH}/manga_list.csv"  # input CSV
reviews_path = MANGA_REV_PATH  # directory holding the review files
scores_path = MANGA_SCORES_PATH  # directory holding the score files
output_file = f"{MONGO_PATH}/manga_collection.csv"  # output CSV

users = pd.read_csv(f"{BASE_PATH}/joined_users.csv")

process_manga_files_with_pandas(
    manga_file,
    reviews_path,
    scores_path,
    output_file,
    users,
)


Cleaning the previous dataframe

In [None]:
def _manga_genre_list(value):
    """Split a dash-separated genre string into a list, removing all spaces."""
    return str(value).replace(" ", "").split("-")


def _manga_author_list(value):
    """Split a dash-separated author string; '+' stands for a space inside a name."""
    return str(value).replace(" ", "").replace("+", " ").split("-")


def _manga_status(value):
    """Map the source status onto the two values used by the collection."""
    return "COMPLETE" if value == "finished" else "ONGOING"


# Rename the columns to the collection schema and drop the ones that are not needed.
manga_coll = (
    pd.read_csv(f"{MONGO_PATH}/manga_collection.csv")
    .rename(columns={"manga_id": "_id", "manga_title": "name", "media_type": "type", "num_chapters": "chapters"})
    .drop(columns=["start_date", "rank", "mean"])
)

try:
    manga_coll["genres"] = manga_coll["genres"].map(_manga_genre_list)
    manga_coll["authors"] = manga_coll["authors"].map(_manga_author_list)
    manga_coll["status"] = manga_coll["status"].map(_manga_status)
except Exception as e:
    print(e)

# Keep only the first occurrence of each _id and overwrite the collection file.
duplicati = manga_coll[manga_coll.duplicated(subset="_id", keep="first")]
manga_coll_no_double = manga_coll.drop_duplicates(subset="_id", keep="first")
# Show the row counts: total, duplicates, after de-duplication
print(manga_coll.shape[0], duplicati.shape[0], manga_coll_no_double.shape[0])
manga_coll_no_double.to_csv(f"{MONGO_PATH}/manga_collection.csv", index=False)


##For ANIME Collection

To improve performance, these score files can instead be created while retrieving the user lists; in that case the next two blocks can be skipped.

In [None]:
# Make sure the directory for the per-anime score files exists.
anime_scores_dir = f"{ANIME_PATH}/anime_scores"
if not os.path.exists(anime_scores_dir):
    os.makedirs(anime_scores_dir, exist_ok=True)

# (user_id, username) pairs of the saved users; the first line is the CSV header.
with open(f"{BASE_PATH}/joined_users.csv", "r", encoding="UTF-8") as users_csv:
    users_csv.readline()
    user_fields = [line.strip().split(",") for line in users_csv.readlines()]
    all_users = [(int(fields[0]), fields[1]) for fields in user_fields]

# (anime_id, second column) pairs of the saved anime; the first line is the CSV header.
with open(f"{ANIME_PATH}/anime_list.csv", "r", encoding="UTF-8") as anime_csv:
    anime_csv.readline()
    anime_fields = [line.strip().split(",") for line in anime_csv.readlines()]
    all_anime = [(int(fields[0]), fields[1]) for fields in anime_fields]

# Checkpoint file listing the anime ids already processed; created empty on first run.
revised_anime_file = f"{BASE_PATH}/_revised_anime_score.txt"
if not os.path.exists(revised_anime_file):
    open(revised_anime_file, "w", encoding="UTF-8").close()

with open(revised_anime_file, encoding="UTF-8") as checkpoint:
    revised_anime_score = {int(line.strip()) for line in checkpoint.readlines()}

len(revised_anime_score), len(all_users), len(all_anime)

In [None]:
# For every anime not yet recorded in the checkpoint, collect the entry each user
# has for it in their saved anime list and write the result to one CSV per anime.
skipped = 0
total_anime = len(all_anime)
total_users = len(all_users)
for i, (anime_id, anime_title) in enumerate(all_anime):
    if anime_id in revised_anime_score:
        skipped += 1
        continue

    scores = []
    for j, (user_id, username) in enumerate(all_users):
        print(f"\r anime:{i+1}/{total_anime} x users:{j+1}/{total_users}, skipped: {skipped}", end="")

        user_file = f"{USER_PATH}/user_{user_id}/user_{user_id}_anime_list.csv"
        if not os.path.exists(user_file):
            continue

        user_data = pd.read_csv(user_file)  # columns used: anime_id, status, watched_episodes, score
        user_anime = user_data.loc[user_data["anime_id"] == anime_id]
        if user_anime.empty:
            continue

        row = user_anime.iloc[0]  # take the first row (more than one would be an anomaly)
        scores.append(
            dict(
                userId=user_id,
                username=username,
                status=row["status"],
                watched_episodes=row["watched_episodes"],
                score=int(row["score"]),
            )
        )

    # A score file is only written when at least one user has this anime in their list.
    if scores:
        scores_df = pd.DataFrame(scores)
        scores_df.to_csv(f"{ANIME_SCORES_PATH}/anime_{anime_id}.csv", index=False)

    # Record the anime as processed so an interrupted run can be resumed.
    revised_anime_score.add(anime_id)
    with open(f"{BASE_PATH}/_revised_anime_score.txt", "a", encoding="UTF-8") as checkpoint:
        checkpoint.write(f"{anime_id}\n")




**Creating the file for mongo import**

"_id"
"name"
"status"
"episodes"
"sumScores"
"numScores"
"genres"
"type"
"source"
"duration"
"studios"
"synopsis"
"reviews"

In [None]:
def process_anime_files_with_pandas(anime_file, reviews_path, scores_path, output_file, users):
    """Build the CSV used to import the anime collection.

    The anime list is read from ``anime_file``. For each anime the entries found
    in ``reviews_<id>.csv`` (under ``reviews_path``) and ``anime_<id>.csv``
    (under ``scores_path``) are gathered into a single list, stored as a JSON
    string in a ``reviews`` column. ``numScores`` and ``sumScores`` count and sum
    the scores greater than zero. The resulting frame is written to
    ``output_file``.

    ``users`` is forwarded to the ``get_userId`` and ``get_creation`` helpers,
    which are defined elsewhere in the notebook.
    """
    anime_df = pd.read_csv(anime_file)
    for column, initial in (("numScores", 0), ("sumScores", 0), ("reviews", None)):
        anime_df[column] = initial
    total = len(anime_df)

    for index, anime_row in anime_df.iterrows():
        anime_id = anime_row["anime_id"]
        print(f"\r{index+1}/{total}, anime: {anime_id}", end="")

        # Per-anime input files: written reviews and plain list scores.
        reviews_file = os.path.join(reviews_path, f"reviews_{anime_id}.csv")
        scores_file = os.path.join(scores_path, f"anime_{anime_id}.csv")

        entries = []
        rated = 0
        score_total = 0

        # Reviews with a comment (only if the file exists). Every review is kept,
        # but only scores greater than zero feed the counters.
        if os.path.exists(reviews_file):
            for _, review_row in pd.read_csv(reviews_file).iterrows():
                entry = {
                    "userId": get_userId(review_row["username"], users),
                    "username": review_row["username"],
                    "comment": review_row.get("review", ""),
                    "score": int(review_row["score"]),
                    "timestamp": review_row["date"]
                }
                entries.append(entry)
                if entry["score"] > 0:
                    rated += 1
                    score_total += entry["score"]

        # Scores without a comment (only if the file exists). Zero scores are dropped;
        # the timestamp is generated between the user's creation date and now.
        if os.path.exists(scores_file):
            for _, score_row in pd.read_csv(scores_file).iterrows():
                value = int(score_row["score"])
                if value <= 0:
                    continue
                entries.append({
                    "userId": score_row["userId"],
                    "username": score_row["username"],
                    "score": int(score_row["score"]),
                    "timestamp": gen_rand_date(get_creation(score_row["username"], users), datetime.now())
                })
                rated += 1
                score_total += value

        anime_df.at[index, "numScores"] = rated
        anime_df.at[index, "sumScores"] = score_total
        # The reviews are stored as a serialized JSON string.
        anime_df.at[index, "reviews"] = json.dumps(entries, ensure_ascii=False)

    anime_df.to_csv(output_file, index=False)
    print(f"File CSV creato: {output_file}")

# File locations
anime_file = f"{ANIME_PATH}/anime_list.csv"  # input CSV
reviews_path = ANIME_REV_PATH  # directory holding the review files
scores_path = ANIME_SCORES_PATH  # directory holding the score files
output_file = f"{MONGO_PATH}/anime_collection.csv"  # output CSV

users = pd.read_csv(f"{BASE_PATH}/joined_users.csv")

process_anime_files_with_pandas(
    anime_file,
    reviews_path,
    scores_path,
    output_file,
    users,
)


Cleaning the previous dataframe

In [None]:
def _anime_genre_list(value):
    """Split a dash-separated genre string into a list, removing all spaces."""
    return str(value).replace(" ", "").split("-")


def _anime_studio_list(value):
    """Split a dash-separated studio string; '....' stands for a space inside a name."""
    if not value:
        return None
    return str(value).replace(" ", "").replace("....", " ").split("-")


def _anime_status(value):
    """Map the source status onto the two values used by the collection."""
    return "COMPLETE" if value == "finished_airing" else "ONGOING"


# Rename the columns to the collection schema and drop the ones that are not needed.
# Some source headers carry a leading space (" genres", " end_date").
anime_coll = (
    pd.read_csv(f"{MONGO_PATH}/anime_collection.csv")
    .rename(columns={"anime_id": "_id", "anime_title": "name", " genres": "genres", "media_type": "type", "num_episodes": "episodes", "average_ep_duration_sec": "duration"})
    .drop(columns=["start_date", " end_date", "rank", "mean"])
)

try:
    anime_coll["genres"] = anime_coll["genres"].map(_anime_genre_list)
    anime_coll["studios"] = anime_coll["studios"].map(_anime_studio_list)
    anime_coll["status"] = anime_coll["status"].map(_anime_status)
except Exception as e:
    print(f"errore: {e}")

# Keep only the first occurrence of each _id and overwrite the collection file.
duplicati = anime_coll[anime_coll.duplicated(subset="_id", keep="first")]
anime_coll_no_double = anime_coll.drop_duplicates(subset="_id", keep="first")
# Show the row counts: total, duplicates, after de-duplication
print(anime_coll.shape[0], duplicati.shape[0], anime_coll_no_double.shape[0])
anime_coll_no_double.to_csv(f"{MONGO_PATH}/anime_collection.csv", index=False)


## For USER collection

"_id"
"role"
"username"
"email"
"password"
"birthDate"
"followers" [1, 2, 4, 5…],
"privacyStatus" {NOBODY, FOLLOWERS, ALL},
"createdAt"

In [273]:
def gen_rand_privacy_status():
    """Return one of "NOBODY", "FOLLOWERS" or "ALL", picked at random."""
    statuses = ("NOBODY", "FOLLOWERS", "ALL")
    return random.choice(statuses)

In [None]:
def process_users_files_with_pandas(users_file, output_file, users):
    """Build the CSV used to import the users collection.

    Reads the users from ``users_file``, renames the columns to the collection
    schema (``_id``, ``birthdate``, ``createdAt``), drops ``location`` and fills
    in generated fields for every user:

    * ``followers``: up to 50 ids sampled from ``users["user_id"]``, never
      including the user's own id;
    * ``privacyStatus``: value returned by ``gen_rand_privacy_status``;
    * ``role``: always ``"USER"``;
    * ``email``: ``<username>@gmail.com``;
    * ``password``: empty string;
    * ``birthdate``: only when missing, a date produced by ``gen_rand_date``.

    The result is written to ``output_file``.

    Parameters
    ----------
    users_file : path of the input CSV (needs the columns ``user_id``,
        `` username`` with a leading space, ``location``, ``birthday`` and
        ``joined_at``).
    output_file : path of the CSV to write.
    users : DataFrame with a ``user_id`` column to sample followers from; it is
        also forwarded to ``get_creation`` (defined elsewhere in the notebook).
    """
    users_df = pd.read_csv(users_file)
    for column in ("followers", "privacyStatus", "role", "email", "password"):
        users_df[column] = None
    users_df.rename(columns={"user_id": "_id", "birthday": "birthdate", "joined_at": "createdAt"}, inplace=True)
    users_df.drop(columns=["location"], inplace=True)

    for index, users_row in users_df.iterrows():
        users_id = users_row["_id"]
        print(f"\r{index+1}/{len(users_df)}, users: {users_id}", end="")

        # Sampling without replacement raises ValueError when more ids are requested
        # than exist, so the random follower count is capped at the population size.
        num_follower = min(random.randint(0, 50), len(users))
        follower_ids = users["user_id"].sample(num_follower).tolist()
        # A user cannot follow themselves.
        if users_id in follower_ids:
            follower_ids.remove(users_id)
        users_df.at[index, "followers"] = follower_ids

        users_df.at[index, "privacyStatus"] = gen_rand_privacy_status()
        users_df.at[index, "role"] = "USER"
        users_df.at[index, "email"] = f"{users_row[' username']}@gmail.com"
        users_df.at[index, "password"] = ""

        # Fill a missing birthdate with a generated one.
        # NOTE(review): the range used is the 10 years (365*10 days) before the value
        # returned by get_creation -- confirm this is the intended age range.
        if pd.isna(users_row["birthdate"]):
            created = get_creation(users_row[" username"], users)
            start_date = created - timedelta(days=365*10)
            users_df.at[index, "birthdate"] = gen_rand_date(start_date, created)

    users_df.to_csv(output_file, index=False)
    print(f"File CSV creato: {output_file}")

# File locations
users_file = f"{BASE_PATH}/joined_users.csv"  # input CSV
output_file = f"{MONGO_PATH}/users_collection.csv"  # output CSV

users = pd.read_csv(f"{BASE_PATH}/joined_users.csv")

process_users_files_with_pandas(
    users_file,
    output_file,
    users,
)


In [None]:
user_coll = pd.read_csv(f"{MONGO_PATH}/users_collection.csv")

# Rows whose _id already appeared earlier in the file are the duplicates.
repeated_id = user_coll.duplicated(subset="_id", keep="first")
duplicati = user_coll[repeated_id]
user_coll_no_double = user_coll[~repeated_id]
# Show the row counts: total, duplicates, after de-duplication
print(user_coll.shape[0], duplicati.shape[0], user_coll_no_double.shape[0])

## UTILS

When importing CSV files into MongoDB, some data may not be formatted correctly; the following scripts can be used to convert the affected fields to the right data types.

convert reviews.timestamp to date type and reviews.userId to string

In [None]:
# db.myCollection.updateMany(
#   {}, // Seleziona tutti i documenti. Puoi aggiungere filtri specifici se necessario.
#   [
#     {
#       $set: {
#         reviews: {
#           $cond: {
#             if: { $and: [ { $isArray: "$reviews" }, { $gt: [{ $size: "$reviews" }, 0] } ] },
#             then: {
#               $map: {
#                 input: "$reviews",
#                 as: "review",
#                 in: {
#                   $mergeObjects: [
#                     "$$review",
#                     { timestamp: { $toDate: "$$review.timestamp" },
#                       userId: { $toString: "$$review.userId"      }}
#                   ]
#                 }
#               }
#             },
#             else: "$reviews" // Lascia invariato se non è un array o è un array vuoto.
#           }
#         }
#       }
#     }
#   ]
# );

for manga

In [None]:
# db.manga.find({}).forEach(doc => {
#     if (doc.genres && typeof doc.genres === "string") {
#         db.manga.updateOne(
#             { _id: doc._id },
#             [{
#                 $set: {
#                     genres: {
#                         $function: {
#                             body: function (str) {
#                                 try {
#                                     return JSON.parse(str.replace(/'/g, '"'));
#                                 } catch (e) {
#                                     return []; // Handle invalid JSON gracefully
#                                 }
#                             },
#                             args: ["$genres"],
#                             lang: "js"
#                         }
#                     }
#                 }
#             }]
#         );
#     }
#
#     if (doc.authors && typeof doc.authors === "string") {
#         db.manga.updateOne(
#             { _id: doc._id },
#             [{
#                 $set: {
#                     authors: {
#                         $function: {
#                             body: function (str) {
#                                 try {
#                                     return JSON.parse(str.replace(/'/g, '"'));
#                                 } catch (e) {
#                                     return []; // Handle invalid JSON gracefully
#                                 }
#                             },
#                             args: ["$authors"],
#                             lang: "js"
#                         }
#                     }
#                 }
#             }]
#         );
#     }
# });

for anime

In [None]:
# db.anime.find({}).forEach(doc => {
#     if (doc.genres && typeof doc.genres === "string") {
#         db.anime.updateOne(
#             { _id: doc._id },
#             [{
#                 $set: {
#                     genres: {
#                         $function: {
#                             body: function (str) {
#                                 try {
#                                     return JSON.parse(str.replace(/'/g, '"'));
#                                 } catch (e) {
#                                     return []; // Handle invalid JSON gracefully
#                                 }
#                             },
#                             args: ["$genres"],
#                             lang: "js"
#                         }
#                     }
#                 }
#             }]
#         );
#     }
#
#     if (doc.studios && typeof doc.studios === "string") {
#         db.anime.updateOne(
#             { _id: doc._id },
#             [{
#                 $set: {
#                     studios: {
#                         $function: {
#                             body: function (str) {
#                                 try {
#                                     return JSON.parse(str.replace(/'/g, '"'));
#                                 } catch (e) {
#                                     return []; // Handle invalid JSON gracefully
#                                 }
#                             },
#                             args: ["$studios"],
#                             lang: "js"
#                         }
#                     }
#                 }
#             }]
#         );
#     }
# });

In [None]:
# db.anime.updateMany(
#     { studios: ["nan"] }, // Filtra i documenti con studios uguale a ["nan"]
#     { $set: { studios: "" } } // Aggiorna il campo studios con una stringa vuota
# );

# CREATING NEO4J IMPORT FILES

## Users nodes

id:, username:, privacyStatus:,

In [316]:
# Keep only the node attributes: id, username, privacyStatus.
user_nodes = (
    pd.read_csv(f"{MONGO_PATH}/users_collection.csv")
    .rename(columns={"_id": "id", " username": "username"})
    .drop(columns=["birthdate", "createdAt", "followers", "role", "email", "password"])
)

# The export directory is created on first use.
neo_dir = f"{NEO_PATH}"
if not os.path.exists(neo_dir):
    os.makedirs(neo_dir, exist_ok=True)

user_nodes.to_csv(f"{NEO_PATH}/user_nodes.csv", index=False)


## Manga nodes

id:, name:, status:, chapters:, genres:[]

In [312]:
# Keep only the node attributes: id, name, status, chapters, genres.
manga_nodes = (
    pd.read_csv(f"{MONGO_PATH}/manga_collection.csv")
    .rename(columns={"_id": "id"})
    .drop(columns=["type", "authors", "synopsis", "numScores", "sumScores", "reviews"])
)

manga_nodes.to_csv(f"{NEO_PATH}/manga_nodes.csv", index=False)

##Anime nodes

id:, name:, status:, episodes:, genres:[]

In [17]:
# Keep only the node attributes: id, name, status, episodes, genres.
anime_nodes = (
    pd.read_csv(f"{MONGO_PATH}/anime_collection.csv")
    .rename(columns={"_id": "id"})
    .drop(columns=["type", "studios", "synopsis", "numScores", "sumScores", "reviews", "duration", "source"])
)

anime_nodes.to_csv(f"{NEO_PATH}/anime_nodes.csv", index=False)

## Follow Relationship

Creating relationship between users

In [None]:
# Build one (follower, followed) row per follow relation from the users collection.
full_users = pd.read_csv(f"{MONGO_PATH}/users_collection.csv")
full_users.rename(columns={"_id": "followed", " username": "username", "followers": "follower"}, inplace=True)
full_users.drop(columns=["birthdate", "createdAt", "role", "email", "password", "username", "privacyStatus"], inplace=True)

# The followers were saved as the text form of a Python list; parse it back into a list.
full_users["follower"] = full_users["follower"].apply(ast.literal_eval)

# One row per follower; users with an empty follower list yield a NaN follower row.
relations = full_users.explode("follower")
relations = relations[["follower", "followed"]]
relations["relation"] = "FOLLOW"
# Fixed: the keyword was misspelled as `substet`, which made dropna raise a TypeError.
relations = relations.dropna(subset=["follower"])
relations.to_csv(f"{NEO_PATH}/follow_relationship.csv", index=False)
relations.shape[0]

## Anime Relationship

Creating relationship between users and anime, adding progress attribute

In [None]:
# Gather every user's anime list into one (animeId, progress, userId) frame.
users = pd.read_csv(f"{BASE_PATH}/joined_users.csv")
users_ids = users["user_id"].tolist()
no_anime = 0  # number of users without a saved anime list
anime_list_parts = []
for index, user_id in enumerate(users_ids):
    print(f"\r{index+1}/{len(users)}, users: {user_id}, no_anime_list: {no_anime}", end="")
    user_file = f"{USER_PATH}/user_{user_id}/user_{user_id}_anime_list.csv"
    if os.path.exists(user_file):
        user_anime_list = pd.read_csv(user_file)  # columns: anime_id, anime_title, status, watched_episodes, score
        user_anime_list["userId"] = user_id
        user_anime_list.drop(columns=["status", "anime_title", "score"], inplace=True)
        user_anime_list.rename(columns={"anime_id": "animeId", "watched_episodes": "progress"}, inplace=True)
        anime_list_parts.append(user_anime_list)
    else:
        no_anime += 1

# Concatenate once at the end: growing the frame inside the loop copies all the
# accumulated rows on every iteration. pd.concat rejects an empty list, hence the fallback.
list_anime_df = pd.concat(anime_list_parts, ignore_index=True) if anime_list_parts else pd.DataFrame()

list_anime_df.shape

Checking if all anime in the relation have been retrieved

In [None]:
# Keep only the relations that point to an anime present in the saved anime list.
all_anime = pd.read_csv(f"{ANIME_PATH}/anime_list.csv")
anime_ids = all_anime["anime_id"].tolist()
is_saved_anime = list_anime_df["animeId"].isin(anime_ids)
filtered_anime_relation = list_anime_df[is_saved_anime]
list_anime_df.shape, filtered_anime_relation.shape

In [None]:
# Persist the filtered user-anime relations for the Neo4j import.
filtered_anime_relation.to_csv(
    NEO_PATH + "/anime_relationship.csv",
    index=False,
)

Splitting the file in chunks to import in neo4j

In [None]:
# Split the anime relationship file into 13 near-equal chunks for the Neo4j import.
all_relations = pd.read_csv(f"{NEO_PATH}/anime_relationship.csv")

# Split the row positions rather than the DataFrame itself: np.array_split applied
# directly to a DataFrame depends on DataFrame.swapaxes, which pandas has deprecated.
row_chunks = np.array_split(np.arange(len(all_relations)), 13)
splitted_df = [all_relations.iloc[rows] for rows in row_chunks]

final_shape = 0
for i, df_part in enumerate(splitted_df):
    final_shape += int(df_part.shape[0])
    print(f"shape {i}: {df_part.shape[0]}")
    df_part.to_csv(f"{NEO_PATH}/anime_relationship_part_{i}.csv", index=False)

# Sanity check: the chunks together must contain every row of the original file.
if final_shape == all_relations.shape[0]:
    print("ok")
else:
    # Fixed: this branch printed the undefined name `final_size` (NameError).
    print(final_shape)
    print(all_relations.shape[0])

## Manga Relationship

Creating relationship between users and manga, adding progress attribute

In [None]:
# Gather every user's manga list into one (mangaId, progress, userId) frame.
users = pd.read_csv(f"{BASE_PATH}/joined_users.csv")
users_ids = users["user_id"].tolist()
no_manga = 0  # number of users without a saved manga list
manga_list_parts = []
for index, user_id in enumerate(users_ids):
    print(f"\r{index+1}/{len(users)}, users: {user_id}, no_manga_list: {no_manga}", end="")
    user_file = f"{USER_PATH}/user_{user_id}/user_{user_id}_manga_list.csv"
    if os.path.exists(user_file):
        user_manga_list = pd.read_csv(user_file)  # columns: manga_id, manga_title, status, read_chapters, score
        user_manga_list["userId"] = user_id
        user_manga_list.drop(columns=["status", "manga_title", "score"], inplace=True)
        user_manga_list.rename(columns={"manga_id": "mangaId", "read_chapters": "progress"}, inplace=True)
        manga_list_parts.append(user_manga_list)
    else:
        no_manga += 1

# Concatenate once at the end: growing the frame inside the loop copies all the
# accumulated rows on every iteration. pd.concat rejects an empty list, hence the fallback.
list_manga_df = pd.concat(manga_list_parts, ignore_index=True) if manga_list_parts else pd.DataFrame()

list_manga_df.shape
list_manga_df.head()

Checking if all manga in the relation have been retrieved

In [None]:
# Keep only the relations that point to a manga present in the saved manga list.
all_manga = pd.read_csv(f"{MANGA_PATH}/manga_list.csv")
manga_ids = all_manga["manga_id"].tolist()
is_saved_manga = list_manga_df["mangaId"].isin(manga_ids)
filtered_manga_relation = list_manga_df[is_saved_manga]
list_manga_df.shape, filtered_manga_relation.shape

In [17]:
# Persist the filtered user-manga relations for the Neo4j import.
filtered_manga_relation.to_csv(
    NEO_PATH + "/manga_relationship.csv",
    index=False,
)

In [None]:
# Split the manga relationship file into 2 near-equal chunks for the Neo4j import.
all_relations = pd.read_csv(f"{NEO_PATH}/manga_relationship.csv")

# Split the row positions rather than the DataFrame itself: np.array_split applied
# directly to a DataFrame depends on DataFrame.swapaxes, which pandas has deprecated.
row_chunks = np.array_split(np.arange(len(all_relations)), 2)
splitted_df = [all_relations.iloc[rows] for rows in row_chunks]

final_shape = 0
for i, df_part in enumerate(splitted_df):
    final_shape += int(df_part.shape[0])
    print(f"shape {i}: {df_part.shape[0]}")
    df_part.to_csv(f"{NEO_PATH}/manga_relationship_part_{i}.csv", index=False)

# Sanity check: the chunks together must contain every row of the original file.
if final_shape == all_relations.shape[0]:
    print("ok")
else:
    # Fixed: this branch printed the undefined name `final_size` (NameError).
    print(final_shape)
    print(all_relations.shape[0])

##Utils

for user

In [None]:
# LOAD CSV WITH HEADERS FROM "file:///user_nodes.csv" AS row
# MERGE (:User {id: toString(row.id), username: row.username, privacyStatus: row.privacyStatus});

for manga

In [None]:
# LOAD CSV WITH HEADERS FROM "file:///manga_nodes.csv" AS row
# WITH row, [x IN split(substring(replace(row.genres, "'","\""), 2, size(row.genres) - 4), "\",\"") | x] AS veroArray
# MERGE (:Manga {id: toString(row.id), name: row.name, status: row.status, chapters: toInteger(row.chapters), genres: veroArray});

for anime

In [None]:
# LOAD CSV WITH HEADERS FROM "file:///anime_nodes.csv" AS row
# WITH row, [x IN split(substring(replace(row.genres, "'","\""), 2, size(row.genres) - 4), "\",\"") | x] AS veroArray
# MERGE (:Anime {id: toString(row.id), name: row.name, status: row.status, episodes: toInteger(row.episodes), genres: veroArray});

for follows relations

In [None]:
# LOAD CSV WITH HEADERS FROM "file:///follow_relationship.csv" AS row
# MATCH (a:User {id: toString(row.follower)}), (b:User {id: toString(row.followed)})
# MERGE (a)-[:FOLLOWS]->(b);

for anime relations

In [None]:
# USING PERIODIC COMMIT 10000
# LOAD CSV WITH HEADERS FROM "file:///anime_relationship.csv" AS row
# MATCH (a:User {id: toString(row.userId)}), (b:Anime {id: toString(row.animeId)})
# MERGE (a)-[:LIST_ELEMENT{progress: toInteger(row.progress)}]->(b);

for manga relations

In [None]:
# USING PERIODIC COMMIT 1000
# LOAD CSV WITH HEADERS FROM "file:///manga_relationship.csv" AS row
# MATCH (a:User {id: toString(row.userId)}), (b:Manga {id: toString(row.mangaId)})
# MERGE (a)-[:LIST_ELEMENT{progress: toInteger(row.progress)}]->(b);