In [44]:
import pandas as pd
import numpy as np
import time

In [45]:
user_lists = pd.read_csv('all_user_lists_v1.csv')

In [46]:
user_lists.rename({'0':'user'}, inplace=True, axis=1)

La variable statut indique si le lecteur a complété le manga, s'il est en train de le lire, s'il est en pause ou s'il a arrêté la lecture. Cela apporte beaucoup de nuances qui ne nous seront pas forcément utiles, d'autant plus que sur une série non terminée, le statut sera 'Reading' même si l'utilisateur a lu tout ce qui est déjà sorti.
Je préfère donc conserver seulement 2 statuts: Soit l'utilisateur n'a vraiment pas accroché et le statut est **Dropped** soit l'utilisateur apprécie le manga, je vais donc modifier la colonne pour obtenir une variable binaire qui sera plus simple à traiter.

J'ai quelques valeurs manquantes sur cette variable, je vais donc retirer les lignes où le score est de 0 donc l'utilisateur n'a pas évalué le manga, et pour les lignes où il existe un score je conserve et je mets le statut comme complété.

In [47]:
def clean_users(data):
    data.drop(data[(data['status'].isna()) & (data['score']==0)].index, inplace=True)
    data.reset_index(inplace=True,drop=True)
    data.replace({'reading': 0, 'completed': 0, 'dropped': 1, 'on_hold':0, np.nan:0}, inplace=True)
    return data.convert_dtypes()

In [48]:
user_lists = clean_users(user_lists)
user_lists

Unnamed: 0,user,id,title,status,score
0,funkykg,113138,Jujutsu Kaisen,0,0
1,funkykg,23390,Shingeki no Kyojin,0,0
2,goldmen53,39883,Ansatsu Kyoushitsu,0,8
3,goldmen53,9711,Bakuman.,0,9
4,goldmen53,10010,Beelzebub,1,0
...,...,...,...,...,...
1496538,Skyeinna_,656,Vagabond,0,0
1496539,Skyeinna_,745,Pluto,0,0
1496540,Skyeinna_,1131,Freesia,0,0
1496541,Skyeinna_,7375,Kokou no Hito,0,0


In [49]:
manga_list = pd.read_csv('manga_database.csv')

In [50]:
manga_list.drop(['main_picture','url','title_english','title_japanese','title_synonyms',
                 'synopsis','background','end_date','real_start_date','real_end_date',
                 'updated_at','created_at_before','approved','sfw','favorites',
                 'members','volumes','chapters','status','authors'], axis=1, inplace=True)

In [51]:
manga_list.drop(manga_list[~((manga_list['type'] == 'manga') | (manga_list['type'] == 'one_shot'))].index, inplace=True)
manga_list.drop('type',axis=1, inplace=True)
manga_list.reset_index(inplace=True,drop=True)

Je vais maintenant retirer les mangas qui n'ont été lus/notés par très peu d'utilisateurs car ces oeuvres seront dures à comparer aux autres. Cela va réduire grandement notre jeu de données pour obtenir uniquement les oeuvres les plus populaires, car ce sont ces mêmes oeuvres qui ont le plus grand potentiel d'avoir été lues par de nombreux utilisateurs, et ce sont également généralement les oeuvres les plus aimées donc meilleures pour mon système de recommendation.

In [52]:
manga_list = manga_list[manga_list['scored_by']>5000]
manga_list.reset_index(drop=True, inplace=True)
manga_list

Unnamed: 0,manga_id,title,score,scored_by,start_date,genres,themes,demographics,serializations
0,2,Berserk,9.47,319696,1989-08-25,"['Action', 'Adventure', 'Award Winning', 'Dram...","['Gore', 'Military', 'Mythology', 'Psychologic...",['Seinen'],['Young Animal']
1,13,One Piece,9.22,355375,1997-07-22,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Shounen Jump (Weekly)']
2,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,9.30,151433,2004-01-19,"['Action', 'Adventure', 'Mystery', 'Supernatur...",['Historical'],"['Seinen', 'Shounen']",['Ultra Jump']
3,4632,Oyasumi Punpun,9.02,168459,2007-03-15,"['Drama', 'Slice of Life']",['Psychological'],['Seinen'],['Big Comic Spirits']
4,25,Fullmetal Alchemist,9.03,153151,2001-07-12,"['Action', 'Adventure', 'Award Winning', 'Dram...",['Military'],['Shounen'],['Shounen Gangan']
...,...,...,...,...,...,...,...,...,...
1213,54387,Saiteihen no Otoko,6.14,5546,2011-11-22,"['Horror', 'Mystery', 'Supernatural']",['Psychological'],['Shounen'],['Gangan Joker']
1214,110727,Gigant,5.92,8628,2017-12-08,"['Drama', 'Romance', 'Sci-Fi']",[],['Seinen'],['Big Comic Superior']
1215,45613,Pupa,5.89,7479,2011-03-12,"['Drama', 'Fantasy', 'Horror']",['Psychological'],[],['Comic Earth☆Star']
1216,117133,Samurai 8: Hachimaru Den,5.75,6935,2019-05-13,"['Action', 'Sci-Fi']",['Samurai'],['Shounen'],['Shounen Jump (Weekly)']


In [53]:
manga_list['start_date'] = manga_list['start_date'].str.slice(0,4)
manga_list.head(5)

Unnamed: 0,manga_id,title,score,scored_by,start_date,genres,themes,demographics,serializations
0,2,Berserk,9.47,319696,1989,"['Action', 'Adventure', 'Award Winning', 'Dram...","['Gore', 'Military', 'Mythology', 'Psychologic...",['Seinen'],['Young Animal']
1,13,One Piece,9.22,355375,1997,"['Action', 'Adventure', 'Fantasy']",[],['Shounen'],['Shounen Jump (Weekly)']
2,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,9.3,151433,2004,"['Action', 'Adventure', 'Mystery', 'Supernatur...",['Historical'],"['Seinen', 'Shounen']",['Ultra Jump']
3,4632,Oyasumi Punpun,9.02,168459,2007,"['Drama', 'Slice of Life']",['Psychological'],['Seinen'],['Big Comic Spirits']
4,25,Fullmetal Alchemist,9.03,153151,2001,"['Action', 'Adventure', 'Award Winning', 'Dram...",['Military'],['Shounen'],['Shounen Gangan']


On va maintenant vouloir séparer les listes des colonnes **genres**, **themes** et **demographics**.

In [54]:
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from ast import literal_eval

mlb = MultiLabelBinarizer()
ohe = OneHotEncoder()

In [55]:
for i in range(len(manga_list)):
    manga_list['genres'][i] = literal_eval(manga_list['genres'][i])
    manga_list['themes'][i] = literal_eval(manga_list['themes'][i])
    manga_list['demographics'][i] = literal_eval(manga_list['demographics'][i])
    manga_list['serializations'][i] = literal_eval(manga_list['serializations'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manga_list['genres'][i] = literal_eval(manga_list['genres'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manga_list['themes'][i] = literal_eval(manga_list['themes'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manga_list['demographics'][i] = literal_eval(manga_list['demographics'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.ht

In [57]:
genres = pd.DataFrame(mlb.fit_transform(manga_list['genres']),
                   columns=mlb.classes_,
                   index=manga_list.index)

In [58]:
themes = pd.DataFrame(mlb.fit_transform(manga_list['themes']),
                   columns=mlb.classes_,
                   index=manga_list.index)

In [59]:
demos = pd.DataFrame(mlb.fit_transform(manga_list['demographics']),
                   columns=mlb.classes_,
                   index=manga_list.index)

On peut s'interroger sur la pertinence de certaines catégories dans le cas où elles sont très peu représentées.

In [16]:
genres.sum()

Action           368
Adventure        154
Award Winning    121
Boys Love         66
Comedy           465
Drama            434
Ecchi            123
Erotica           68
Fantasy          252
Girls Love        21
Gourmet           12
Hentai             1
Horror           116
Mystery          113
Romance          477
Sci-Fi           114
Slice of Life    123
Sports            35
Supernatural     264
Suspense          35
dtype: int64

In [17]:
genres.drop('Hentai', axis=1, inplace=True)

In [18]:
themes.sum().sort_values()

Idols (Female)         1
Racing                 1
Villainess             2
Medical                3
Memoir                 4
Anthropomorphic        5
Video Game             5
Performing Arts        5
Pets                   5
CGDCT                  6
Space                  6
Magical Sex Shift      6
Showbiz                7
Combat Sports          8
Visual Arts            9
Detective              9
Reverse Harem         10
Samurai               10
Parody                11
Mecha                 11
High Stakes Game      12
Mahou Shoujo          12
Strategy Game         12
Organized Crime       12
Workplace             12
Music                 14
Crossdressing         14
Gag Humor             14
Childcare             15
Otaku Culture         16
Military              18
Iyashikei             19
Team Sports           19
Delinquents           19
Survival              20
Time Travel           21
Reincarnation         21
Vampire               25
Adult Cast            26
Love Polygon          27


In [19]:
themes.drop(['Idols (Female)','Racing','Villainess','Medical','Memoir'], axis=1, inplace=True)

In [20]:
manga_list

Unnamed: 0,manga_id,title,score,scored_by,start_date,genres,themes,demographics,serializations
0,2,Berserk,9.47,319696,1989,"[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[Seinen],[Young Animal]
1,13,One Piece,9.22,355375,1997,"[Action, Adventure, Fantasy]",[],[Shounen],[Shounen Jump (Weekly)]
2,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,9.30,151433,2004,"[Action, Adventure, Mystery, Supernatural]",[Historical],"[Seinen, Shounen]",[Ultra Jump]
3,4632,Oyasumi Punpun,9.02,168459,2007,"[Drama, Slice of Life]",[Psychological],[Seinen],[Big Comic Spirits]
4,25,Fullmetal Alchemist,9.03,153151,2001,"[Action, Adventure, Award Winning, Drama, Fant...",[Military],[Shounen],[Shounen Gangan]
...,...,...,...,...,...,...,...,...,...
1213,54387,Saiteihen no Otoko,6.14,5546,2011,"[Horror, Mystery, Supernatural]",[Psychological],[Shounen],[Gangan Joker]
1214,110727,Gigant,5.92,8628,2017,"[Drama, Romance, Sci-Fi]",[],[Seinen],[Big Comic Superior]
1215,45613,Pupa,5.89,7479,2011,"[Drama, Fantasy, Horror]",[Psychological],[],[Comic Earth☆Star]
1216,117133,Samurai 8: Hachimaru Den,5.75,6935,2019,"[Action, Sci-Fi]",[Samurai],[Shounen],[Shounen Jump (Weekly)]


In [21]:
for i in range(len(manga_list)):
    if manga_list['serializations'][i] == [] :
        manga_list['serializations'][i] = np.nan
    else:
        manga_list['serializations'][i] = manga_list['serializations'][i][0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manga_list['serializations'][i] = manga_list['serializations'][i][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manga_list['serializations'][i] = np.nan


In [22]:
manga_list

Unnamed: 0,manga_id,title,score,scored_by,start_date,genres,themes,demographics,serializations
0,2,Berserk,9.47,319696,1989,"[Action, Adventure, Award Winning, Drama, Fant...","[Gore, Military, Mythology, Psychological]",[Seinen],Young Animal
1,13,One Piece,9.22,355375,1997,"[Action, Adventure, Fantasy]",[],[Shounen],Shounen Jump (Weekly)
2,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,9.30,151433,2004,"[Action, Adventure, Mystery, Supernatural]",[Historical],"[Seinen, Shounen]",Ultra Jump
3,4632,Oyasumi Punpun,9.02,168459,2007,"[Drama, Slice of Life]",[Psychological],[Seinen],Big Comic Spirits
4,25,Fullmetal Alchemist,9.03,153151,2001,"[Action, Adventure, Award Winning, Drama, Fant...",[Military],[Shounen],Shounen Gangan
...,...,...,...,...,...,...,...,...,...
1213,54387,Saiteihen no Otoko,6.14,5546,2011,"[Horror, Mystery, Supernatural]",[Psychological],[Shounen],Gangan Joker
1214,110727,Gigant,5.92,8628,2017,"[Drama, Romance, Sci-Fi]",[],[Seinen],Big Comic Superior
1215,45613,Pupa,5.89,7479,2011,"[Drama, Fantasy, Horror]",[Psychological],[],Comic Earth☆Star
1216,117133,Samurai 8: Hachimaru Den,5.75,6935,2019,"[Action, Sci-Fi]",[Samurai],[Shounen],Shounen Jump (Weekly)


In [23]:
manga_list.drop(['genres','themes','demographics','scored_by','serializations'], axis=1, inplace=True)
manga_list = pd.concat([manga_list,genres,themes,demos], axis=1)
manga_list

Unnamed: 0,manga_id,title,score,start_date,Action,Adventure,Award Winning,Boys Love,Comedy,Drama,...,Time Travel,Vampire,Video Game,Visual Arts,Workplace,Josei,Kids,Seinen,Shoujo,Shounen
0,2,Berserk,9.47,1989,1,1,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,13,One Piece,9.22,1997,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1706,JoJo no Kimyou na Bouken Part 7: Steel Ball Run,9.30,2004,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,4632,Oyasumi Punpun,9.02,2007,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,25,Fullmetal Alchemist,9.03,2001,1,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,54387,Saiteihen no Otoko,6.14,2011,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1214,110727,Gigant,5.92,2017,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1215,45613,Pupa,5.89,2011,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1216,117133,Samurai 8: Hachimaru Den,5.75,2019,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [36]:
manga_list.drop(manga_list[manga_list.sort_values('score')['title'].duplicated(keep='last')].index,inplace=True)
manga_list.reset_index(inplace=True, drop=True)

  manga_list.drop(manga_list[manga_list.sort_values('score')['title'].duplicated(keep='last')].index,inplace=True)


On peut également maintenant retirer des listes de nos utilisateurs les mangas qui ne font pas partie du tri qu'on a effectué sur la popularité.

In [38]:
user_lists = user_lists[user_lists['title'].isin(manga_list['title'])]
user_lists.drop(user_lists[user_lists['user']=='Haelios'].index, inplace=True)
user_lists.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_lists.drop(user_lists[user_lists['user']=='Haelios'].index, inplace=True)


In [39]:
user_lists.to_csv('all_user_lists_v2.csv', index=False)

In [40]:
manga_list.to_csv('sorted_manga_list.csv', index=False)