In [2]:
import re

import lyrics as ly
import json
from pathlib import Path
import pandas as pd

from spacy.lang.fr import French

def tokenize(chaine):
    return [token.text for token in tokenizer(chaine)]

def preclean(chaine):
    chaine = chaine.replace(u"\xa0", " ")
    chaine = chaine.replace(u"\u2009", " ")
    chaine = chaine.replace(u"\u200b", " ")
    chaine = chaine.replace(u"\u200c", " ")
    return chaine.replace(u"\u200d", " ")

def clean_word(word):
    try:
        word = word.strip()
    except:
        print(word)
        return
    word = word.strip(".,;“…’:!”?\"()[]{}«»×*")
    if re.fullmatch(r"((\\x)|(\\u)|(\\n)|(x?\d+)).*", word):
        return
    if '"-"' in word:
        return
    if word == 'à-ç':
        return
    if re.fullmatch('(-"?\w+)|(\w+"?-)', word):
        return
    if re.fullmatch(r"[^A-zÄ-ÿ]+", word):
        return
    if re.fullmatch(r"('+)|(\++)", word):
        return
    return word

def find_neo(songs):
    neologismes = set()

    for song in songs:
        if song.paroles:
            paroles = tokenize(preclean(song.paroles))
            for word in paroles:
                word = clean_word(word)
                if word:
                    if word.lower() not in lexique_ultime:
                        neologismes.add(word)

    return neologismes

def songs_and_neo(artiste):
    if isinstance(artiste, str | Path):
        artiste = ly.Artiste(artiste)
    songs = artiste.songs
    neologismes = find_neo(songs)
    genres = artiste.genres

    return songs, neologismes, genres

nlp = French()

tokenizer = nlp.tokenizer

lexiques = Path("lexiques").glob("*.json")

dict_lexiques = {
    fic.stem: set(json.load(fic.open(encoding="utf-8")))
    for fic in lexiques
}

lexique_ultime = set.union(*dict_lexiques.values())

In [3]:
artistes_files = Path("Lyrics_all").glob("*.json")
artistes_files = sorted(artistes_files)

dict_artistes = {
    e : songs_and_neo(e)
    for e in artistes_files
}

OSError: [Errno 22] Invalid argument

In [None]:
e = ly.Artiste(artistes_files[457])
print(e.genres, e.name)


In [None]:
df = pd.DataFrame(dict_artistes).T
df.to_pickle("df_artistes_raw.pkl")

In [None]:
neo = 50
fr_songs = 20

neover = df[df[1].apply(lambda x : len(x) > neo)]
frver = df[df[0].apply(lambda x : len([e for e in x if e.lang == "fr"]) > fr_songs)]

bothver = frver[frver.index.isin(neover.index)]

neover.to_pickle("df_artistes_neo.pkl")
frver.to_pickle("df_artistes_fr.pkl")
bothver.to_pickle("df_artistes_neo_fr.pkl")
bothver.to_csv("df_artistes_neo_fr.csv")
bothver.to_json("df_artistes_neo_fr.json")


In [None]:
bothver
