# Проект по NLP. Корпус англоязычных песен.

### Гордеев Никита и Шитикова Елизавета

В рамках финального проекта мы решили сделать корпус текстов песен британских и американских исполнителей.
Наша выборка включала в себя 40 исполнителей, по 10 на каждый жанр: поп, рок, хип-хоп и экспериментальная музыка.
Морфологическим парсером был выбран пакет spaCy.

In [239]:
import os
import spacy
import re
import pandas as pd
import spotipy
import lyricsgenius as genius
import requests
import warnings
import shutil
import conf
import unicodedata

from spotipy.oauth2 import SpotifyClientCredentials
from bs4 import BeautifulSoup
from spotipy import util
from lyricsgenius.utils import sanitize_filename

In [63]:
warnings.filterwarnings('ignore')

In [68]:
cid = conf.S_TOKEN
secret = conf.S_SECRET
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret) 
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

Данная функция из списка артистов получает набор дискографий.

In [135]:
def prepare(artists):
    
    strings = []
    for artist in artists:
        albums = []
        albums.append(artist.lower())
        artist = sp.search(q='artist:' + artist, type='artist')
        artist = artist['artists']['items'][0]['id']
        alb = sp.artist_albums(artist)
        for i in range(len(alb['items'])):
            a = alb['items'][i]['name'].lower()
            b = re.search('(.*)(?: \(.*| -.*|:.*)(?:instrumentals|edition|complete|reissue|remastered|japan|anniversary|special|deluxe)', a)
            c = re.search('(?:(.*)(?: \(.*| -.*|:.*)(?:live|acoustic|soundtrack|remixes|mixes)|mtv|itunes|live (?:from|at|in)|bbc|radio|stadium|karaoke|[a-z]-sides|audio|version|session)', a)
            if b != None:
                a = b.group(1)
            if alb['items'][i]['album_group'] == 'album' and a not in albums[1:] and c == None:
                albums.append(a)
        strings.append(albums)
        
    return strings

In [None]:
def make_api():
    
    g_token = conf.G_TOKEN
    api = genius.Genius(g_token)
    genius.remove_section_headers = True
    genius.excluded_terms = ["(Remix)", "(Live)", "Remix", "mix", "Edition", 
                             '(Traducción al Español)', '(Tradução em Português)', 
                             '(Deutsche Übersetzung)', '(Turkish Translation)', 
                             '(Traduction Française)', '(Türkçe Çeviri)', 
                             '(Traduzione Italiana)', '(Český Překlad)', 
                             '(Dansk Oversættelse)', '(Traducerea Românească)',
                             '([0-9a-zA-Z -]* Remix)', '(Demo)']
    genius.skip_non_songs = True
    
    return api

In [None]:
def make_li():
    
    with open('artists and albums.txt', encoding='utf-8') as f:
            text = f.read()
        li = re.findall('([a-zA-Z0-9\- ]*)\n', text)
    return li

Следующие три функции собирают и проверяют ссылки на странички альбомов на портале Genius.

In [76]:
def link_for_genius(name):
    
    name = re.sub('[^A-Za-z0-9]+', '-', name)
    if name.endswith('-'):
        name = name[:-1]
    if name.startswith('-'):
        name = name[1:]
        
    return name

In [77]:
def lp_list(strings):

    for i in range(len(strings)):
        for j in range(len(strings[i])):
            strings[i][j] = link_for_genius(strings[i][j])
    st = []
    for i in strings:
        s = ' '.join(i)
        st.append(s)
    st = '\n'.join(st)
    
    with open('artists and albums.txt', 'w', encoding='utf-8') as f:
        f.write(st)

In [218]:
def check_links(li):
    
    for i in range(len(li)):
        artist = li[i].split()
        for alb in range(1, len(artist)):
            album = artist[alb]
            link = 'https://genius.com/albums/' + artist[0] + '/' + album
            result = requests.get(link)
            html = result.text
            if re.search('Oops', html):
                print(album, '!!!!!!!!!!!!!!!!NOT OK!!!!!!!!!!!!!!!!')
                al = ' ' + album
                li[i] = li[i].replace(al, '') 
            else:
                print(album,'OK')
                
    return li

Следующая функция выкачивает тексты песен и формирует из них нашу выборку.

In [317]:
def prepare_database(api, li, i):
    
    terms = ["(Remix)", "(Live)", "Remix", "mix", "Edition", 
                             '(Traducción al Español)', '(Tradução em Português)', 
                             '(Deutsche Übersetzung)', '(Turkish Translation)', 
                             '(Traduction Française)', '(Türkçe Çeviri)', 
                             '(Traduzione Italiana)', '(Český Překlad)', 
                             '(Dansk Oversættelse)', '(Traducerea Românească)',
                             '([0-9a-zA-Z -]* Remix)', '(Demo)']
    
    os.makedirs("Database", exist_ok=True)
    genre = i.split('_')[0]
    for artist in li:
        artist = artist.split()
        for album in range(1, len(artist)):
            print(artist)
            album = artist[album]
            print(album)
            link = 'https://genius.com/albums/' + artist[0] + '/' + album
            result = requests.get(link)
            html = result.text
            soup = BeautifulSoup(html,'html.parser')
            for song in soup.find_all('h3', {'class': 'chart_row-content-title'}):
                song_name = song.get_text()
                song_name = unicodedata.normalize("NFKD", song_name)
                song_name = re.search('( +)(.+)(\n)', song_name).group(2)
                try:
                    song = api.search_song(song_name, artist[0])
                    song_name = sanitize_filename(song_name).replace('-', '')
                    album = album.replace('-', '')
                    art = artist[0].replace('-', '')
                    for i in terms:
                        if re.search(i, str(song)):
                            song = None
                    if song is not None:
                        name = genre + '_' + art + '_' + album + '_' + song_name
                        song.save_lyrics(extension='txt', filename= name, overwrite= True, binary_encoding=True)
                        name = name + '.txt'
                        shutil.move(name, 'Database')
                        
                except (TypeError, RuntimeError) as ex:
                    pass

In [None]:
def run(album_files):
    
    for a in album_files:
        with open(a, encoding='utf-8') as f:
            artists = f.read().splitlines()
            strings = prepare(artists)
        lp_list(strings, a)
        api, li = make_api(), make_li(a)
        prepare_database(api, li, a)

In [172]:
album_files = ['pop_albums.txt', 'rock_albums.txt', 'hiphop_albums.txt', 'experimental_albums.txt']

run(album_files)

Функция, собирающая из полученных текстов корпус в формате таблицы.

In [341]:
def make_corpus():
    
    nlp = spacy.load("en_core_web_sm")
    songs_list = os.listdir('./Database')
    pron_lemma = {"they" : ["they", "their", "them", "themselves", "theirs", 
                            "They", "Their", "Them", "Theirs", "Themselves"], 
                  "I" : ["me", "myself", "I", "my", "Me", "Myself", "My"], 
                  "he" : ["he", "his", "him", "himself", "He", "His", "Him", "Himself"], 
                  "she" : ["she", "her", "herself", "She", "Her", "Herself"], 
                  "it" : ["its", "it", "itself", "Its", "It", "Itself"], 
                  "you" : ["you", "your", "yours", "u", "You", "Your", "Yours", "U"]}
    df = []
    for file in songs_list:
        direct = './Database/' + file
        div = re.search('(.*)(_)(.*)(_)(.*)(_)(.*)(\.txt)', file)
        genre, artist, album, song = div.group(1), div.group(3), div.group(5), div.group(7)
        with open(direct, encoding='utf-8') as f:
            text = f.read()
        text = re.sub('\[.*\]', '', text)
        text = text.splitlines()
        while '' in text:
            text.remove('')
        for line in text:
            doc = nlp(line)
            for token in doc:
                l = []
                if token.lemma_.isalpha():
                    l.append(token.text)
                    l.append(token.lemma_)
                    l.append(token.pos_)
                    l.append(line)
                    l.append(song)
                    l.append(album)
                    l.append(artist)
                    l.append(genre)
                    df.append(l)
                elif token.lemma_ == '-PRON-':
                    l.append(token.text)
                    for pronoun in pron_lemma.items():
                        if token.text in pronoun[1]:
                            l.append(pronoun[0])
                    l.append(token.pos_)
                    l.append(line)
                    l.append(song)
                    l.append(album)
                    l.append(artist)
                    l.append(genre)
                    df.append(l)
        d = []
        for i in range(8):
            d.append('#')
    df.append(d)
    df = pd.DataFrame(df, 
                    columns=['word', 'lemma', 'part of speech', 'line', 'song', 'album', 'artist', 'genre'])
        
    return df

In [342]:
df = make_corpus()


In [344]:
df.to_csv('corpus.csv')

Следующие две функции непосредственно осуществляют поиск по корпусу.

Синтаксис запросов:
1. Словом без кавычек задаётся лемма.
2. Словом в двойных кавычках задаётся конкретная словоформа.
3. Тегом POS из набора spaCy задаётся любое слово соответствующей части речи.
4. Словом (в кавычках или без) и POS-тегом, записанными через +, задаётся поиск омонимов, принадлежащих нужной нам части речи.
5. Составляющие N-грамм перечисляются через пробел.

In [353]:
def looking(df):

    lemmas = df.lemma.tolist()
    words = df.word.tolist()
    poss = df['part of speech'].tolist()

    a = input()
    keywords = a.split(' ')
    inds = []
    pos_tags = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 
                'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'SCONJ']
           
    for i in range(len(words)):
        for k in range(len(keywords)):
            c = 0
            r = re.search("[^A-Za-z'-]", keywords[k])
            if r == None:
                keywords[k]
                if keywords[k] in pos_tags:
                    if keywords[k] == poss[i]:
                        c = 1
                else:
                    if keywords[k] == lemmas[i]:
                        c = 1
            elif keywords[k].startswith("\"") and keywords[k].endswith("\""):
                keywords[k] = re.sub('\"', '', keywords[k])
                if keywords[k] == words[i]:
                    c = 1
            elif "+" in keywords[k]:
                word, pos = keywords[k].split("+")
                if word.startswith("\"") and word.endswith("\""):
                    word = re.sub('\"', '', word)
                    if poss[i] == pos and words[i] == word:
                        c = 1
                else:
                    if poss[i] == pos and lemmas[i] == word:
                        c = 1
            if c == 1:
                if i != (len(lemmas) - 1):            
                    i += 1
            else:
                break
            if k == 0:
                ind = i - 1
        if c == 1:
            inds.append(ind)

    return inds

In [348]:
def results(inds):
    
    dfs = []
    for i in inds:
        dfs.append(df[df.index == i])
    if dfs == []:
        print('Ничего не найдено.')
        ndf = ''
    else:
        ndf = pd.concat(dfs)
    return ndf

Пример запроса: слово hello.

In [349]:
inds = looking(df)
ndf = results(inds)
ndf

hello


Unnamed: 0,word,lemma,part of speech,line,song,album,artist,genre
19021,Hello,hello,INTJ,"Hello, hello, hello, I see clearly",7 years Ft. BJ Burton,howimfeelingnow,charlixcx,experimental
19022,hello,hello,INTJ,"Hello, hello, hello, I see clearly",7 years Ft. BJ Burton,howimfeelingnow,charlixcx,experimental
19023,hello,hello,INTJ,"Hello, hello, hello, I see clearly",7 years Ft. BJ Burton,howimfeelingnow,charlixcx,experimental
19036,Hello,hello,INTJ,"Hello, hello, hello, got new meanin'",7 years Ft. BJ Burton,howimfeelingnow,charlixcx,experimental
19037,hello,hello,INTJ,"Hello, hello, hello, got new meanin'",7 years Ft. BJ Burton,howimfeelingnow,charlixcx,experimental
19038,hello,hello,INTJ,"Hello, hello, hello, got new meanin'",7 years Ft. BJ Burton,howimfeelingnow,charlixcx,experimental
19416,Hello,hello,INTJ,Hello,anthems,howimfeelingnow,charlixcx,experimental
97812,Hello,hello,INTJ,Hello,anthems,howimfeelingnow,charlixcx,hiphop
152200,hello,hello,INTJ,That are ready to get married at hello,Thank Me Now,thankmelater,drake,hiphop
177835,Hello,hello,INTJ,Hello (Hello),Hello,Relapse,eminem,hiphop


Более сложный запрос: местоимение I + глагол want в любой форме + любое местоимение.

In [358]:
pd.options.display.max_rows = 10000

inds = looking(df)
ndf = results(inds)
ndf

"I" want PRON


Unnamed: 0,word,lemma,part of speech,line,song,album,artist,genre
103,I,I,PRON,"On and on, I don't know if I want it",Creature Comfort,everythingnow,arcadefire,experimental
124,I,I,PRON,"On and on, I don't know if I want it",Creature Comfort,everythingnow,arcadefire,experimental
145,I,I,PRON,(On and on I don't know if I want it),Creature Comfort,everythingnow,arcadefire,experimental
166,I,I,PRON,(On and on I don't know if I want it),Creature Comfort,everythingnow,arcadefire,experimental
317,I,I,PRON,"On and on, I don't know if I want it",Creature Comfort,everythingnow,arcadefire,experimental
338,I,I,PRON,"On and on, I don't know if I want it",Creature Comfort,everythingnow,arcadefire,experimental
359,I,I,PRON,(On and on I don't know if I want it),Creature Comfort,everythingnow,arcadefire,experimental
612,I,I,PRON,(Everything now!) I want it,Everything Now,everythingnow,arcadefire,experimental
707,I,I,PRON,(Everything now!) I want it,Everything Now,everythingnow,arcadefire,experimental
815,I,I,PRON,(Everything now!) I want it,Everything Now,everythingnow,arcadefire,experimental
