In [65]:
import pandas as pd
import numpy as np
import pickle
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
import re
from nltk.stem import SnowballStemmer
from sklearn.metrics import ndcg_score, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split

**Обработка данных**

Исходный датасет - https://www.kaggle.com/neisse/scrapped-lyrics-from-6-genres

(Датасет с текстами песен 6 различных жанров)

Состоит из 2 файлов:

1) Название песни, её текст, язык

2) Исполнитель, количество песен, популярность, жанр...

In [2]:
# Song-level table: read the lyrics CSV and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='lyrics-data.csv')
data.head(n=5)

Unnamed: 0,ALink,SName,SLink,Lyric,Idiom
0,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH
1,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH
2,/10000-maniacs/,These Are Days,/10000-maniacs/these-are-days.html,These are. These are days you'll remember. Nev...,ENGLISH
3,/10000-maniacs/,A Campfire Song,/10000-maniacs/a-campfire-song.html,"A lie to say, ""O my mountain has coal veins an...",ENGLISH
4,/10000-maniacs/,Everyday Is Like Sunday,/10000-maniacs/everyday-is-like-sunday.html,Trudging slowly over wet sand. Back to the ben...,ENGLISH


In [3]:
# Artist-level table: read the artists CSV and preview its first five rows.
artist_data = pd.read_csv(filepath_or_buffer='artists-data.csv')
artist_data.head(n=5)

Unnamed: 0,Artist,Songs,Popularity,Link,Genre,Genres
0,10000 Maniacs,110,0.3,/10000-maniacs/,Rock,Rock; Pop; Electronica; Dance; J-Pop/J-Rock; G...
1,12 Stones,75,0.3,/12-stones/,Rock,Rock; Gospel/Religioso; Hard Rock; Grunge; Roc...
2,311,196,0.5,/311/,Rock,Rock; Surf Music; Reggae; Ska; Pop/Rock; Rock ...
3,4 Non Blondes,15,7.5,/4-non-blondes/,Rock,Rock; Pop/Rock; Rock Alternativo; Grunge; Blue...
4,A Cruz Está Vazia,13,0.0,/a-cruz-esta-vazia/,Rock,Rock


Из всех данных нас интересуют только название песни, её текст и исполнитель.

In [4]:
# Inner-join every song to its artist via the artist link, keep only the
# song title, lyric and artist name, and drop repeated rows (first one wins).
data = (
    data
    .merge(artist_data, how='inner', left_on='ALink', right_on='Link')
    .loc[:, ['SName', 'Lyric', 'Artist']]
    .drop_duplicates(keep='first')
)
data.head()

Unnamed: 0,SName,Lyric,Artist
0,More Than This,I could feel at the time. There was no way of ...,10000 Maniacs
2,Because The Night,"Take me now, baby, here as I am. Hold me close...",10000 Maniacs
4,These Are Days,These are. These are days you'll remember. Nev...,10000 Maniacs
6,A Campfire Song,"A lie to say, ""O my mountain has coal veins an...",10000 Maniacs
8,Everyday Is Like Sunday,Trudging slowly over wet sand. Back to the ben...,10000 Maniacs


Предобработаем данные сразу, чтобы загружать их из файла при запуске сервера

In [5]:
class Document:
    """A searchable item: a display title plus its full text."""

    def __init__(self, title, text):
        """Store ``title`` and ``text`` unchanged on the instance."""
        self.title, self.text = title, text

    def format(self, query):
        """Return ``[title, snippet]`` for display.

        The snippet is the first 200 characters of the text followed by
        ' ...'. ``query`` is accepted but not used by this implementation.
        """
        preview = self.text[:200]
        return [self.title, preview + ' ...']

In [37]:
# Data loading in the build_index function will have a similar implementation.
# One Document per row: title is "<song name> - <artist>", text is the lyric.
# The three columns are zipped directly instead of using DataFrame.iterrows(),
# which constructs a Series for every row and is much slower on large frames.
documents = [
    Document(str(song) + ' - ' + str(artist), str(lyric))
    for song, lyric, artist in zip(data['SName'], data['Lyric'], data['Artist'])
]

**Индекс**

Реализуем инвертированный индекс с удалением "стоп-слов"

In [47]:
# English Snowball stemmer, applied to both document words and query words.
stemmer = SnowballStemmer('english')

# Make sure the stop-word corpus is available locally, then load the
# English stop-word list.
nltk.download('stopwords')
sw_eng = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
# The index in the build_index function will have a similar implementation.
#
# Builds two structures over `documents`:
#   index: stem -> list of document positions whose title/text contains it
#   text:  per-document string of the stems of its distinct non-stop words
index = {}
text = []
stop_words = set(sw_eng)  # set membership instead of scanning the list per token
for idx, doc in enumerate(documents):
    # Distinct lowercase alphanumeric tokens of the title and the text.
    tokens = set(re.split(r'[^a-z0-9]', (doc.title + ' ' + doc.text).lower()))
    # `word` is falsy for the empty strings re.split yields between adjacent
    # separators; skipping them keeps a meaningless '' key out of the index.
    stems = [stemmer.stem(word) for word in tokens if word and word not in stop_words]
    text.append(' '.join(stems))
    # Different words can share a stem; record each document once per stem.
    for stem in set(stems):
        index.setdefault(stem, []).append(idx)
# Same 3-tuple layout as the dedicated save cell, so 'index.pickle' has one
# format no matter which of the two cells was executed last.
with open('index.pickle', "wb") as f:
    pickle.dump((documents, index, text), f)

Сохраним данные и индекс

In [11]:
# Persist the documents, the inverted index and the per-document stemmed
# text together in a single pickle file.
with open('index.pickle', "wb") as index_file:
    pickle.dump((documents, index, text), index_file)

Реализация функции retrieve с использованием инвертированного индекса

In [49]:
def retrieve(query):
    """Return up to 50 candidate documents for ``query`` and their positions.

    The query is lowercased, split on non-alphanumeric characters, stripped
    of stop words and stemmed. Stems that are not present in ``index`` are
    ignored; a document is a candidate only if it appears in the posting
    list of every remaining stem.

    Returns:
        (candidates, positions): two parallel lists of at most 50 elements,
        where ``positions[k]`` is the position of ``candidates[k]`` in
        ``documents``. When no query stem is known to the index, the first
        documents (up to 50) and their positions are returned instead.
    """
    max_results = 50
    stems = [stemmer.stem(word)
             for word in re.split(r'[^a-z0-9]', query.lower())
             if word and word not in sw_eng]
    # dict.fromkeys de-duplicates while keeping query order, and the
    # membership test avoids building a set of every index key per call.
    keywords = [stem for stem in dict.fromkeys(stems) if stem in index]
    if not keywords:
        fallback = documents[:max_results]
        # A flat list of ints (not a list wrapping a range object), bounded
        # by the number of documents actually available.
        return fallback, list(range(len(fallback)))

    matches = set(index[keywords[0]])
    for word in keywords[1:]:
        matches = matches.intersection(index[word])

    # Slice the positions first so only the returned documents are looked up.
    positions = list(matches)[:max_results]
    return [documents[i] for i in positions], positions

**Ранжирование**

In [50]:
# Fit a TF-IDF vocabulary (with English stop words removed) on the
# per-document stemmed text, then display the fitted vectorizer.
tfv = TfidfVectorizer(stop_words='english').fit(text)
tfv

TfidfVectorizer(stop_words='english')

In [76]:
def transform_data(query):
    """Build one dense feature row per document retrieved for ``query``.

    Each row is the TF-IDF vector of the document's stemmed text plus twice
    the TF-IDF vector of only those stems the document shares with the
    query, so query-matching terms carry three times their usual weight.

    Returns:
        A 2-D numpy array with one row per position returned by
        ``retrieve(query)`` and one column per feature of ``tfv``.
    """
    _, indexes = retrieve(query)
    keywords = {stemmer.stem(word)
                for word in re.split(r'[^a-z0-9]', query.lower())
                if word not in sw_eng}
    texts = [text[i] for i in indexes]
    # For every retrieved document, the stems it has in common with the query.
    intersections = [' '.join(keywords.intersection(doc_text.split()))
                     for doc_text in texts]
    # Combine while still sparse and densify once, rather than materialising
    # two full dense matrices before adding them.
    return (2 * tfv.transform(intersections) + tfv.transform(texts)).toarray()

In [80]:
# Stack the feature rows of every training query into one matrix, in the
# order listed below.
# NOTE(review): 'For the Greater Good of God' appears twice; the ratings
# string is presumably aligned row-by-row with this exact sequence, so
# confirm the labels before removing the duplicate.
x = np.vstack([
    transform_data(query)
    for query in (
        'Iron Maiden',
        'For the Greater Good of God',
        'For the Greater Good of God',
        'Cats and Dogs',
        'seaside song',
        'Stairway To Heaven',
        'Lost in America',
        'The Best of Both Worlds',
        'red hot fortune',
        'The Unforgiven',
        'Nothing Else Matters',
        'Fade To Black',
        'dump spend give try',
    )
])
# One digit per stacked row.
ratings = '1111111444444444445444444444444444444444444444444422121222512222222222222222222222222222222222222222222222222223224241122211121111441142211151421331222211511112211121222223333221212233222222211211111122222121222222222222222222222221222222222222252222222112224222124132222222222225222522521222211211111111111112212111111111211111111111122111121111111221211111221111111121111211111111112111111111232412'

y = list(map(int, ratings))
x.shape, len(y)

((400, 221169), 400)

In [84]:
# Hold out 5% of the rated rows for evaluation. The shuffle is seeded so the
# split, and therefore the metric computed on it, is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, shuffle=True, test_size=0.05, random_state=42
)

In [85]:
# Fit an ordinary least-squares regressor on the training rows and display it.
lr = LinearRegression().fit(x_train, y_train)
lr

LinearRegression()

In [98]:
# Predict scores for the held-out rows and compute NDCG, treating all of
# them as a single ranked list (ndcg_score expects 2-D inputs, hence the
# reshape to one row).
y_pred = np.asarray(lr.predict(x_test))
ndcg_score(np.reshape(y_test, (1, -1)), y_pred.reshape(1, -1))

0.7245162610482105

In [102]:
# Persist the fitted regressor together with the fitted TF-IDF vectorizer
# in a single pickle file.
with open('model.pickle', "wb") as model_file:
    pickle.dump((lr, tfv), model_file)