## Goal: create base model from using artist_1 and artist_2 dataframes

In [19]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#### - import artist_1 and artist_2 csv

In [7]:
artist_1 = pd.read_csv('artist_1.csv', index_col=0)
artist_2 = pd.read_csv('artist_2.csv', index_col=0)

In [8]:
artist = pd.concat([artist_1, artist_2], axis=0)
artist

Unnamed: 0,song_title,l_link,lyrics,artist_name
0,Alligator,https://www.lyrics.com//lyric/36182049/Of+Mons...,Hey Hey I see color raining down Feral feelin...,Of Monsters and Men
2,Ahay,https://www.lyrics.com//lyric/36406257/Of+Mons...,I know that I'm wrong And now we're both sitti...,Of Monsters and Men
3,Róróró,https://www.lyrics.com//lyric/36406262/Of+Mons...,I am up with the sunrise I'm tired and I think...,Of Monsters and Men
4,Waiting for the Snow,https://www.lyrics.com//lyric/36406261/Of+Mons...,I'm waiting for the snow I'm waiting for visio...,Of Monsters and Men
5,"Vulture, Vulture",https://www.lyrics.com//lyric/36406260/Of+Mons...,"He lives in the darkness, he's calling my name...",Of Monsters and Men
...,...,...,...,...
157,where we are [Acoustic],https://www.lyrics.com//lyric-lf/7526166/The+L...,Where we are (where we are) I don't know where...,The Lumineers
163,Democracy,https://www.lyrics.com//lyric/36521451/The+Lum...,It's coming through a hole in the air From tho...,The Lumineers
167,Old Lady,https://www.lyrics.com//lyric/36521460/The+Lum...,An old lady crosses the street And as she wave...,The Lumineers
168,Soundtrack Song,https://www.lyrics.com//lyric/36521450/The+Lum...,"Loneliness, oh won't you let me be Let me be a...",The Lumineers


#### - feature vectorizing

In [10]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(artist['lyrics'])
X

<125x1900 sparse matrix of type '<class 'numpy.float64'>'
	with 5798 stored elements in Compressed Sparse Row format>

In [11]:
X.shape

(125, 1900)

In [12]:
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
X.toarray().shape

(125, 1900)

In [15]:
# create dataframe from the vector
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names(), index=artist['artist_name'])
df

Unnamed: 0_level_0,18,3am,44,able,abyss,account,act,actress,admit,advice,...,yelled,yellow,yes,yesterdays,yo,yon,york,young,younger,youth
artist_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Of Monsters and Men,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Of Monsters and Men,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Of Monsters and Men,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Of Monsters and Men,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Of Monsters and Men,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Lumineers,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Lumineers,0.0,0.0,0.0,0.0,0.0,0.046372,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Lumineers,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Lumineers,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# let's check whether sum of the vectors is 0 or not
np.square(df).sum(axis=1)

artist_name
Of Monsters and Men    1.0
Of Monsters and Men    1.0
Of Monsters and Men    1.0
Of Monsters and Men    1.0
Of Monsters and Men    1.0
                      ... 
The Lumineers          1.0
The Lumineers          1.0
The Lumineers          1.0
The Lumineers          1.0
The Lumineers          1.0
Length: 125, dtype: float64

In [18]:
df2 = df.reset_index()

#### - Logistic Regression

In [20]:
# define X, y
X = df2.drop('artist_name', axis=1)
y = artist['artist_name']


In [21]:
m = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X, y)

m.score(X, y)

0.992

In [38]:
# fun part: try to predict song title too !!!
y_2 = artist['song_title']

In [39]:
m2 = LogisticRegression(multi_class='multinomial', solver='saga', penalty='l2', C=1.0).fit(X, y_2)

m.score(X, y_2)

0.0

In [40]:
m.predict(vectorizer.transform(["though the truth may vary"]))

array(['Of Monsters and Men'], dtype=object)

In [41]:
f'song title: {m2.predict(vectorizer.transform(["dirty paws"]))}, name of artist: {m.predict(vectorizer.transform(["dirty paws"]))}'

"song title: ['Dirty Paws'], name of artist: ['Of Monsters and Men']"

## Find your song and artist

In [42]:
line = input("Enter a line: ")

print("\n")
print(f'song title: {m2.predict(vectorizer.transform([line]))}')
print(f'name of artist: {m.predict(vectorizer.transform([line]))}')



song title: ['Little Talks']
name of artist: ['Of Monsters and Men']
