### Machine learning model: Text classification
##### Business goal: Predict the artist of a song by lyrics
##### Data set: Web scraped lyrics

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn import set_config
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from collections import Counter

In [2]:
# Import and preparation of the data set

df_lyrics_chvrches=pd.read_csv('Chvrches_all.csv')
df_lyrics_chvrches=df_lyrics_chvrches['Chvrches_all']
df_lyrics_chvrches

0     Caught out cold, cold\nHiding from you in this...
1     Throw me\nNo more bones and I will tell you no...
2     Can I tell you something just between you and ...
3     I gave up on time\nJust like you said you woul...
4     Throw me\nNo more bones and I will tell you no...
                            ...                        
91    I will carry you and give you life\nI will cov...
92    Everyone sees, but only I know\nNothing to los...
93    I will never believe what they say\nThere is a...
94    Do you really believe that you can never be su...
95    We are\nI've come apart and you made me\nFloat...
Name: Chvrches_all, Length: 96, dtype: object

In [3]:
df_lyrics_lizzo=pd.read_csv('Lizzo_all.csv')
df_lyrics_lizzo=df_lyrics_lizzo['Lizzo_all']
df_lyrics_lizzo

0     Hey, listen! (hahaha)\nI blame it on you, I bl...
1     Why're men great 'til they gotta be great?\nHu...
2     I do my hair toss\nCheck my nails\nBaby how yo...
3     Black, white, ebony\nAll sound good to me\nTwo...
4     True story\nNo glory, let's go\n\nYeah, the ol...
                            ...                        
91    True story\nNo glory, let's go\n\nYeah, the ol...
92    Hey, listen! (hahaha)\nI blame it on you, I bl...
93    I do my hair toss\nCheck my nails\nBaby how yo...
94    I do my hair toss\nCheck my nails\nBaby how yo...
95    Uh\nGo\n\nWoke up feelin' like I just might ru...
Name: Lizzo_all, Length: 96, dtype: object

In [4]:
# Creat a new list including all lyrics from one artist

chvrches = []
for i in df_lyrics_chvrches[0:96]:
    chvrches.append(i)
chvrches

["Caught out cold, cold\nHiding from you in this skin\nSo, old\nI'll come, clean\nEveryone-everyone knows\nIt's, me\n\nAnd if I recover\nWill you be my comfort?\nOr it can be over\nOr we can just leave it here\nSo pick any number\nChoose any color\nI've got the answer\nOpen the envelope\n\nI'll give you one more chance\nTo say we can change or part ways\nAnd you take what you need\nAnd you don't need me\n\nI'll give you one more chance\nTo say we can change our old ways\nAnd you take what you need\nAnd you know you don't need me\n\nBlow by blow\nHonest in every way I know\nYou appear\nTo face a decision I know you fear\n\nAnd if I recover\nWill you be my comfort?\nOr it can be over\nOr we can just leave it here\nSo pick any number\nChoose any color\nI've got the answer\nOpen the envelope\n\nI'll give you one more chance\nTo say we can change or part ways\nAnd you take what you need\nAnd you don't need me\n\nI'll give you one more chance\nTo say we can change our old ways\nAnd you take 

In [5]:
lizzo = []
for i in df_lyrics_lizzo[0:96]:
    lizzo.append(i)
lizzo

["Hey, listen! (hahaha)\nI blame it on you, I blame it on you\nYee!\n\nI just wanna drink you up\nPour you in a silver cup\nYou know I like to party\nBut you know I love your body\nSo many things I shouldn't do\nI hate myself for hurting you\nHonestly, I'm reckless\nI'm sorry if I'm selfish\n\nEvery time you get too close I run, I run away\nAnd every time you say the words I don't know what to say\nBack, back to the beginning\nReally wish that I could change\nI do, I do, I do\n\nI blame it on your love\nEvery time I f*ck it up\nI blame it on your love, I do\nI blame it on your love\nI can't help it, I can't stop\nI blame it on your love, I do\nI blame it on your love\nI blame it on your love\n\nSorry I'm a little scared\nBut no one ever really cared (ah)\nI took you for granted (yeah, I did)\nJust a big misunderstanding\nI just want to spend the night (one night)\nSleeping in your bed tonight, yeah\nWatch a little TV\nI love it when you need me\n\nEvery time you get too close I run, I 

In [6]:
# Combine lyrics in on data frame

corpus = chvrches + lizzo
len(corpus)

192

In [7]:
# Convert collected text documents to a matrix of token counts

vectorizer = CountVectorizer(stop_words="english")
matrix = vectorizer.fit_transform(corpus)
print(matrix)

In [10]:
corpus

["Caught out cold, cold\nHiding from you in this skin\nSo, old\nI'll come, clean\nEveryone-everyone knows\nIt's, me\n\nAnd if I recover\nWill you be my comfort?\nOr it can be over\nOr we can just leave it here\nSo pick any number\nChoose any color\nI've got the answer\nOpen the envelope\n\nI'll give you one more chance\nTo say we can change or part ways\nAnd you take what you need\nAnd you don't need me\n\nI'll give you one more chance\nTo say we can change our old ways\nAnd you take what you need\nAnd you know you don't need me\n\nBlow by blow\nHonest in every way I know\nYou appear\nTo face a decision I know you fear\n\nAnd if I recover\nWill you be my comfort?\nOr it can be over\nOr we can just leave it here\nSo pick any number\nChoose any color\nI've got the answer\nOpen the envelope\n\nI'll give you one more chance\nTo say we can change or part ways\nAnd you take what you need\nAnd you don't need me\n\nI'll give you one more chance\nTo say we can change our old ways\nAnd you take 

In [11]:
matrix.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [12]:
vectorizer.get_feature_names_out()

array(['100', '20', '2two', ..., '帰りたくない', '悪かないね', '清志郎もデヴィッド'],
      dtype=object)

In [13]:
df = pd.DataFrame(matrix.todense(), columns=vectorizer.get_feature_names_out())
df.shape

Unnamed: 0,100,20,2two,750,99,aah,abide,ability,able,accent,...,とうにとうにいなくなって,やっぱりまだ帰りたくない,キャプチャーコースで違って,テレビがつきっぱなしの,ボウイも,ロンドンモード王道も,川崎もグラスゴーも,帰りたくない,悪かないね,清志郎もデヴィッド
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
189,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
tf = TfidfTransformer()
transformed = tf.fit_transform(matrix)
transformed


<192x2013 sparse matrix of type '<class 'numpy.float64'>'
	with 12241 stored elements in Compressed Sparse Row format>

In [17]:
tdf = pd.DataFrame(transformed.todense(), columns=vectorizer.get_feature_names_out())
tdf

Unnamed: 0,100,20,2two,750,99,aah,abide,ability,able,accent,...,とうにとうにいなくなって,やっぱりまだ帰りたくない,キャプチャーコースで違って,テレビがつきっぱなしの,ボウイも,ロンドンモード王道も,川崎もグラスゴーも,帰りたくない,悪かないね,清志郎もデヴィッド
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
189,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
190,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
X = tdf.values
y = ['chvrches'] * 96 + ['lizzo'] * 96
X
y


In [21]:
# Train test split
x_train, x_test, y_train, y_test = train_test_split(X,y)

In [22]:
m = LogisticRegression()
m.fit(x_train, y_train)
m.score(x_train, y_train)

In [24]:
# Predict probabilities

m.predict_proba(x_train) 

array([[0.84030514, 0.15969486],
       [0.20047622, 0.79952378],
       [0.34844342, 0.65155658],
       [0.29742947, 0.70257053],
       [0.72360399, 0.27639601],
       [0.72638259, 0.27361741],
       [0.8436588 , 0.1563412 ],
       [0.15504948, 0.84495052],
       [0.82389401, 0.17610599],
       [0.15646463, 0.84353537],
       [0.29742947, 0.70257053],
       [0.68873618, 0.31126382],
       [0.18321421, 0.81678579],
       [0.70822958, 0.29177042],
       [0.39511513, 0.60488487],
       [0.7160096 , 0.2839904 ],
       [0.73584855, 0.26415145],
       [0.84988773, 0.15011227],
       [0.43306616, 0.56693384],
       [0.23901126, 0.76098874],
       [0.81977093, 0.18022907],
       [0.17518614, 0.82481386],
       [0.15504948, 0.84495052],
       [0.15646463, 0.84353537],
       [0.77808382, 0.22191618],
       [0.38230668, 0.61769332],
       [0.28344865, 0.71655135],
       [0.79783172, 0.20216828],
       [0.65534171, 0.34465829],
       [0.62235825, 0.37764175],
       [0.

In [25]:
# Test model
song = ["I like spaghetti with salsa"]
counts = vectorizer.transform(song)
tfcounts = tf.transform(counts)
m.predict(tfcounts)

array(['lizzo'], dtype='<U8')

In [26]:
m.predict_proba(tfcounts)

array([[0.34766593, 0.65233407]])

In [27]:
pd.DataFrame(tfcounts.todense(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,100,20,2two,750,99,aah,abide,ability,able,accent,...,とうにとうにいなくなって,やっぱりまだ帰りたくない,キャプチャーコースで違って,テレビがつきっぱなしの,ボウイも,ロンドンモード王道も,川崎もグラスゴーも,帰りたくない,悪かないね,清志郎もデヴィッド
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
