* Feature **Extraction**: creating new features from existing ones:
  * `track_album_release_date` can be split into Year, Month, Day, and in our case **Decade** might also be valuable.
  * Years since release
  * `language` - get the language based on the track name / lyrics - can be narrowed to English / Not-English
* Feature Engineering: Converting data into a format suitable for modeling:
  * Find the five to ten most popular words in `track_name` and create a *dummy* column for each one, indicating whether the song name includes that word
  * Sentiment Analysis of the song name  
  * Sentiment Analysis of song lyrics
  * Aggregating `track_artist` by Country/Continent (depends on category size)
  * Aggregating `track_artist` by [Male, Female, Band]
  * Artist Followers

In [None]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect's detection is non-deterministic by default; fixing the factory
# seed before the first detect() call makes repeated runs give the same codes.
DetectorFactory.seed = 0

# Tiny hand-picked set of track titles used to try language detection on
df = pd.DataFrame({'track_name': ['Despacito', 'Shape of You', 'Gangnam Style', 'Bonjour', 'Liebe ist für alle da']})

# Function to detect language
def detect_language(text):
    """Return the language code langdetect assigns to *text*.

    Falls back to the string 'unknown' when langdetect raises
    ``LangDetectException`` for the input.
    """
    try:
        code = detect(text)
    except LangDetectException:
        code = 'unknown'
    return code

# Tag every title with its detected language code. Short titles are
# unreliable: 'Gangnam Style' comes back as 'tl' in the recorded output.
df['language'] = df['track_name'].map(detect_language)

print(df)


              track_name language
0              Despacito       es
1           Shape of You       en
2          Gangnam Style       tl
3                Bonjour       fr
4  Liebe ist für alle da       de


In [None]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Fix langdetect's seed so the classification below is repeatable across runs
DetectorFactory.seed = 0

# Larger sample of titles: English ones plus titles in Latin, Cyrillic, Greek,
# CJK and Hangul scripts, to see how detection behaves on short strings
df = pd.DataFrame({
    'track_name': [
        'Despacito', 'Shape of You', 'Gangnam Style', 'Bonjour', 
        'Liebe ist für alle da', 'Bohemian Rhapsody', 'Blinding Lights', 
        'Hips Don’t Lie', 'Senorita', 'Tusa', 'Dynamite', 'Savage Love', 
        'Creep', 'Yesterday', 'Un Homme et une Femme', 'Ai Se Eu Te Pego', 
        'Bésame Mucho', 'Dragostea Din Tei', 'Владимирский Централ', 
        'Καλημέρα', '東京', '你好', 'ありがとう', '밤하늘의 별을'
    ]
})

# Function to classify as English / Not English
def classify_english(text):
    """Label *text* as "English" or "Not English" using langdetect.

    Text is "English" only when ``detect`` returns "en". Anything else,
    including input for which langdetect raises ``LangDetectException``,
    is "Not English".
    """
    try:
        is_english = detect(text) == "en"
    except LangDetectException:
        is_english = False

    if is_english:
        return "English"
    return "Not English"

# Label each title English / Not English and show the table
df['classification'] = df['track_name'].map(classify_english)

print(df)


               track_name classification
0               Despacito    Not English
1            Shape of You        English
2           Gangnam Style    Not English
3                 Bonjour    Not English
4   Liebe ist für alle da    Not English
5       Bohemian Rhapsody    Not English
6         Blinding Lights    Not English
7          Hips Don’t Lie        English
8                Senorita    Not English
9                    Tusa    Not English
10               Dynamite    Not English
11            Savage Love    Not English
12                  Creep    Not English
13              Yesterday    Not English
14  Un Homme et une Femme    Not English
15       Ai Se Eu Te Pego    Not English
16           Bésame Mucho    Not English
17      Dragostea Din Tei    Not English
18   Владимирский Централ    Not English
19               Καλημέρα    Not English
20                     東京    Not English
21                     你好    Not English
22                  ありがとう    Not English
23              

In [None]:
import pandas as pd
from langdetect import detect, detect_langs, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Fix langdetect's seed so the probabilities from detect_langs are repeatable
DetectorFactory.seed = 0

# Same multi-script sample of titles as in the previous experiment, so the
# two classifiers can be compared row by row
df = pd.DataFrame({
    'track_name': [
        'Despacito', 'Shape of You', 'Gangnam Style', 'Bonjour', 
        'Liebe ist für alle da', 'Bohemian Rhapsody', 'Blinding Lights', 
        'Hips Don’t Lie', 'Senorita', 'Tusa', 'Dynamite', 'Savage Love', 
        'Creep', 'Yesterday', 'Un Homme et une Femme', 'Ai Se Eu Te Pego', 
        'Bésame Mucho', 'Dragostea Din Tei', 'Владимирский Централ', 
        'Καλημέρα', '東京', '你好', 'ありがとう', '밤하늘의 별을'
    ]
})

# Function to classify English / Not English with better accuracy
def classify_english(text, threshold=0.4):
    """Label *text* as "English" or "Not English" from langdetect probabilities.

    Args:
        text: The string to classify. Non-string or blank values (for
            example None/NaN from missing data) are labelled "Not English".
        threshold: Probability that langdetect must assign to "en" (strictly
            greater than) for the text to count as English.

    Returns:
        "English" if any candidate returned by ``detect_langs`` is "en" with
        a probability above ``threshold``, otherwise "Not English". Input on
        which langdetect raises ``LangDetectException`` is "Not English".
    """
    # langdetect only accepts strings; missing values would raise TypeError.
    if not isinstance(text, str) or not text.strip():
        return "Not English"

    try:
        candidates = detect_langs(text)
    except LangDetectException:
        return "Not English"

    is_english = any(
        candidate.lang == "en" and candidate.prob > threshold
        for candidate in candidates
    )
    return "English" if is_english else "Not English"

# Run the probability-based classifier over every title and print the table
df['classification'] = df['track_name'].map(classify_english)

print(df)


               track_name classification
0               Despacito    Not English
1            Shape of You        English
2           Gangnam Style    Not English
3                 Bonjour    Not English
4   Liebe ist für alle da    Not English
5       Bohemian Rhapsody    Not English
6         Blinding Lights        English
7          Hips Don’t Lie        English
8                Senorita    Not English
9                    Tusa    Not English
10               Dynamite    Not English
11            Savage Love    Not English
12                  Creep        English
13              Yesterday    Not English
14  Un Homme et une Femme    Not English
15       Ai Se Eu Te Pego    Not English
16           Bésame Mucho    Not English
17      Dragostea Din Tei    Not English
18   Владимирский Централ    Not English
19               Καλημέρα    Not English
20                     東京    Not English
21                     你好    Not English
22                  ありがとう    Not English
23              

In [None]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from collections import Counter

# Fix langdetect's seed so repeated detect() calls are repeatable
DetectorFactory.seed = 0

# Same multi-script sample of titles again, this time for the
# repeated-detection (majority vote) experiment
df = pd.DataFrame({
    'track_name': [
        'Despacito', 'Shape of You', 'Gangnam Style', 'Bonjour', 
        'Liebe ist für alle da', 'Bohemian Rhapsody', 'Blinding Lights', 
        'Hips Don’t Lie', 'Senorita', 'Tusa', 'Dynamite', 'Savage Love', 
        'Creep', 'Yesterday', 'Un Homme et une Femme', 'Ai Se Eu Te Pego', 
        'Bésame Mucho', 'Dragostea Din Tei', 'Владимирский Централ', 
        'Καλημέρα', '東京', '你好', 'ありがとう', '밤하늘의 별을'
    ]
})

# Function to improve accuracy
def detect_language(text, n_attempts=3):
    """Return the language code langdetect reports most often for *text*.

    ``detect`` is called ``n_attempts`` times and the most frequent code wins
    (on a tie, the code seen first). When langdetect raises
    ``LangDetectException`` the string "unknown" is returned instead.
    """
    try:
        tally = Counter(detect(text) for _ in range(n_attempts))
        winner, _count = tally.most_common(1)[0]
    except LangDetectException:
        return "unknown"
    return winner

# Function to classify English vs Not English
def classify_english(text):
    """Return "English" when detect_language reports "en", else "Not English"."""
    if detect_language(text) == "en":
        return "English"
    return "Not English"

# Classify every title with the majority-vote detector and print the table
df['classification'] = df['track_name'].map(classify_english)

print(df)


               track_name classification
0               Despacito    Not English
1            Shape of You        English
2           Gangnam Style    Not English
3                 Bonjour    Not English
4   Liebe ist für alle da    Not English
5       Bohemian Rhapsody    Not English
6         Blinding Lights    Not English
7          Hips Don’t Lie        English
8                Senorita    Not English
9                    Tusa    Not English
10               Dynamite    Not English
11            Savage Love    Not English
12                  Creep    Not English
13              Yesterday    Not English
14  Un Homme et une Femme    Not English
15       Ai Se Eu Te Pego    Not English
16           Bésame Mucho    Not English
17      Dragostea Din Tei    Not English
18   Владимирский Централ    Not English
19               Καλημέρα    Not English
20                     東京    Not English
21                     你好    Not English
22                  ありがとう    Not English
23              

In [None]:
%pip install lyricsgenius

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os

from lyricsgenius import Genius

# The Genius API needs an access token. Read it from the environment so the
# secret never lives in the notebook (the original referenced an undefined
# `token` and failed with NameError).
token = os.environ.get("GENIUS_ACCESS_TOKEN")
if not token:
    raise RuntimeError(
        "Set the GENIUS_ACCESS_TOKEN environment variable to a Genius API token."
    )

genius = Genius(token)

# Keep the search result: the original discarded it and then called
# save_lyrics() on an `artist` name that was never assigned.
artist = genius.search_artist('Andy Shauf')
if artist is not None:
    artist.save_lyrics()
else:
    print("Artist not found on Genius: Andy Shauf")

NameError: name 'token' is not defined

In [None]:
import requests

# One-off probe of the lyrics.ovh API; the path is /v1/<artist>/<title>.
url = "https://api.lyrics.ovh/v1/Coldplay/Adventure of a Lifetime"
# requests has no default timeout, so without one an unresponsive server
# would block the notebook indefinitely.
response = requests.get(url, timeout=10)

if response.status_code == 200:
    # Successful responses are JSON; the recorded output shows a 'lyrics' key.
    data = response.json()
    print(data)
else:
    print(f"Failed to retrieve data: {response.status_code}")

{'lyrics': "Turn your magic on, Umi she'd say\r\nEverything you want's a dream away\r\nWe are legends, every day\r\nThat's what she told me\r\nTurn your magic on, to me she'd say\n\nEverything you want's a dream away\n\nUnder this pressure, under this weight\n\nWe are diamonds \n\n\n\nI feel my heart beating\n\nI feel my heart underneath my skin\n\nI feel my heart beating\n\nOh, you make me feel\n\nLike I'm alive again\n\n\n\nAlive again!\n\n\n\nOh, you make me feel\n\nLike I'm alive again\n\n\n\nSaid I can't go on, not in this way\n\nI'm a dream that died by light of day\n\nGonna hold up half the sky and say\n\nOnly I own me\n\n\n\nI feel my heart beating\n\nI feel my heart underneath my skin\n\nOh, I can feel my heart beating\n\n'Cause you make me feel\n\nLike I'm alive again\n\n\n\nAlive again!\n\n\n\nOh, you make me feel\n\nLike I'm alive again\n\n\n\nTurn your magic on, Umi she'd say\n\nEverything you want's a dream away\n\nUnder this pressure, under this weight\n\nWe are diamonds

In [None]:
# Load the prepared text columns, discard the playlist name, and attach each
# track's artist from `df` (left join on track_id keeps every df_text row).
df_text = (
    pd.read_pickle("./pickle/01_data_preparation/df_text.pkl")
    .drop(columns=['playlist_name'])
    .merge(df[['track_id', 'track_artist']], on='track_id', how='left')
)

In [None]:
# Drop the playlist name if it is still present. errors='ignore' keeps this
# cell re-runnable: without it, a second drop of the same column raises KeyError.
df_text = df_text.drop(columns=['playlist_name'], errors='ignore')

In [None]:
# Draw 1000 random tracks to trial the lyrics lookup on. A fixed random_state
# makes the sample (and everything computed from it) reproducible on re-run.
df_sample = df_text.sample(n=1000, random_state=42)


In [None]:
# Look up lyrics for every sampled row; get_lyrics issues one HTTP request
# per row and returns None when the API does not answer with status 200.
df_sample['lyrics'] = df_sample.apply(get_lyrics, axis=1)

In [None]:
# Summarise the sample; the non-null count of `lyrics` shows how many of the
# lookups actually returned text.
df_sample.describe()

Unnamed: 0,track_id,track_name,track_album_name,track_artist,lyrics
count,1000,1000,1000,1000,435
unique,1000,989,966,891,431
top,7cG5fCOSThJTaKxDHoRT0s,Easy,Greatest Hits,J Balvin,[Instrumental]
freq,1,3,8,5,3


In [None]:
# Look up lyrics for every row of the full frame.
# NOTE(review): get_lyrics performs one blocking HTTP request per row, so this
# runs a sequential request for each track in df_text — expect it to be slow.
df_text['lyrics'] = df_text.apply(get_lyrics, axis=1)

In [None]:
# Save the sample, including any columns added so far, to disk.
# NOTE(review): presumably done so the lyrics lookups need not be repeated;
# to_pickle does not create missing directories — confirm the folder exists.
df_sample.to_pickle("./pickle/04_feature_engineering/df_sample.pkl")

In [None]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from collections import Counter

# Function to improve accuracy
def detect_language(text, n_attempts=3):
    """Return the language code langdetect reports most often for *text*.

    Args:
        text: The string to analyse. Non-string or blank values (for example
            None/NaN where no lyrics were found) yield "unknown".
        n_attempts: How many times ``detect`` is run before voting.

    Returns:
        The most frequent language code across the attempts, or "unknown"
        when the input is not usable text or langdetect raises
        ``LangDetectException``.
    """
    # langdetect raises TypeError on non-strings, which is exactly what
    # happens for rows whose lyrics lookup returned None.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    try:
        # Run detection several times and keep the most common result.
        languages = [detect(text) for _ in range(n_attempts)]
    except LangDetectException:
        return "unknown"

    return Counter(languages).most_common(1)[0][0]

# Function to classify English vs Not English
def classify_english(text):
    """Map *text* to "English" or "Not English" via detect_language.

    Only the code "en" counts as English; every other result of
    detect_language, including "unknown", is "Not English".
    """
    is_english = detect_language(text) == "en"
    if is_english:
        label = "English"
    else:
        label = "Not English"
    return label

In [None]:
# Display the sampled frame to eyeball the fetched lyrics; rows whose lookup
# returned nothing show an empty `lyrics` cell in the recorded output.
df_sample

Unnamed: 0,track_id,track_name,track_album_name,track_artist,lyrics
12130,7cG5fCOSThJTaKxDHoRT0s,Going Back To Memphis - Remastered,Capitol Rarities 1968-1977 (Remastered),The Band,
2058,70vvnTUamBXOc0vRk7BBDu,Poker Face,The Fame Monster (International Deluxe),Lady Gaga,I wanna hold em' like they do in Texas Plays. ...
4962,5mIM8wMyZ8NqhNP3brwnEV,Get Away,Escape,Paperwhite,Make me an offer One heart for true love Cards...
7996,5801f9g6Kb8D4qNjXWBusY,Legend of the South,Creeker 2,Upchurch,There's a warm wind ablowin' And a burnin' sun...
25438,3AuvTRlw1TQulraCRTZnKc,Together,Together,Vinsand,
...,...,...,...,...,...
1971,1MijKOgtIl3uK70qYrZnoy,Sad Songs In The Summer,It Was A Sad Fucking Summer,Olivia O'Brien,Sad songs in the summer Heartbreak till the da...
5272,3QEWSE2yXJN85Q66gxY64O,Takeoff,Takeoff,High John,
1922,53HUxlAstNtpgKVU6VV2JL,American Spirit,happysad,Meg & Dia,
6588,5UQOmTylBEdsIfSn37v3DO,Straight Up Menace,The Best of MC Eiht,MC Eiht,


In [None]:
# Classify only the rows that actually have lyrics. get_lyrics leaves None
# where the lookup failed and langdetect raises TypeError on None, so those
# rows are skipped and keep a missing classification.
has_lyrics = df_sample['lyrics'].notna()
df_sample.loc[has_lyrics, 'classification'] = (
    df_sample.loc[has_lyrics, 'lyrics'].apply(classify_english)
)

TypeError: expected string or bytes-like object, got 'NoneType'

In [None]:
# Keep a single row per track: the first occurrence of each track_id wins
df_text = df_text.drop_duplicates(subset=['track_id'], keep='first')

In [None]:
# Attach each track's artist from `df`; the left join keeps every df_text row.
# NOTE(review): if df_text already has a track_artist column, this merge
# yields track_artist_x / track_artist_y instead; if `df` repeats a track_id,
# the matching df_text rows are duplicated — confirm both before re-running.
df_text = df_text.merge(df[['track_id', 'track_artist']], on='track_id', how='left')

In [None]:
# Summarise df_text (count / unique / top / freq per column) to check the
# result of the de-duplication and the artist merge.
df_text.describe()

Unnamed: 0,track_id,track_name,track_album_name,track_artist
count,28356,28352,28352,28352
unique,28356,23449,19743,10692
top,6f807x0ima9a1j3VPbc7VN,Breathe,Greatest Hits,Queen
freq,1,18,135,130


In [None]:
def get_lyrics(row, timeout=10):
    """Fetch the lyrics for one track from the lyrics.ovh API.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys ``track_artist`` and ``track_name``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The lyrics as a single line of text (line breaks and runs of
        whitespace collapsed to one space), or ``None`` when the artist or
        title is missing, the request fails, or the API does not answer with
        status 200 and a ``lyrics`` string.
    """
    from urllib.parse import quote

    artist = row['track_artist']
    title = row['track_name']

    # Missing values (None/NaN) or blank strings cannot be looked up; without
    # this check they would be sent to the API as the literal "nan"/"None".
    if not (isinstance(artist, str) and isinstance(title, str)):
        return None
    if not artist.strip() or not title.strip():
        return None

    # Percent-encode each path segment so characters such as "/", "?" or "#"
    # in a name (e.g. "AC/DC") cannot change the structure of the URL.
    url = (
        "https://api.lyrics.ovh/v1/"
        f"{quote(artist, safe='')}/{quote(title, safe='')}"
    )

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError):
        # A network error or malformed body for one track must not abort a
        # whole DataFrame.apply run; treat it as "no lyrics found".
        return None

    lyrics = data.get('lyrics') if isinstance(data, dict) else None
    if not isinstance(lyrics, str):
        return None

    # The API mixes "\r\n" and bare "\n" line breaks; collapse every run of
    # whitespace so the result really is a single line.
    return ' '.join(lyrics.split())
    
