pip install snappy

In [1]:
import gzip 
import zlib
import lzma
import bz2
import snappy
import base64
import polars as pl
import pandas as pd 
import math
# Function to clean the lyrics
import re
def clean_lyrics(text: str) -> str:
    return re.sub(r'\[.*?\]', '', text) if isinstance(text, str) else text
    
# Function to compress the lyrics using gzip
def compress_text(text: str) -> str:
    compressed = gzip.compress(text.encode('utf-8'))
    # Convert compressed bytes to base64 for storage
    return base64.b64encode(compressed).decode('utf-8')


# Function to compress text using zlib
def compress_text_zlib(text: str) -> str:
    compressed = zlib.compress(text.encode('utf-8'))
    # Convert compressed bytes to base64 for storage
    return base64.b64encode(compressed).decode('utf-8')


# Function to compress text using bz2
def compress_text_bz2(text: str) -> str:
    compressed = bz2.compress(text.encode('utf-8'))
    # Convert compressed bytes to base64 for storage
    return base64.b64encode(compressed).decode('utf-8')


# Function to compress text using lzma
def compress_text_lzma(text: str) -> str:
    compressed = lzma.compress(text.encode('utf-8'))
    # Convert compressed bytes to base64 for storage
    return base64.b64encode(compressed).decode('utf-8')


# Function to compress text using snappy
def compress_text_snappy(text: str) -> str:
    compressed = snappy.compress(text.encode('utf-8'))
    # Convert compressed bytes to base64 for storage
    return base64.b64encode(compressed).decode('utf-8')



In [2]:
# File path
file_path = '/Users/inesdimassi/Documents/Data/Lyrics/ds2.csv'
# Read the file using Polars
df_polars = pl.read_csv(file_path)

###############################################@

# File path
file_path_metal = '/Users/inesdimassi/Documents/Data/Lyrics/metal_lyrics.csv'
# Read the file using Polars
metal_polars = pl.read_csv(file_path_metal)

In [4]:
#metal_polars=metal_polars.filter(pl.col("Year") > 0 )
metal_polars = metal_polars.with_columns(
    tag = pl.lit('metal')
)

In [5]:
print("DF Info")
print(df_polars.shape)
print(df_polars.columns)
######################################

print("Metal songs Info")
print(metal_polars.shape)
print(metal_polars.columns)

DF Info
(5913411, 8)
['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id']
Metal songs Info
(228288, 7)
['Artist', 'Album', 'Song', 'Lyric', 'SongNum', 'Year', 'tag']


In [6]:
# Select the necessary columns 
df_polars = df_polars[["title", "artist", "tag",  "year", "lyrics"]]
metal_polars = metal_polars[["Song", "Artist", "tag",  "Year", "Lyric"]]
# Change columns names of the metal Data
metal_polars.columns= ["title", "artist", "tag",  "year", "lyrics"]

In [7]:
df_polars.head(6)

title,artist,tag,year,lyrics
str,str,str,i64,str
"""Killa Cam""","""Cam'ron""","""rap""",2004,"""[Chorus: Opera Steve & Cam'ron…"
"""Can I Live""","""JAY-Z""","""rap""",1996,"""[Produced by Irv Gotti] [Intr…"
"""Forgive Me Father""","""Fabolous""","""rap""",2003,"""Maybe cause I'm eatin And thes…"
"""Down and Out""","""Cam'ron""","""rap""",2004,"""[Produced by Kanye West and Br…"
"""Fly In""","""Lil Wayne""","""rap""",2005,"""[Intro] So they ask me ""Young …"
"""Lollipop Remix""","""Lil Wayne""","""rap""",2008,"""[Intro: Lil Wayne] Haha Uh-huh…"


In [8]:
metal_polars.head(6)

title,artist,tag,year,lyrics
str,str,str,i64,str
"""_Gecenin_G__lgesi""","""...AAAARRGHH""","""metal""",0,"""Kara bulutlar sardГ„В± yine dГ…"
"""_Son___afak""","""...AAAARRGHH""","""metal""",0,"""Dolunay parlak gГѓВ¶rГѓВјnmГѓВ…"
"""_F__rt__na_Yakla__yyor...""","""...AAAARRGHH""","""metal""",0,"""Ay Г„В±Г…ВџГ„В±Г„ВџГ„В±nГ„В±n …"
"""_Ebedi_Buzulun_Ortas__nda""","""...AAAARRGHH""","""metal""",0,"""Ay Г„В±Г…ВџГ„В±Г„ВџГ„В±nГ„В±n …"
"""_Lanetli_Diyarlar""","""...AAAARRGHH""","""metal""",0,"""YГѓВјrГѓВјyorum yalnГ„В±z baГ……"
"""...Ja Kylma Vesi Nuolee Oksaa""","""...AND OCEANS""","""metal""",1998,"""Open the white doors To the fi…"


In [9]:
# Concat both Data sets
df=pl.concat([df_polars, metal_polars])

In [10]:
print("Merged Data Info")
print(df.shape)
print(df.columns)

Merged Data Info
(6141699, 5)
['title', 'artist', 'tag', 'year', 'lyrics']


In [11]:
# Clean lyrics column
df = df.with_columns( 
    pl.col("lyrics").map_elements(lambda x:  clean_lyrics(x), return_dtype=pl.Utf8 ).alias("lyrics_cleaned")
)

In [12]:
df=df.with_columns( 
    pl.col("lyrics_cleaned").map_elements(lambda x:  len(x), return_dtype=pl.Int64 ).alias("len_lyrics")
)

In [13]:
# Calculate size lyrics
df = df.with_columns( 
    pl.col("lyrics_cleaned").map_elements(lambda x: len(x.encode('utf-8')) , return_dtype=pl.Int64 ).alias("song_size_bytes")
)

In [25]:
#Compress lyrics with multiple algorithms so we can compare and picke the right one

#### Compress lyrics with gzip function
df = df.with_columns( 
    pl.col("lyrics_cleaned").map_elements(lambda x: compress_text(x) , return_dtype=pl.Utf8 ).alias("lyrics_compressed_gzip")
)

#### Compress lyrics with zlib function
df = df.with_columns( 
    pl.col("lyrics_cleaned").map_elements(lambda x:  compress_text_zlib(x) , return_dtype=pl.Utf8 ).alias("lyrics_compressed_zlib")
)

#### Compress lyrics with bz2 function
df = df.with_columns( 
    pl.col("lyrics_cleaned").map_elements(lambda x:  compress_text_bz2(x) , return_dtype=pl.Utf8 ).alias("lyrics_compressed_bz2")
)

#### Compress lyrics with lzma function
df = df.with_columns( 
    pl.col("lyrics_cleaned").map_elements(lambda x:  compress_text_lzma(x) , return_dtype=pl.Utf8 ).alias("lyrics_compressed_lzma")
)

#### Compress lyrics with snappy function
df = df.with_columns( 
    pl.col("lyrics_cleaned").map_elements(lambda x:  compress_text_snappy(x) , return_dtype=pl.Utf8 ).alias("lyrics_compressed_snappy")
)

In [31]:
# Calculate size compressed lyrics
df = df.with_columns( 
    pl.col("lyrics_compressed_gzip").map_elements(lambda x: len(x.encode('utf-8')), return_dtype=pl.Int64 ).alias("song_compressed_size_gzip")
)

In [33]:
df=df.filter( (pl.col("year") <= 2024) & (pl.col("year") > 89 ) )

In [35]:
df = df.with_columns(
    ratio_compress = (pl.col('song_compressed_size_gzip')*100) / pl.col('song_size_bytes')
)

df = df.with_columns( 
    pl.col("ratio_compress").map_elements(lambda x: round(x,0), return_dtype=pl.Float64 ).alias("ratio_compress")
)

In [37]:
df2=df.filter( (pl.col("year") <= 2024) & (pl.col("year") > 89 ) & (pl.col("ratio_compress") <= 100 ) )
df_songs= df2[["title", "artist", "tag",  "year", "len_lyrics", "song_size_bytes", "song_compressed_size_gzip", "ratio_compress"]]

In [39]:
df_songs = df_songs.with_columns( 
    pl.col("ratio_compress").map_elements(lambda x: int(x), return_dtype=pl.Int64 ).alias("ratio_compress")
)

In [41]:
df_songs

title,artist,tag,year,len_lyrics,song_size_bytes,song_compressed_size_gzip,ratio_compress
str,str,str,i64,i64,i64,i64,i64
"""Killa Cam""","""Cam'ron""","""rap""",2004,3968,3968,2108,53
"""Can I Live""","""JAY-Z""","""rap""",1996,2885,2887,2080,72
"""Forgive Me Father""","""Fabolous""","""rap""",2003,2807,2807,1892,67
"""Down and Out""","""Cam'ron""","""rap""",2004,3851,3859,2248,58
"""Fly In""","""Lil Wayne""","""rap""",2005,2144,2144,1496,70
…,…,…,…,…,…,…,…
"""Storm Detonation""","""ZYKLON""","""metal""",2001,910,910,668,73
"""Zycloned""","""ZYKLON""","""metal""",2001,1015,1015,712,70
"""Terrordrome""","""ZYKLON""","""metal""",2001,859,859,700,81
"""Worm World""","""ZYKLON""","""metal""",2001,966,966,732,76


In [43]:
df_songs.write_csv('Repetitive_songs.csv', separator=",")
df_songs

title,artist,tag,year,len_lyrics,song_size_bytes,song_compressed_size_gzip,ratio_compress
str,str,str,i64,i64,i64,i64,i64
"""Killa Cam""","""Cam'ron""","""rap""",2004,3968,3968,2108,53
"""Can I Live""","""JAY-Z""","""rap""",1996,2885,2887,2080,72
"""Forgive Me Father""","""Fabolous""","""rap""",2003,2807,2807,1892,67
"""Down and Out""","""Cam'ron""","""rap""",2004,3851,3859,2248,58
"""Fly In""","""Lil Wayne""","""rap""",2005,2144,2144,1496,70
…,…,…,…,…,…,…,…
"""Storm Detonation""","""ZYKLON""","""metal""",2001,910,910,668,73
"""Zycloned""","""ZYKLON""","""metal""",2001,1015,1015,712,70
"""Terrordrome""","""ZYKLON""","""metal""",2001,859,859,700,81
"""Worm World""","""ZYKLON""","""metal""",2001,966,966,732,76
