In [None]:
import pandas as pd
import re
from collections import defaultdict
from typing import List, Tuple, Dict

def find_word_count(song_lyrics: str) -> int:
    # Handle NaN/None values and non-string inputs
    if pd.isna(song_lyrics) or not isinstance(song_lyrics, str):
        return 0
    return len(song_lyrics.split())

def clean_words_occurence(words_occurence: Dict[str, int]) -> Dict[str, int]:
    words_filter = {"", " ", "a", "of", "it", "the", "are", "to", "It", "in", "i", "I", "I'm", 
                   "my", "it's", "and", "be", "on", "in", "you", "for", "The", "your", "they", "im"}
    
    for word in words_filter:
        words_occurence.pop(word, None)
    
    for word in list(words_occurence.keys()):
        if any(ord(char) > 127 for char in word):
            words_occurence.pop(word)
    
    return words_occurence

def extract_lyrics(song_lyrics: str) -> Dict[str, int]:
    words_occurence = defaultdict(int)
    if pd.isna(song_lyrics) or not isinstance(song_lyrics, str):
        return words_occurence
        
    words = song_lyrics.split()
    for word in words:
        words_occurence[word] += 1
    
    return words_occurence

def clean_top_occurence(top_occurences: List[Tuple[str, int]]) -> List[Tuple[str, int]]:
    symbols_filter = {')', '(', '.', '"', ',', '\\', ' ', '!', '?', '#', '@', '&', "'", '-', '*', '/', '%'}
    
    cleaned = []
    for word, count in top_occurences:
        cleaned_word = ''.join(char for char in word if char not in symbols_filter)
        if cleaned_word:  
            cleaned.append((cleaned_word, count))
    
    return cleaned

def get_top_result(words_occurence: Dict[str, int], minimum_occurence: int) -> List[Tuple[str, int]]:
    top_occurences = [(word, count) for word, count in words_occurence.items() if count > minimum_occurence]
    top_occurences.sort(key=lambda x: x[1], reverse=True)
    return top_occurences

def longest_word(words_occurence: Dict[str, int]) -> Tuple[str, int]:
    all_occurences = get_top_result(words_occurence, 0)
    all_occurences = clean_top_occurence(all_occurences)
    
    if not all_occurences:
        return ("Not Found", 0)
    
    longest_word_entry = max(all_occurences, key=lambda x: len(x[0]))
    return (longest_word_entry[0], len(longest_word_entry[0]))

def average_word_length(words_occurence: Dict[str, int]) -> float:
    words_occurence = clean_words_occurence(words_occurence)
    all_occurences = get_top_result(words_occurence, 1)
    all_occurences = clean_top_occurence(all_occurences)
    
    if not all_occurences:
        return 0.0
    
    total_chars = sum(len(word) for word, _ in all_occurences)
    word_count = len(all_occurences)
    
    return total_chars / word_count

def words_repetition_percentage(top_occurrences: List[Tuple[str, int]], words_count: int) -> List[Tuple[int, float, str]]:
    result = []
    missing_counter = 0
    
    if not top_occurrences:
        return [(0, 0.0, "Not Found")] * 5
    
    if len(top_occurrences) >= 5:
        for i in range(5):
            count = top_occurrences[i][1] if i < len(top_occurrences) else 0
            percentage = (count * 100 / words_count) if words_count > 0 else 0.0
            word = top_occurrences[i][0] if i < len(top_occurrences) else "Not Found"
            result.append((count, percentage, word))
    else:
        for i in range(len(top_occurrences)):
            count = top_occurrences[i][1]
            percentage = (count * 100 / words_count) if words_count > 0 else 0.0
            result.append((count, percentage, top_occurrences[i][0]))
            missing_counter += 1
        
        for i in range(5 - missing_counter):
            result.append((0, 0.0, "Not Found"))
    
    return result[:5] 

def unique_word_count(words_occurence: Dict[str, int], word_count: int) -> float:
    unique_occurences = get_top_result(words_occurence, 0)
    unique_occurences = clean_top_occurence(unique_occurences)
    
    unique_count = sum(1 for _, count in unique_occurences if count == 1)
    
    return (unique_count * 100 / word_count) if word_count > 0 else 0.0

def is_explicit(lyrics: List[Tuple[str, int]]) -> int:
    explicit_words = {
        "fuck", "shit", "bitch", "ass", "damn", "hell", "goddamn", "cunt", "dick", "bastard", "slut", "whore",
        "cock", "pussy", "motherfucker", "asshole", "prick", "faggot", "douchebag", "balls", "twat", "cocksucker",
        "piss", "sex", "blowjob", "cum", "suck", "anal", "jerk off", "masturbate", "nude", "naked", "breasts",
        "orgasm", "dildo", "gangbang", "horny", "porno", "porn", "fucker", "fap", "freak", "cockblock", "fuckbuddy",
        "fuckface", "one-night-stand", "fucktard", "tits", "fuckhead", "shitface", "prickhead", "asswipe", "cockroach",
        "assclown", "asslicker", "shitstorm", "nutjob", "dickhead", "retard", "moron", "spaz", "wanker", "jackass",
        "scumbag", "garbage", "skank", "shithead", "cockface", "whore", "bastard", "cock", "freakshow", "pisshead",
        "dickwad", "asshole", "spic", "chink", "kike", "gook", "towelhead", "dyke", "bitchass", "fuckface", "freak",
        "asswipe", "twat", "nigger", "bastard", "slutty", "sex tape", "fetish", "threesome", "creampie", "pornstar"
    }
    
    for word, _ in lyrics:
        if word.lower() in explicit_words:
            return 1
    return 0

def process_lyrics(df: pd.DataFrame, lyrics_column: str = 'lyrics_cleaned') -> pd.DataFrame:
    results = []
    
    for lyrics in df[lyrics_column]:
        word_count = find_word_count(lyrics)
        words_occurence = extract_lyrics(lyrics)
        words_occurence = clean_words_occurence(words_occurence)
        top_occurences = get_top_result(words_occurence, 3)
        top_occurences = clean_top_occurence(top_occurences)
        
        avg_word_len = average_word_length(words_occurence)
        top_words_meta = words_repetition_percentage(top_occurences, word_count)
        longest_word_info = longest_word(words_occurence)
        unique_percentage = unique_word_count(words_occurence, word_count)
        explicit_flag = is_explicit(top_occurences)
        
        result_row = {
            'words_count': word_count,
            'words_average_size': avg_word_len,
            '1st_word': top_words_meta[0][2],
            '1st_occurence': top_words_meta[0][0],
            '1st_word_percentage': top_words_meta[0][1],
            '2nd_word': top_words_meta[1][2],
            '2nd_occurence': top_words_meta[1][0],
            '2nd_word_percentage': top_words_meta[1][1],
            '3rd_word': top_words_meta[2][2],
            '3rd_occurence': top_words_meta[2][0],
            '3rd_word_percentage': top_words_meta[2][1],
            '4th_word': top_words_meta[3][2],
            '4th_occurence': top_words_meta[3][0],
            '4th_word_percentage': top_words_meta[3][1],
            '5th_word': top_words_meta[4][2],
            '5th_occurence': top_words_meta[4][0],
            '5th_word_percentage': top_words_meta[4][1],
            'longest_word': longest_word_info[0],
            'longest_word_length': longest_word_info[1],
            'unique_word_percentage': unique_percentage,
            'Explicitness': explicit_flag
        }
        
        results.append(result_row)
    
    return pd.DataFrame(results)



In [10]:
import pandas as pd
df = pd.read_csv('Final_df_cleaned_lyrics.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed', case=False)]
df = df.drop(['lyrics_not_cleaned','lyrics_url','song_id'],axis=1)
df_lyrics_only = df[['lyrics_cleaned']]
df_lyrics_only

Unnamed: 0,lyrics_cleaned
0,I hopped off the plane at LAX With a dream and...
1,"Oh, caught in a bad romance Oh, caught in a ba..."
2,You would not believe your eyes if ten million...
3,Shawty's like a melody in my head That I can't...
4,I need another story Something to get off my c...
...,...
11100,No Lyrics Found
11101,"Оставь в покое меня Я был последним, да, подон..."
11102,"Unleaded Oh, oh, oh Yeah, yeah, yeah, yeah, ye..."
11103,Issa void behind me I took so many steps I can...


In [None]:
processed_df = process_lyrics(df_lyrics_only)

In [12]:
processed_df

Unnamed: 0,words_count,words_average_size,1st_word,1st_occurence,1st_word_percentage,2nd_word,2nd_occurence,2nd_word_percentage,3rd_word,3rd_occurence,...,4th_word,4th_occurence,4th_word_percentage,5th_word,5th_occurence,5th_word_percentage,longest_word,longest_word_length,unique_word_percentage,Explicitness
0,469,4.277778,song,11,2.345416,like,10,2.132196,party,9,...,hands,8,1.705757,playin,8,1.705757,butterflies,11,21.535181,0
1,566,4.033333,want,41,7.243816,bad,36,6.360424,romance,25,...,love,14,2.473498,love,14,2.473498,leatherstudded,14,5.653710,0
2,343,4.478261,Id,11,3.206997,Cause,8,2.332362,believe,6,...,as,6,1.749271,asleep,6,1.749271,everything,10,25.072886,0
3,492,4.520548,like,25,5.081301,me,11,2.235772,melody,10,...,got,10,2.032520,singing,10,2.032520,replayayayay,12,13.617886,0
4,339,4.155556,all,15,4.424779,secrets,10,2.949853,away,10,...,gonna,8,2.359882,give,8,2.359882,everything,10,21.533923,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11100,3,0.000000,Not Found,0,0.000000,Not Found,0,0.000000,Not Found,0,...,Not Found,0,0.000000,Not Found,0,0.000000,Lyrics,6,100.000000,0
11101,431,0.000000,Not Found,0,0.000000,Not Found,0,0.000000,Not Found,0,...,Not Found,0,0.000000,Not Found,0,0.000000,Not Found,0,0.000000,0
11102,405,3.680000,mon,17,4.197531,you,12,2.962963,quand,9,...,mes,9,2.222222,cest,9,2.222222,Aujourdhui,10,18.518519,0
11103,376,3.927273,cant,6,1.595745,And,5,1.329787,no,5,...,me,5,1.329787,win,5,1.329787,underestimate,13,44.680851,0


In [13]:
processed_df.to_csv('processed_lyrics.csv', index=False)

In [14]:
processed_df

Unnamed: 0,words_count,words_average_size,1st_word,1st_occurence,1st_word_percentage,2nd_word,2nd_occurence,2nd_word_percentage,3rd_word,3rd_occurence,...,4th_word,4th_occurence,4th_word_percentage,5th_word,5th_occurence,5th_word_percentage,longest_word,longest_word_length,unique_word_percentage,Explicitness
0,469,4.277778,song,11,2.345416,like,10,2.132196,party,9,...,hands,8,1.705757,playin,8,1.705757,butterflies,11,21.535181,0
1,566,4.033333,want,41,7.243816,bad,36,6.360424,romance,25,...,love,14,2.473498,love,14,2.473498,leatherstudded,14,5.653710,0
2,343,4.478261,Id,11,3.206997,Cause,8,2.332362,believe,6,...,as,6,1.749271,asleep,6,1.749271,everything,10,25.072886,0
3,492,4.520548,like,25,5.081301,me,11,2.235772,melody,10,...,got,10,2.032520,singing,10,2.032520,replayayayay,12,13.617886,0
4,339,4.155556,all,15,4.424779,secrets,10,2.949853,away,10,...,gonna,8,2.359882,give,8,2.359882,everything,10,21.533923,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11100,3,0.000000,Not Found,0,0.000000,Not Found,0,0.000000,Not Found,0,...,Not Found,0,0.000000,Not Found,0,0.000000,Lyrics,6,100.000000,0
11101,431,0.000000,Not Found,0,0.000000,Not Found,0,0.000000,Not Found,0,...,Not Found,0,0.000000,Not Found,0,0.000000,Not Found,0,0.000000,0
11102,405,3.680000,mon,17,4.197531,you,12,2.962963,quand,9,...,mes,9,2.222222,cest,9,2.222222,Aujourdhui,10,18.518519,0
11103,376,3.927273,cant,6,1.595745,And,5,1.329787,no,5,...,me,5,1.329787,win,5,1.329787,underestimate,13,44.680851,0


In [16]:
merged_df = pd.concat([df, processed_df], axis=1)

In [17]:
merged_df

Unnamed: 0,artist,danceability,valence,tempo,liveness,song,popularity,speechiness,mood,loudness,...,4th_word,4th_occurence,4th_word_percentage,5th_word,5th_occurence,5th_word_percentage,longest_word,longest_word_length,unique_word_percentage,Explicitness
0,Miley Cyrus,0.652,0.470,96.021,0.0886,Party In The U.S.A.,78,0.0420,,-4.667,...,hands,8,1.705757,playin,8,1.705757,butterflies,11,21.535181,0
1,Lady Gaga,0.696,0.714,119.001,0.0842,Bad Romance,78,0.0363,,-3.755,...,love,14,2.473498,love,14,2.473498,leatherstudded,14,5.653710,0
2,Owl City,0.513,0.461,180.118,0.1180,Fireflies,78,0.0439,,-6.800,...,as,6,1.749271,asleep,6,1.749271,everything,10,25.072886,0
3,Iyaz,0.706,0.195,91.031,0.1680,Replay,72,0.0708,,-6.323,...,got,10,2.032520,singing,10,2.032520,replayayayay,12,13.617886,0
4,OneRepublic,0.516,0.376,148.021,0.1150,Secrets,76,0.0366,,-6.223,...,gonna,8,2.359882,give,8,2.359882,everything,10,21.533923,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11100,Lutov Doko,0.564,0.715,76.583,0.6670,Simghera,47,0.0536,Energetic,-8.206,...,Not Found,0,0.000000,Not Found,0,0.000000,Lyrics,6,100.000000,0
11101,Ramil',0.789,0.457,135.100,0.1160,Из-за тебя,60,0.1820,Happy,-8.621,...,Not Found,0,0.000000,Not Found,0,0.000000,Not Found,0,0.000000,0
11102,DYSTINCT,0.760,0.544,100.021,0.5240,Habiba (feat. Tawsen),67,0.0539,Energetic,-5.032,...,mes,9,2.222222,cest,9,2.222222,Aujourdhui,10,18.518519,0
11103,Greg Willen,0.657,0.342,90.006,0.7700,Marvin Vettori - The Italian Dream,63,0.1740,Energetic,-4.554,...,me,5,1.329787,win,5,1.329787,underestimate,13,44.680851,0


In [18]:
merged_df=merged_df.drop(['lyrics_cleaned'],axis=1)

In [19]:
merged_df

Unnamed: 0,artist,danceability,valence,tempo,liveness,song,popularity,speechiness,mood,loudness,...,4th_word,4th_occurence,4th_word_percentage,5th_word,5th_occurence,5th_word_percentage,longest_word,longest_word_length,unique_word_percentage,Explicitness
0,Miley Cyrus,0.652,0.470,96.021,0.0886,Party In The U.S.A.,78,0.0420,,-4.667,...,hands,8,1.705757,playin,8,1.705757,butterflies,11,21.535181,0
1,Lady Gaga,0.696,0.714,119.001,0.0842,Bad Romance,78,0.0363,,-3.755,...,love,14,2.473498,love,14,2.473498,leatherstudded,14,5.653710,0
2,Owl City,0.513,0.461,180.118,0.1180,Fireflies,78,0.0439,,-6.800,...,as,6,1.749271,asleep,6,1.749271,everything,10,25.072886,0
3,Iyaz,0.706,0.195,91.031,0.1680,Replay,72,0.0708,,-6.323,...,got,10,2.032520,singing,10,2.032520,replayayayay,12,13.617886,0
4,OneRepublic,0.516,0.376,148.021,0.1150,Secrets,76,0.0366,,-6.223,...,gonna,8,2.359882,give,8,2.359882,everything,10,21.533923,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11100,Lutov Doko,0.564,0.715,76.583,0.6670,Simghera,47,0.0536,Energetic,-8.206,...,Not Found,0,0.000000,Not Found,0,0.000000,Lyrics,6,100.000000,0
11101,Ramil',0.789,0.457,135.100,0.1160,Из-за тебя,60,0.1820,Happy,-8.621,...,Not Found,0,0.000000,Not Found,0,0.000000,Not Found,0,0.000000,0
11102,DYSTINCT,0.760,0.544,100.021,0.5240,Habiba (feat. Tawsen),67,0.0539,Energetic,-5.032,...,mes,9,2.222222,cest,9,2.222222,Aujourdhui,10,18.518519,0
11103,Greg Willen,0.657,0.342,90.006,0.7700,Marvin Vettori - The Italian Dream,63,0.1740,Energetic,-4.554,...,me,5,1.329787,win,5,1.329787,underestimate,13,44.680851,0


In [20]:
merged_df.to_csv('Database.csv', index=False)