In [1]:
import math

import pandas as pd
from langdetect import DetectorFactory, detect
from tqdm import tqdm

# Load the lyrics table from a CSV located in the working directory.
INPUT_CSV = "lyrics_dataframe.csv"
df = pd.read_csv(INPUT_CSV)

In [2]:
# The "en" column holds the original-language lyrics, so call it "orig";
# "lang" starts as the placeholder string "nan" until detection fills it in.
df = (
    df
    .rename(columns={"en": "orig"})
    .assign(lang="nan")
)

In [3]:
# Preview the first three rows after the rename.
df.head(n=3)

Unnamed: 0,artist_name,album_name,year,title,number,orig,fr,lang
0,The Beatles,Beatles For Sale,1964.0,Rock and Roll Music,4.0,chorus\nJust let me hear some of that rock and...,Laisse moi juste écouter un peu de cette musiq...,
1,Imagine Dragons,Infinity Blade II [OST],2011.0,Monster,3.0,Ever since I could remember\nEverything inside...,Aussi longtemps que je m'en souvienne\nTout ce...,
2,The Beatles,Let It Be,1970.0,I Me Mine,4.0,"All through the day I me mine, I me mine, I me...","Tout le jour : je, moi, à moi, je, moi, à moi,...",


---

### Remove small strings (not lyrics)

In [4]:
def replace_short_with_nan(value, min_length=100):
    """Replace strings shorter than `min_length` characters with NaN.

    Intended for element-wise use (e.g. `Series.apply`) to discard entries
    that are too short to be real lyrics.

    Parameters
    ----------
    value : object
        The cell value to inspect. Non-string values are returned unchanged.
    min_length : int, default 100
        Minimum number of characters a string must have to be kept.

    Returns
    -------
    object
        `math.nan` if `value` is a string with fewer than `min_length`
        characters, otherwise `value` itself.
    """
    # Only strings are length-checked; NaN and other types pass through as-is.
    if not isinstance(value, str):
        return value
    if len(value) < min_length:
        return math.nan
    return value

# Blank out too-short entries in both text columns.
for text_col in ('orig', 'fr'):
    df[text_col] = df[text_col].apply(replace_short_with_nan)

### Remove rows where both `orig` and `fr` are NaN

In [5]:
# how='all': a row is dropped only when BOTH text columns are missing;
# rows that still have one usable text are kept for the language check.
df.dropna(
    how='all',
    subset=['orig', 'fr'],
    inplace=True,
)

### Keep French text only in the `fr` column, record the original language in `lang`, and drop rows that have no French version at all or whose two columns are both French.

In [6]:
# langdetect can return different results between runs on short or ambiguous
# text unless its factory is seeded; fix the seed so the notebook is reproducible.
DetectorFactory.seed = 0


def _detect_lang(text):
    """Return the detected language code of `text`, or the string "nan".

    "nan" is returned when `text` is not a string (e.g. a NaN cell) or when
    language detection fails.
    """
    if not isinstance(text, str):
        return "nan"
    try:
        return detect(text)
    except Exception:
        # Best effort: a detection failure on one row must not abort the whole
        # pass. Unlike a bare `except`, this still lets KeyboardInterrupt /
        # SystemExit through so the long-running loop can be stopped.
        return "nan"


fr_fr_idx = []   # index labels of rows where both columns were detected as French
not_fr_idx = []  # index labels of rows where neither column was detected as French

for idx in tqdm(df.index):
    fr_lang = _detect_lang(df.at[idx, 'fr'])
    orig_lang = _detect_lang(df.at[idx, 'orig'])

    fr_is_french = fr_lang == 'fr'
    orig_is_french = orig_lang == 'fr'

    if orig_is_french and not fr_is_french:
        # The French text sits in the wrong column: swap the two cells and
        # record the language of the text that ends up in 'orig'.
        df.loc[idx, ['fr', 'orig']] = df.loc[idx, ['orig', 'fr']].values
        df.loc[idx, 'lang'] = fr_lang
    elif fr_is_french and not orig_is_french:
        # Columns are already laid out correctly; just record the original language.
        df.loc[idx, 'lang'] = orig_lang
    elif fr_is_french and orig_is_french:
        fr_fr_idx.append(idx)
    else:
        not_fr_idx.append(idx)

100%|██████████| 101508/101508 [17:39<00:00, 95.78it/s] 


In [10]:
# Number of both-French rows, then number of rows with no French text.
print(len(fr_fr_idx), len(not_fr_idx), sep="\n")

1655
564


In [11]:
# Remove every flagged row in a single pass (the two label lists are disjoint).
rows_to_drop = fr_fr_idx + not_fr_idx
df = df.drop(index=rows_to_drop)

In [12]:
# Preview the first five rows of the processed table.
df.head(n=5)

Unnamed: 0,artist_name,album_name,year,title,number,orig,fr,lang
0,The Beatles,Beatles For Sale,1964.0,Rock and Roll Music,4.0,chorus\nJust let me hear some of that rock and...,Laisse moi juste écouter un peu de cette musiq...,en
1,Imagine Dragons,Infinity Blade II [OST],2011.0,Monster,3.0,Ever since I could remember\nEverything inside...,Aussi longtemps que je m'en souvienne\nTout ce...,en
2,The Beatles,Let It Be,1970.0,I Me Mine,4.0,"All through the day I me mine, I me mine, I me...","Tout le jour : je, moi, à moi, je, moi, à moi,...",en
3,The Beatles,Paperback Writer [Single],1966.0,Rain,2.0,If the rain comes they run and hide their head...,"Quand la pluie arrive, ils courent et protègen...",en
4,Imagine Dragons,Divergente 2 - L'insurrection (The Divergent S...,2015.0,Warriors,7.0,"Warriors\nAs a child, you would wait\nAnd watc...","(Les guerriers\nEnfant, tu attendais\nEt garda...",en


In [13]:
# Write the processed table to disk without the DataFrame index.
OUTPUT_CSV = "lyrics_dataframe_processed.csv"
df.to_csv(OUTPUT_CSV, index=False)