# In [None]:
import os

import contractions
import nltk
import pandas as pd

# Input table with the raw lyrics and the two output tables written below.
input_csv = "in/tables/lyrics.csv"
output_csv = "out/tables/filtered_lyrics.csv"
output_csv_words = "out/tables/filtered_lyrics_words.csv"

# Directory the NLTK resources are downloaded to and loaded from.
nltk_path = 'in/tables/'
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check.
os.makedirs(nltk_path, exist_ok=True)

# Make NLTK search the download directory when it loads resources.
nltk.data.path.append(nltk_path)

# Download the resources used by the stopword list, the tokenizer, the
# POS tagger and the English vocabulary. 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' are the resource names that NLTK 3.9+
# looks up for word_tokenize and pos_tag; the older names are kept so that
# earlier NLTK releases keep working.
for _resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'words',
):
    nltk.download(_resource, download_dir=nltk_path)

from nltk.corpus import stopwords, words
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Lowercased English vocabulary; tokens outside this set are dropped later.
english_vocab = {w.lower() for w in words.words()}

# Read the input CSV
df = pd.read_csv(input_csv, encoding='utf-8')

# Load additional stopwords from a CSV file. The file is assumed to hold one
# stopword per row in its first column with no header row, so header=None is
# passed; with the default header inference the first stopword would be
# consumed as the column name and silently lost.
data = pd.read_csv(
    'in/tables/stopwords.csv',
    encoding='utf-8',
    sep=',',
    decimal='.',
    header=None,
)
# str() guards against cells that pandas parsed as numbers.
additional_stopwords = {
    str(word).strip().lower() for word in data.iloc[:, 0] if pd.notna(word)
}
# A whitespace-only cell would otherwise leave an empty "stopword" behind.
additional_stopwords.discard('')

# Prepare stopwords by combining NLTK and additional stopwords
stop_words = set(stopwords.words('english')) | additional_stopwords

# Penn Treebank tags that are kept: nouns, verbs, adjectives, modals, adverbs.
allowed_tags = {
    'NN', 'NNS', 'NNP', 'NNPS',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
    'JJ', 'JJR', 'JJS',
    'MD',
    'RB', 'RBR', 'RBS',
}

def filter_pos_tags(text):
    """Return the tokens of *text* whose part-of-speech tag is allowed.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    only tokens whose tag is a member of the module-level ``allowed_tags``
    set are kept, in their original order.
    """
    tagged = pos_tag(word_tokenize(text))
    return [token for token, pos in tagged if pos in allowed_tags]

# Expand contractions. Missing lyrics are treated as empty text; passing them
# through str() would inject the literal word "nan" into the lyrics, the word
# counts and the exported word list.
df['lyrics'] = df['lyrics'].apply(
    lambda x: contractions.fix('' if pd.isna(x) else str(x))
)

# Drop everything except ASCII letters, whitespace and hyphens, then lowercase.
df["lyrics"] = df["lyrics"].str.replace(r"[^a-zA-Z\s\-]", "", regex=True).str.lower()
# Turn the kept hyphens into spaces so hyphenated compounds split into
# separate words. regex=False makes the literal replacement explicit instead
# of relying on the pandas-version-dependent default.
df["lyrics"] = df["lyrics"].str.replace("-", " ", regex=False)

# Tokenize every song once and reuse the tokens for both counts and the
# vocabulary/stopword filter.
lyric_tokens = df['lyrics'].apply(word_tokenize)

# Count total words before filtering stopwords and pos-tagging
df['total_words'] = lyric_tokens.apply(len)

# Count unique words before filtering stopwords and pos-tagging
df['unique_words'] = lyric_tokens.apply(lambda tokens: len(set(tokens)))

# Keep only known English words that are not stopwords
df['filtered_lyrics'] = lyric_tokens.apply(
    lambda tokens: [
        token for token in tokens
        if token in english_vocab and token not in stop_words
    ]
)

# Filter by part-of-speech tags
df['filtered_lyrics'] = df['filtered_lyrics'].apply(
    lambda kept: filter_pos_tags(' '.join(kept))
)

# Save to CSV
df.to_csv(output_csv, index=False, encoding='utf-8')

# Create list of tuples (title, artist, word), one per remaining word.
# The loop variable is named song_words so that it does not shadow the
# imported `words` corpus.
words_list = [
    (title, artist, word)
    for title, artist, song_words in zip(df["song"], df["artist"], df["filtered_lyrics"])
    for word in song_words
]

# Convert to DataFrame and save to CSV
df_words = pd.DataFrame(words_list, columns=["song", "artist", "word"])

# NOTE(review): output_csv is rebound here but the write below uses
# output_csv_words; the assignment is kept only in case later code reads it.
output_csv = 'filtered_lyrics_words.csv'
df_words.to_csv(output_csv_words, index=False, encoding='utf-8')
