In [None]:
from bs4 import BeautifulSoup
import requests as rq 
import numpy as np
import re 
import os

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

## Webscraping (OOP Style)

In [None]:
class LyricScraper:
    """Scrape song lyrics from lyrics.com and store them as text files, one folder per artist.

    Attributes:
        save_destination: Root folder; lyrics are written to ``<save_destination>/<artist>/<title>``.
        verbose: When True, progress messages are printed.
        songs_pages: Dict mapping artist name -> URL of that artist's song-list page.
        base_url: Prefix prepended to every scraped lyric link.
        links: Dict mapping artist name -> list of lyric links; filled by
            ``extract_links_to_lyrics``.
    """

    # Seconds to wait for a response before a request is treated as failed.
    REQUEST_TIMEOUT = 30

    def __init__(self, songs_pages:dict,save_destination:str,verbose=True):
        self.save_destination = save_destination
        self.verbose = verbose
        self.songs_pages = songs_pages #Dict containing Artist and corresponding song pages
        self.base_url = "http://www.lyrics.com/"
        self.links = {}

    def extract_links_to_lyrics(self):
        """Visit the song page of each artist, collect the lyric links and store them in ``self.links``.

        Raises:
            requests.RequestException: If a song page cannot be fetched.
            ValueError: If a song page contains no ``<table class="tdata">``.
        """
        for artist, songs_page in self.songs_pages.items():
            if self.verbose:
                print(f"Extracting Links from {artist} @ {songs_page}")
            songs_response = rq.get(songs_page, timeout=self.REQUEST_TIMEOUT)
            songs_response.raise_for_status()
            # Name the parser explicitly so the result does not depend on which
            # optional parsers happen to be installed.
            songs_bs = BeautifulSoup(songs_response.text, "html.parser")
            table = songs_bs.find("table", {"class": "tdata"})
            if table is None:
                raise ValueError(f"No song table found for {artist} @ {songs_page}")

            links = []
            for row in table.find_all("tr")[1:]:  # Skip the header of the table
                cell = row.td
                anchor = cell.a if cell is not None else None
                href = anchor.get("href") if anchor is not None else None
                if href:  # Rows without a link are ignored instead of crashing
                    links.append(href)

            self.links[artist] = LyricScraper.links_cleaner(links)

    @staticmethod
    def links_cleaner(links:list):
        """Filter lyric links: keep the first link per song title and drop alternative versions.

        The title is the last path segment of the link, compared case-insensitively.
        Links whose title marks a remix, acoustic version, instrumental, mix or a
        "feat"/"featuring" version are removed.

        Args:
            links: Lyric links in page order.

        Returns:
            The remaining links, in their original order.
        """
        # Titles are lower-cased before matching. "feat" must stand on its own
        # (optionally as "featuring") so that titles such as "Defeated" survive.
        pattern = re.compile(
            r"remix|acoustic|instrumental|(?<![a-z])feat(?:uring)?(?![a-z])|mix"
        )
        seen_titles = set()
        cleaned_links = []
        for element in links:
            title_str = element.split("/")[-1].lower()
            if title_str in seen_titles or pattern.search(title_str):
                continue
            seen_titles.add(title_str)
            cleaned_links.append(element)
        return cleaned_links

    def _download_lyrics(self, artist, link):
        """Download one lyric page and write its cleaned text to the artist's folder.

        Returns:
            True on success; False if the request failed or the page has no ``<pre>`` block.
        """
        url = self.base_url+link
        title = url.split("/")[-1].replace("+"," ")

        save_dir = os.path.join(self.save_destination,artist)
        # Also creates missing parent folders and tolerates an existing folder.
        os.makedirs(save_dir, exist_ok=True)

        if self.verbose:
            print(f"Url updated to:{url}. Title:{title}. Artist:{artist}")

        try:
            response = rq.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except rq.RequestException as e:
            print(f"An error occured: {e}")
            return False

        lyrics_tag = BeautifulSoup(response.text, "html.parser").pre
        if lyrics_tag is None:
            print(f"An error occured: no lyrics found at {url}")
            return False

        # Collapse every run of non-alphanumeric characters into a single space.
        with open(os.path.join(save_dir, title), "w", encoding="utf-8") as file:
            file.write(re.sub(r"[^a-zA-Z0-9]+", ' ', lyrics_tag.text))
        return True

    def get_lyrics(self,ammount="all"):
        """Download the lyrics behind ``self.links`` into one folder per artist.

        Args:
            ammount: ``"all"`` downloads every link; ``"balanced"`` downloads as many
                songs per artist as the artist with the fewest links has; an ``int``
                downloads that many songs per artist and must not exceed the smallest
                number of links any artist has.

        Returns:
            True when every requested download succeeded. False if ``ammount`` is not
            supported or a download failed; downloading stops at the first failure.
        """
        number_of_links = [len(links) for links in self.links.values()]

        for artist, lyric_links in self.links.items():
            if ammount=="all":
                selected = lyric_links
            elif ammount=="balanced":
                smallest = min(number_of_links)
                if self.verbose:
                    print(f"Going for {smallest} downloads. Artist:{artist}")
                selected = lyric_links[:smallest]
            elif isinstance(ammount,int) and (ammount <= min(number_of_links)):
                if self.verbose:
                    print(f"Going for {ammount} downloads. Artist: {artist}")
                # A negative count selects nothing rather than slicing from the end.
                selected = lyric_links[:max(ammount, 0)]
            else:
                return False

            for link in selected:
                if not self._download_lyrics(artist, link):
                    return False

        print("Finished!")
        return True
        
                
            
            
            
        

In [None]:
# Root folder for the downloaded lyrics (one sub-folder per artist).
# Set the LYRICS_DATA_DIR environment variable to use a different location;
# otherwise the original local path is used.
save_dir = os.environ.get(
    "LYRICS_DATA_DIR",
    "/home/michaelhaag/Documents/Coding/Spiced_Code/Code-Repository/Week_04/Data",
)
# Artist name -> URL of the artist's song-list page on lyrics.com.
ls = LyricScraper(
    {
        "Charli XCX": "https://www.lyrics.com/artist.php?name=Charli-XCX&aid=2391950&o=1",
        "Terror Jr": "https://www.lyrics.com/artist.php?name=Terror-Jr&aid=3252479&o=1",
    },
    save_dir,
)


**Note**: I wrote a function that updated the URL to visit the song pages of different artists in the following way:  
1. The user supplies a list of names (e.g. ["Charli XCX", "Terror Jr"]).
2. Update the URL to the artist page.
3. Go to the song page.   
Unfortunately this didn't work because the pattern for generating the artist-page URL wasn't general (e.g. lyrics.com/artists/Charli-XCX -> Charli XCX artist page, but lyrics.com/artists/Terror-Jr -/-> Terror Jr artist page).   
Therefore, I removed said function (maybe I'll implement it later on).

In [None]:
# Fetch each artist's song page and fill ls.links with the filtered lyric links (performs HTTP requests).
ls.extract_links_to_lyrics()

In [None]:
# Download the same number of songs for every artist (as many as the artist with the fewest links has)
# and write them as text files below save_dir.
ls.get_lyrics(ammount="balanced")

## Building a Natural Language Classifier Model (basically just a wrapper for the sklearn models)

In [None]:
class LyricsClassifier:
    """Thin wrapper around sklearn text pipelines for predicting the artist of a lyric.

    Attributes:
        save_dir: Folder that contains one sub-folder of lyric text files per artist.
        text_data: Dict mapping artist name -> list of lyric texts; filled by
            ``read_text_files``.
    """

    def __init__ (self,save_dir:str):
        self.save_dir = save_dir #Where are the text files stored?
        self.text_data = {} #This dictionary contains as keys artists and as values a list of their lyrics

    def read_text_files(self):
        """Read every artist folder below ``save_dir`` into ``self.text_data``.

        The folder name is used as the artist name. Entries of ``save_dir`` that are
        not directories, and entries of an artist folder that are not regular files,
        are skipped. Folders and files are processed in sorted order so repeated runs
        yield the same corpus order.
        """
        for artist_name in sorted(os.listdir(self.save_dir)):
            artist_dir = os.path.join(self.save_dir, artist_name)
            if not os.path.isdir(artist_dir):
                continue
            lyrics = []
            for text_file in sorted(os.listdir(artist_dir)):
                file_path = os.path.join(artist_dir, text_file)
                if not os.path.isfile(file_path):
                    continue
                with open(file_path, "r", encoding="utf-8") as file:
                    lyrics.append(file.read())
            self.text_data[artist_name] = lyrics

    def get_corpus_and_labels(self):
        """Flatten ``self.text_data`` into two parallel lists.

        Returns:
            Tuple ``(corpus, labels)``: ``corpus[i]`` is a lyric text and ``labels[i]``
            is the artist it belongs to.
        """
        corpus = []
        labels = []
        for artist, lyrics in self.text_data.items():
            for lyric in lyrics:
                corpus.append(lyric)
                labels.append(artist)
        return (corpus, labels)

    def train_logreg_model(self,corpus,label):
        """Fit a TF-IDF + logistic regression pipeline.

        Args:
            corpus: List of lyric texts.
            label: List of artist names, parallel to ``corpus``.

        Returns:
            The fitted sklearn pipeline.
        """
        tf_vec = TfidfVectorizer(stop_words="english",ngram_range=(1,2),max_df=0.9)
        lrm = LogisticRegression()
        model = make_pipeline(tf_vec,lrm)
        # Fit on the labels that were passed in, not on a notebook-level global.
        model.fit(corpus,label)

        return model

    def train_nb_model(self,corpus,label):
        """Fit a TF-IDF + multinomial naive Bayes pipeline.

        Args:
            corpus: List of lyric texts.
            label: List of artist names, parallel to ``corpus``.

        Returns:
            The fitted sklearn pipeline.
        """
        tf_vec = TfidfVectorizer(stop_words="english",ngram_range=(1,2),max_df=0.9)
        nbm = MultinomialNB(alpha=1)
        model = make_pipeline(tf_vec,nbm)
        # Fit on the labels that were passed in, not on a notebook-level global.
        model.fit(corpus,label)

        return model

    def predict(self,model,new_text):
        """Return the label ``model`` predicts for the single string ``new_text``."""
        # The model expects a collection of documents, so wrap the string in a list.
        prediction = model.predict([new_text])
        return prediction[0]
            
                
                    

                    
        
        

In [None]:
# Classifier wrapper that reads the lyric files stored below save_dir.
lcm = LyricsClassifier(save_dir)

In [None]:
# Load every artist's lyric files into lcm.text_data.
lcm.read_text_files()

In [None]:
# Parallel lists: corpus[i] is a lyric text, labels[i] the artist it belongs to.
corpus,labels = lcm.get_corpus_and_labels()

In [None]:
# Fit the TF-IDF + logistic regression pipeline on the full corpus (no train/test split is made here).
log_reg_model = lcm.train_logreg_model(corpus,labels)

In [None]:
# Ask the logistic regression pipeline which artist this line of text is attributed to.
lcm.predict(log_reg_model,"I want to meet my maker")

In [None]:
# Second classifier wrapper on the same folder.
# NOTE(review): the cells that follow keep calling lcm, so lcm2 is never used — confirm which instance was intended.
lcm2 = LyricsClassifier(save_dir)

In [None]:
# Re-read the lyric files into lcm.text_data.
# NOTE(review): this reloads lcm, not the lcm2 instance created in the previous cell — confirm intent.
lcm.read_text_files()

In [None]:
# Rebuild the parallel corpus/label lists from lcm.text_data.
corpus,labels = lcm.get_corpus_and_labels()

In [None]:
# Fit the TF-IDF + multinomial naive Bayes pipeline on the full corpus.
nb_model = lcm.train_nb_model(corpus,labels)

In [None]:
# Ask the naive Bayes pipeline which artist this line of text is attributed to.
lcm.predict(nb_model,"Smile mouth!")

## The Count Vectorizer (just playing around) 

In [None]:
# Count vectorizer over unigrams and bigrams, with sklearn's built-in English stop-word list removed.
cv = CountVectorizer(stop_words="english",ngram_range=(1,2))

In [None]:
# List of lyric texts loaded for the "Charli XCX" folder.
lyrics_charli = lcm.text_data["Charli XCX"]

In [None]:
# Learn the vocabulary (unigrams and bigrams) from the Charli XCX lyrics.
cv.fit(lyrics_charli) 

In [None]:
# Shape of the document-term matrix: (number of lyrics, vocabulary size).
cv.transform(lyrics_charli).shape

In [None]:
# Dense copy of the sparse count matrix, for inspection only.
vec_lyrics = cv.transform(lyrics_charli).todense()

## The TF-Idf Transformer (just playing around)

In [None]:
# TF-IDF vectorizer over unigrams and bigrams, with English stop words removed.
tf_vec = TfidfVectorizer(stop_words="english", ngram_range=(1,2))

In [None]:
# Learn vocabulary and inverse document frequencies from the Charli XCX lyrics.
tf_vec.fit(lyrics_charli)

In [None]:
# Dense copy of the TF-IDF matrix of the Charli XCX lyrics, for inspection only.
X_trans = tf_vec.transform(lyrics_charli).todense()

## Lyrics Classification (manually) 

In [None]:
# Number of lyric texts loaded for Charli XCX.
len(lcm.text_data["Charli XCX"])

In [None]:
# Number of lyric texts loaded for Terror Jr.
len(lcm.text_data["Terror Jr"])

In [None]:
# Combined corpus: all Charli XCX lyrics first, followed by all Terror Jr lyrics.
lyrics_corpus = lcm.text_data["Charli XCX"] + lcm.text_data["Terror Jr"]

In [None]:
# Total number of documents in the combined corpus.
len(lyrics_corpus)

In [None]:
# One label per document, in the same order as lyrics_corpus (Charli XCX songs
# first, then Terror Jr). The counts come from the loaded data instead of a
# hard-coded 34, so the labels always line up with the corpus.
labels = (
    ["Charli XCX"] * len(lcm.text_data["Charli XCX"])
    + ["Terror Jr"] * len(lcm.text_data["Terror Jr"])
)

### TfidfVectorizer 

In [None]:
# TF-IDF over unigrams and bigrams without English stop words;
# max_df=0.9 drops terms that occur in more than 90% of the documents.
tf_vec =TfidfVectorizer(stop_words="english",ngram_range=(1,2),max_df=0.9) 

In [None]:
# Learn vocabulary and inverse document frequencies from the combined two-artist corpus.
tf_vec.fit(lyrics_corpus)

In [None]:
# Dense TF-IDF matrix of the combined corpus, one row per document.
lyrics_corpus_trans = tf_vec.transform(lyrics_corpus).todense()
# Number of rows; expected to equal len(lyrics_corpus).
len(lyrics_corpus_trans)

## Train the Logistic Regression Model

In [None]:
# Logistic regression classifier with sklearn's default settings.
lr = LogisticRegression()

In [None]:
# Convert the dense matrix to a plain ndarray before fitting; labels must hold one entry per row.
lr.fit(np.asarray(lyrics_corpus_trans),labels)

## Make Predictions

In [None]:
# Vectorize a new sentence with the fitted TF-IDF vocabulary and predict its artist.
sentence="smash into pieces "
lr.predict(tf_vec.transform([sentence]))