# Creating the web scraper

In [1]:
#!pip install bs4

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
import sklearn.metrics as sm


import re

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import csv
import requests
import json
from pandas.io.json import json_normalize
from urllib.request import urlopen
from bs4 import BeautifulSoup


from neo4j import GraphDatabase 
from py2neo import Graph,Node,Relationship

# Getting the dataset

In [2]:
# Load the track dataset from the repository's data folder. Later cells read it
# through the module-level name `data` (e.g. get_track_name, get_track_artist).
data = pd.read_csv('../data/cleanedDatasetWithFeatures.csv')

# Extracting artist and track name

In [3]:
# Remove the two 'Unnamed' columns (presumably index columns left over from
# earlier CSV exports -- confirm against the data preparation notebook).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after the columns are gone no
# longer raises KeyError.
data = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [4]:
# Preview the first rows to check the remaining columns after the drop above.
data.head()

Unnamed: 0,title,rank,date,artist,url,region,streams,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,Argentina,253019.0,0.852,0.773,8.0,-2.921,0.0776,0.187,3e-05,0.159,0.907,102.034,195840.0
1,Despacito (Featuring Daddy Yankee),1,2017-04-07,Luis Fonsi,https://open.spotify.com/track/4aWmUDTfIPGksMN...,Dominican Republic,11681.0,0.66,0.786,2.0,-4.757,0.17,0.209,0.0,0.112,0.846,177.833,228200.0
2,ภาพจำ,1,2018-04-12,Pop Pongkool,https://open.spotify.com/track/4YRpjJaJpiI36HM...,Thailand,15477.0,0.557,0.585,5.0,-7.559,0.047,0.426,1e-06,0.144,0.464,81.922,274000.0
3,Te Boté - Remix,1,2018-04-27,"Nio Garcia, Casper Magico, Bad Bunny, Darell, ...",https://open.spotify.com/track/3V8UKqhEK5zBkBb...,Honduras,12869.0,0.903,0.675,11.0,-3.445,0.214,0.542,1.3e-05,0.0595,0.442,96.507,417920.0
4,Binibini,1,2021-04-16,Zack Tabudlo,https://open.spotify.com/track/2X5AFygz5SDYlXa...,Philippines,401678.0,0.642,0.374,5.0,-10.606,0.032,0.423,2e-06,0.435,0.0979,129.863,221538.0


In [5]:
# (number of rows, number of columns) of the loaded dataset.
data.shape

(104561, 18)

In [6]:
import re

def get_track_name(nr):
    """Return the title of track ``nr`` with any trailing qualifier removed.

    The title is read from ``data.title[nr]`` and converted to ``str``. It is
    cut at the first occurrence of ``" ("`` or ``" -"``, whichever comes
    first (for example ``"Chantaje (feat. Maluma)"`` becomes ``"Chantaje"``).
    A marker found at index 0 is ignored, and a title without either marker
    is returned unchanged.
    """
    title = str(data.title[nr])

    # Only markers found strictly after the first character count as cut points.
    markers = (" (", " -")
    cut_points = [pos for pos in map(title.find, markers) if pos > 0]

    if not cut_points:
        return title
    return title[:min(cut_points)]



def get_track_artist(nr):
    """Return the first listed artist for track ``nr``.

    The value is read from ``data.artist[nr]`` and converted to ``str``.
    Everything from the first ``", "`` onwards is dropped, so a
    comma-separated artist list is reduced to its first entry.

    NOTE(review): when the string contains ``" ("`` (not at index 0), every
    ``"' "`` sequence is removed *before* the cut, while the cut position is
    still the comma index measured on the unmodified string. The intent of
    that replacement is not evident from the code; the behaviour is kept
    exactly as it was.
    """
    raw = str(data.artist[nr])

    # Both positions are measured on the untouched string.
    comma_at = raw.find(", ")
    has_parenthesis = raw.find(" (") > 0

    cleaned = raw.replace("' ", "") if has_parenthesis else raw
    return cleaned[:comma_at] if comma_at > 0 else cleaned

# Getting URL

In [7]:
import re

def get_url(track_nr):
    """Build the Genius lyrics URL to try for the track at ``track_nr``.

    The track name and first artist come from ``get_track_name`` and
    ``get_track_artist``. Each is reduced to ASCII letters, digits, dots and
    whitespace, and the remaining words are joined with hyphens.

    Parameters
    ----------
    track_nr : int
        Row number passed through to ``get_track_name`` / ``get_track_artist``.

    Returns
    -------
    str
        ``https://genius.com/<artist>-<track>-lyrics``. The URL is a guess;
        nothing here checks that the page exists.
    """
    def _slug(text):
        # Raw string: the previous non-raw pattern contained the invalid
        # escape sequence '\.', which newer Python versions warn about.
        kept = re.sub(r'[^a-zA-Z0-9 \n.]', '', str(text))
        # split()/join hyphenates every run of whitespace. The old
        # `find(" ") > 0` guard skipped the replacement when the first space
        # sat at index 0 (e.g. after a stripped leading symbol), leaving raw
        # spaces in the URL; repeated spaces produced doubled hyphens.
        return '-'.join(kept.split())

    track = _slug(get_track_name(track_nr))
    artist = _slug(get_track_artist(track_nr))

    return 'https://genius.com/' + artist + '-' + track + '-lyrics'
    

# Scraping credits

In [8]:
def extract_credits(URL, timeout=10):
    """Scrape the credits and genre tags from a Genius song page.

    Parameters
    ----------
    URL : str
        Address of the Genius lyrics page (see ``get_url``).
    timeout : float, optional
        Seconds to wait for the HTTP response before ``requests`` gives up.

    Returns
    -------
    dict
        ``{"Feature": [...], "Writer": [...], "Label": [...],
        "Producer": [...], "Genres": [...]}``; each value is a list of the
        link texts found under the matching credit heading (or the tag texts
        for genres). Lists are empty when a heading is absent.

    Raises
    ------
    Exception
        If the response status is not 200, or if the credits container
        cannot be found in the page.
    """
    # "https://genius.com/Shakia-chantaje-lyrics" <- url to check for raised exception

    r = requests.get(URL, timeout=timeout)

    if int(r.status_code) != 200:
        # The previous message ("Sorry, no numbers below zero") was unrelated
        # to the actual failure.
        raise Exception("Request to " + str(URL) + " failed with HTTP status " + str(r.status_code))

    # Parsing the HTML
    soup = BeautifulSoup(r.content, 'html.parser')
    # NOTE(review): these look like build-generated CSS class names and will
    # presumably stop matching when the site is redeployed -- confirm.
    credits_section = soup.find('div', class_='SongInfo__Columns-nekw6x-2 lgBflw')
    genre_tags = soup.find_all(class_='SongTags__Tag-xixwg3-2 fdHeQh')

    if credits_section is None:
        # Previously this surfaced as an opaque "NoneType is not iterable".
        raise Exception("No credits section found on " + str(URL))

    # Key order is kept as before; it decides the CSV column order downstream.
    collection = {"Feature": [], "Writer": [], "Label": [], "Producer": [], "Genres": []}

    # (text searched for in the heading, key in the result)
    headings = (
        ("Featuring", "Feature"),
        ("Written By", "Writer"),
        ("Label", "Label"),
        ("Produced By", "Producer"),
    )

    # Only direct child *tags*: bare text nodes between them have no heading
    # div and used to crash the loop.
    for entry in credits_section.find_all(True, recursive=False):
        heading = entry.find('div')
        if heading is None:
            continue

        heading_text = heading.text
        names = [link.text for link in entry.find_all('a')]

        for needle, key in headings:
            if needle in heading_text:
                collection[key].extend(names)

    collection["Genres"] = [tag.text for tag in genre_tags]

    return collection

In [9]:
#extLis = extract_credits(get_url(track_nr))

# Dict to csv

In [10]:
#credit_list = pd.DataFrame.from_dict(extLis, orient='index').T

In [11]:
#credit_list

### Path to neo4j import folder
- Nicholas Stationær (desktop): C:\Users\Nmtur\.Neo4jDesktop\relate-data\dbmss\dbms-20b18527-223b-4ada-8e62-429ad982fc9b\import\collaborators.csv
- Bærbar (laptop): C:\Users\Nmtur\.Neo4jDesktop\relate-data\dbmss\dbms-fb2eb10a-32f4-4f40-90bd-10e9b225eb47\import
- Martins bærbare (Martin's laptop): C:\Users\marti\.Neo4jDesktop\relate-data\dbmss\dbms-a7f58857-34e9-48ad-b898-3dc31e11d74c\import\collaborators.csv



In [12]:
#df = pd.DataFrame.from_dict(credit_list) 
#df.to_csv (r'C:\Users\Nmtur\.Neo4jDesktop\relate-data\dbmss\dbms-fb2eb10a-32f4-4f40-90bd-10e9b225eb47\import\collaborators.csv', index = False, header=True)

In [13]:
#df

***

# Neo4j

In [9]:
# Create connection to database

class neo4jConnection:
    """Small wrapper around a Neo4j driver that runs one query per session."""

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver.

        A failure to create the driver is printed rather than raised; the
        driver then stays ``None`` and ``query`` will refuse to run.
        """
        self.__driver = None
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        try:
            credentials = (self.__user, self.__pwd)
            self.__driver = GraphDatabase.driver(self.__uri, auth=credentials)
        except Exception as err:
            print("Failed to create the driver: ", err)

    def query(self, query, db=None):
        """Run ``query`` and return its records as a list.

        ``db`` selects a database when given; otherwise the session is opened
        without a database argument. Any exception raised while opening the
        session or running the query is printed and ``None`` is returned
        instead. The session is closed in every case.
        """
        assert self.__driver is not None, "Driver not initialized!"

        # Pass `database` only when the caller asked for one.
        session_kwargs = {} if db is None else {"database": db}

        session = None
        records = None
        try:
            session = self.__driver.session(**session_kwargs)
            records = list(session.run(query))
        except Exception as err:
            print("Query failed!", err)
        finally:
            if session is not None:
                session.close()
        return records

    def close(self):
        """Close the driver if one was created."""
        if self.__driver is None:
            return
        self.__driver.close()

In [10]:
# Connecting to database
# The connection settings can be overridden with the NEO4J_URI, NEO4J_USER and
# NEO4J_PASSWORD environment variables, so the notebook does not have to be
# edited (or real credentials committed) for each machine. The fallbacks are
# the local development values this cell used before.
import os

conn = neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:11005"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    pwd=os.environ.get("NEO4J_PASSWORD", "test1234"),
)
# Alternative URI: bolt://localhost:7687

In [11]:
# Sanity check: return every node currently in the graph. The recorded output
# below is an empty list.
conn.query('''MATCH (n) RETURN n;''')

[]

### Query to insert into neo4j

In [12]:
def _cypher_string(value):
    """Return ``value`` as a double-quoted Cypher string literal.

    Backslashes and double quotes inside the text are escaped so that the
    literal cannot terminate early or alter the surrounding query.
    """
    text = str(value)
    return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'


def run_querys(track_nr):
    """Load ``collaborators.csv`` into Neo4j for the track at ``track_nr``.

    For every CSV row that has a ``Feature`` value, the query merges Person
    nodes (featured artist, main artist, writer, producer), a Label node and
    a Song node, then merges the relationships between them. Afterwards
    Person and Label nodes with an empty name are deleted.

    The artist and track name come from ``get_track_artist`` /
    ``get_track_name``; the rank is read from the ``rank`` column of ``data``
    and stored as a string, as before.

    The shared ``conn`` is intentionally *not* closed here: this function is
    called once per track, and closing the driver after the first track
    leaves every later call working on a closed driver. Close ``conn`` once
    when all tracks are done.
    """
    # Names are scraped text and may contain quotes or backslashes, so they
    # are escaped before being spliced into the query. conn.query() only
    # accepts a finished query string, hence escaping rather than parameters.
    artist = _cypher_string(get_track_artist(track_nr))
    track = _cypher_string(get_track_name(track_nr))
    # Label-based column access replaces the positional `row[1]` lookup on a
    # string-indexed Series, which recent pandas versions deprecate.
    ranking = _cypher_string(data["rank"].iloc[track_nr])

    conn.query('''CALL apoc.load.csv('collaborators.csv')
    YIELD lineNo, map, list
    
    WHERE map.Feature IS NOT NULL
    MERGE (f:Person {name:map.Feature})
    MERGE (a:Person {name:''' + artist + '''})
    MERGE (w:Person {name:map.Writer})
    MERGE (p:Person {name:map.Producer})
    MERGE (l:Label {name:map.Label})
    MERGE (s:Song {name:''' + track + ''', rank: ''' + ranking + '''})
    
    
    MERGE (f)-[:COLLABORATED]-(a)
    MERGE (f)-[:FEATURED]->(s)
    MERGE (a)-[:SANG]->(s)
    MERGE (w)-[:WROTE]->(s)
    MERGE (l)-[:RELEASED]->(s)
    MERGE (p)-[:PRODUCED]->(s);
    ''')

    conn.query('''MATCH (f:Person) WHERE (f.name = "") detach delete f;''')

    conn.query('''MATCH (l:Label) WHERE (l.name = "") detach delete l;''')

### Scraping the credits and loading them into Neo4j

In [13]:
def running(track_nr, csv_path=r'C:\Users\Nmtur\.Neo4jDesktop\relate-data\dbmss\dbms-abdd1e77-a9d6-4f8f-810d-312f9bad1906\import\collaborators.csv'):
    """Scrape one track's credits, write them to CSV and load them into Neo4j.

    Parameters
    ----------
    track_nr : int
        Row number of the track, passed to ``get_url`` and ``run_querys``.
    csv_path : str, optional
        Where the credits CSV is written. The default is the machine-specific
        location this function always used; pass another path to run on a
        different machine. ``run_querys`` loads a file named
        ``collaborators.csv``, so the path presumably has to point into the
        Neo4j import folder under that name -- confirm for your setup.

    Any exception from scraping, writing or querying propagates to the caller.
    """
    # For scraping
    extLis = extract_credits(get_url(track_nr))

    # Get credit: one column per credit type, shorter lists padded with
    # missing values. (The former second DataFrame.from_dict call only copied
    # this frame and has been dropped.)
    credit_list = pd.DataFrame.from_dict(extLis, orient='index').T
    credit_list.to_csv(csv_path, index=False, header=True)

    # Running neo4j query
    run_querys(track_nr)


In [14]:
import time
tic = time.perf_counter()

# Number of leading dataset rows to process (never more than the dataset holds).
N_TRACKS = min(1500, data.shape[0])

errors = 0
count = 0
for i in range(N_TRACKS):
    try:
        running(i)
    except Exception:
        # Best effort: a track whose page cannot be scraped or loaded is
        # counted and skipped. `except Exception` (not a bare `except`) keeps
        # Ctrl-C / kernel interrupt working during this long loop.
        errors = errors + 1
    count = count + 1
    # The progress line used to sit inside the `try` and referenced an
    # undefined `index` and concatenated the int `errors` to a string, so it
    # raised on every successful track and inflated the error count.
    print(str(int((count / N_TRACKS) * 100)) + '% done - ' + str(count) + ' of ' + str(N_TRACKS) + ' songs iterated - ' + str(errors) + ' errors encountered.', end='\r')

toc = time.perf_counter()
print(f"Added new rows from external links in {toc - tic:0.4f} seconds")

Added new rows from external links in 3314.4264 seconds


In [15]:
# Close the Neo4j driver held by `conn`.
conn.close()