<h1>Database 2 Project - Load data in ttl file</h1>
<h3>
    Group FRANGI: <br>
    Francesco Frigato, Andrea Felline, Gianluca Antolini <br>
    <br>
    Topic: <br>
    Spotify songs and their Youtube videos
</h3>


<h4>Install and import required Libraries</h4>

In [1]:
!pip install rdflib



In [2]:
import pandas as pd
import os
from pathlib import Path
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import FOAF, XSD, SKOS

<h4>Set global variables</h4>
<ol>
    <li>Path to the csv dataset</li>
    <li>Path for the final file</li>
    <li>Ontology object</li>
    <li>Graph object</li>
    <li>Data read from the CSV file</li>
    <li>Dictionaries to help properties naming</li>
</ol>

In [3]:
# Paths: everything is resolved relative to the parent of the current working directory
path = str(Path(os.getcwd()).absolute().parent)
datasetCSV = f"{path}/Datasets/Computed/complete_dataset.csv"
savePath = f"{path}/Datasets/rdf/"

# Namespace of the SpotifyYoutubeStatistics ontology (not among the ones shipped with RDFlib)
SY = Namespace("http://www.dei.unipd.it/Database2/FRANGI/spotifyYoutubeStatistics/")

# Graph that will collect every triple
g = Graph()

# Load the complete dataset (one row per song, with its artist/album/video/playlist columns)
data = pd.read_csv(datasetCSV, sep=",")

# Useful dictionaries: dataset column name -> ontology property name.
# For the float-valued audio features the property is simply the lower-cased column name.
floatProps = {
    column: column.lower()
    for column in (
        "Instrumentalness", "Danceability", "Energy",
        "Loudness", "Speechiness", "Acousticness",
        "Liveness", "Tempo", "Valence",
    )
}
intProps = {
    "Key": "key",
    "Duration_ms": "duration",
    "uStream": "stream",
}

print("Number of rows of the dataset:", len(data))

Number of rows of the dataset: 20718


<h4>Iterate over each row of the dataset and add everything to the graph</h4>

At each iteration check if something already exists (for example an Artist may show up for multiple songs, and thus in multiple lines) or if it's null (for example some songs are not in a playlist, so that field will be null sometimes).

In [4]:
def _is_missing(value):
    """Return True when a dataset cell is empty (NaN/None according to pandas)."""
    return pd.isnull(value)


def _node(identifier):
    """Return the SY-namespaced node for `identifier`, or None when the cell is empty."""
    return None if _is_missing(identifier) else SY[str(identifier)]


def _add_literal(subject, prop, value, datatype, cast=None):
    """Attach `value` to `subject` as the literal property SY[prop].

    Empty cells are skipped, so no literal is ever built from a NaN value.
    `cast`, when given, converts the raw cell (e.g. with int or float)
    before it is wrapped in a Literal of the requested `datatype`.
    """
    if _is_missing(value):
        return
    if cast is not None:
        value = cast(value)
    g.add((subject, SY[prop], Literal(value, datatype=datatype)))


def _declare(node, *rdf_types):
    """Type `node` with every class in `rdf_types` the first time it is seen.

    Returns True when the node is new (so the caller should add its
    attributes) and False when the node is None or was already declared.
    The None test comes first: a None subject inside a membership test
    would act as a wildcard instead of naming a concrete node.
    """
    if node is None or (node, RDF.type, rdf_types[0]) in g:
        return False
    for rdf_type in rdf_types:
        g.add((node, RDF.type, rdf_type))
    return True


def _link(subject, predicate, obj, inverse=None):
    """Add subject-predicate-obj (and the inverse triple, if any) when both ends exist.

    No membership check is needed: a graph holds each triple at most once,
    so adding an already present link leaves it unchanged.
    """
    if subject is None or obj is None:
        return
    g.add((subject, predicate, obj))
    if inverse is not None:
        g.add((obj, inverse, subject))


# The progress message is printed roughly every 10% of the rows
progress_step = int(len(data)/10) + 1

for index, row in data.iterrows():

    # Create the nodes for this row: namespace + identifier as URI.
    # A node is None when the corresponding identifier is missing.
    Artist = _node(row["Url_spotify"])
    Song = _node(row["Uri"])
    Album = _node(row["albumId"])
    # the video identifier is whatever follows the last "=" of the YouTube URL
    Video = None if _is_missing(row["Url_youtube"]) else _node(str(row["Url_youtube"]).split("=")[-1])
    Channel = _node(row["channelId"])
    # playlists live in the SY namespace like every other entity
    Playlist = _node(row["playlist_id"])
    Genre = _node(row["playlist_genre"])

    # remove spaces from subgenres
    formattedSubgenre = float("nan")
    if not _is_missing(row["playlist_subgenre"]):
        formattedSubgenre = row["playlist_subgenre"].replace(" ", "_")
    Subgenre = _node(formattedSubgenre)

    # add Artist
    if _declare(Artist, SY.Artist):
        _add_literal(Artist, "personName", row["Artist"], XSD.string)

    # add Song
    if _declare(Song, SY.SpotifySong):
        _add_literal(Song, "trackName", row["Track"], XSD.string)
        for column, prop in floatProps.items():
            _add_literal(Song, prop, row[column], XSD.float, cast=float)
        for column, prop in intProps.items():
            _add_literal(Song, prop, row[column], XSD.integer, cast=int)

    # add Album
    if _declare(Album, SY.Album):
        _add_literal(Album, "albumName", row["Album"], XSD.string)
        _add_literal(Album, "albumType", row["Album_type"], XSD.string)

    # add Video
    if _declare(Video, SY.YoutubeVideo):
        _add_literal(Video, "videoTitle", row["Title"], XSD.string)
        _add_literal(Video, "views", row["uViews"], XSD.integer, cast=int)
        _add_literal(Video, "likes", row["uLikes"], XSD.integer, cast=int)
        _add_literal(Video, "comments", row["uComments"], XSD.integer, cast=int)
        _add_literal(Video, "description", row["Description"], XSD.string)
        _add_literal(Video, "licensed", row["Licensed"], XSD.boolean)
        _add_literal(Video, "officialVideo", row["uOfficial_video"], XSD.boolean)

    # add Channel (only when the row actually carries a channel id)
    if _declare(Channel, SY.YoutubeChannel):
        _add_literal(Channel, "channelName", row["Channel"], XSD.string)

    # add Album-Song links
    _link(Album, SY["isComposed"], Song, inverse=SY["belongsTo"])

    # add Song-Artist links
    _link(Artist, SY["published"], Song, inverse=SY["isPublishedBy"])

    # add Video-Song links
    _link(Video, SY["isVideoOf"], Song)

    # add Video-Channel links
    _link(Video, SY["isUploadedBy"], Channel, inverse=SY["upload"])

    # add Playlists
    if _declare(Playlist, SY.SpotifyPlaylist):
        _add_literal(Playlist, "playlistName", row["playlist_name"], XSD.string)

    # add Genres
    if _declare(Genre, SY.Genre, SKOS.Concept):
        _add_literal(Genre, "genreName", row["playlist_genre"], XSD.string)

    # add Subgenres (they are Genres too; the name keeps the underscore form)
    if _declare(Subgenre, SY.Genre, SKOS.Concept):
        _add_literal(Subgenre, "genreName", formattedSubgenre, XSD.string)

    # add Playlist-Genre links
    _link(Playlist, SY["hasGenre"], Genre)

    # add Playlist-Subgenre links
    _link(Playlist, SY["hasGenre"], Subgenre)

    # add Genre-Subgenre links
    _link(Genre, SKOS.narrower, Subgenre, inverse=SKOS.broader)

    # add Song-Playlist links
    _link(Song, SY["isPartOf"], Playlist)

    # print status
    if index % progress_step == 0:
        print("Progress:", str(int(index*100/len(data)))+"%")

print("Progress: 100%")

Progress: 0%
Progress: 10%
Progress: 20%
Progress: 30%
Progress: 40%
Progress: 50%
Progress: 60%
Progress: 70%
Progress: 80%
Progress: 90%
Progress: 100%


<h4>Bind the namespaces to a prefix for more readable output</h4>

In [5]:
# Register the prefixes used in the serialized output
for prefix, namespace in (("xsd", XSD), ("sy", SY)):
    g.bind(prefix, namespace)

<h4>Save all the data in Turtle format</h4>

In [6]:
print("Saving...\n")

# Create the output directory if it is missing, so that opening the
# destination file does not fail on a fresh checkout.
os.makedirs(savePath, exist_ok=True)

# Serialize the whole graph as Turtle and write it as UTF-8 text
with open(savePath + "syOn.ttl", "w", encoding="utf-8") as file:
    file.write(g.serialize(format="turtle"))

print("Done!")

Saving...

Done!
