In [9]:
from os import environ
from datetime import datetime, timedelta

from dotenv import load_dotenv
from psycopg2 import extensions, connect
from psycopg2 import OperationalError
import pandas as pd
from xhtml2pdf import pisa

# Yesterday's calendar date (local clock, evaluated when this cell runs),
# formatted as DD-MM-YYYY.
YESTERDAY_DATE = (datetime.now() - timedelta(days=1)).strftime('%d-%m-%Y')


def get_db_connection() -> extensions.connection:
    """Returns a connection to the AWS Bandcamp database.

    Connection settings are read from the environment variables DB_USER,
    DB_PASSWORD, DB_IP, DB_PORT and DB_NAME.

    Raises:
        KeyError: if one of those environment variables is not set.
        OperationalError: if psycopg2 cannot reach or log in to the database.
        ConnectionError: if a lower-level connection failure surfaces.
    """

    try:
        return connect(user=environ["DB_USER"],
                       password=environ["DB_PASSWORD"],
                       host=environ["DB_IP"],
                       port=environ["DB_PORT"],
                       database=environ["DB_NAME"])
    except (OperationalError, ConnectionError):
        # psycopg2 reports connection failures as OperationalError, which is
        # not a ConnectionError subclass, so it has to be named explicitly.
        # Re-raise after reporting so callers never receive None in place of
        # the connection promised by the return annotation.
        print("Error: Cannot connect to the database")
        raise

In [10]:
def load_all_data(db_connection: extensions.connection) -> pd.DataFrame:
    """Reads every sale, joined to its lookup tables, into a DataFrame.

    Each sale_event row is joined to its country, item, artist, item type
    and, through item_genre, its genres. Because of the item_genre join a
    sale is returned once per genre attached to its item.

    Args:
        db_connection: open connection used to run the query.

    Returns:
        A DataFrame with the columns sale_id, sale_time, amount, item_id,
        country_id, country, artist, genre, item_type and item_name.
    """

    query = """
                    SELECT sale_event.*, country.country, artist.artist_name, genre.genre, item_type.item_type, item.item_name
                    FROM sale_event
                    JOIN country
                    ON country.country_id = sale_event.country_id
                    JOIN item
                    ON item.item_id = sale_event.item_id
                    JOIN artist
                    ON artist.artist_id = item.artist_id
                    JOIN item_genre
                    ON item_genre.item_id = item.item_id
                    JOIN genre
                    ON genre.genre_id =item_genre.genre_id
                    JOIN item_type
                    ON item_type.item_type_id = item.item_type_id;"""

    # Labels are applied positionally. The first five assume sale_event.*
    # yields exactly these columns in this order - TODO confirm against the
    # table definition.
    headers = ['sale_id', 'sale_time', 'amount', 'item_id',
               'country_id', 'country', 'artist', 'genre', 'item_type', 'item_name']

    with db_connection.cursor() as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()

    return pd.DataFrame(rows, columns=headers)

In [11]:
# load_dotenv() has to run before get_db_connection(), which reads the DB_*
# settings from os.environ.
load_dotenv()
connection = get_db_connection()
# Full joined sales table; later cells filter and aggregate this frame.
data = load_all_data(connection)
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,sale_id,sale_time,amount,item_id,country_id,country,artist,genre,item_type,item_name
0,1,2024-01-05 10:06:02+00:00,109,1,5,Australia,svdestada,crust punk,album,Candela
1,1,2024-01-05 10:06:02+00:00,109,1,5,Australia,svdestada,blackened crust,album,Candela
2,1,2024-01-05 10:06:02+00:00,109,1,5,Australia,svdestada,atmospheric black metal,album,Candela
3,1,2024-01-05 10:06:02+00:00,109,1,5,Australia,svdestada,black metal,album,Candela
4,1,2024-01-05 10:06:02+00:00,109,1,5,Australia,svdestada,hardcore,album,Candela
...,...,...,...,...,...,...,...,...,...,...
792,139,2024-01-05 10:38:57+00:00,800,138,5,Australia,doomed,death metal,album,6 Anti​-​Odes To Life
793,139,2024-01-05 10:38:57+00:00,800,138,5,Australia,doomed,metal,album,6 Anti​-​Odes To Life
794,139,2024-01-05 10:38:57+00:00,800,138,5,Australia,doomed,doom death metal,album,6 Anti​-​Odes To Life
795,139,2024-01-05 10:38:57+00:00,800,138,5,Australia,doomed,death doom metal,album,6 Anti​-​Odes To Life


In [31]:
# `data` repeats a sale once per genre; keep only the first row of each sale.
unique_sales = data.drop_duplicates(subset='sale_id', keep='first')
# Keep just the sales whose artist is 'we rob rave'.
album_sales = unique_sales.loc[
    lambda sales: sales['artist'] == 'we rob rave']
album_sales

Unnamed: 0,sale_id,sale_time,amount,item_id,country_id,country,artist,genre,item_type,item_name
412,75,2024-01-05 10:37:12+00:00,150,75,26,Belgium,we rob rave,club bass,track,HELEN IS DEAD (DEEPLINKIN REMIX)
549,94,2024-01-05 10:37:32+00:00,109,94,26,Belgium,we rob rave,odesa,track,SHOULD WE BE
583,102,2024-01-05 10:37:48+00:00,760,102,26,Belgium,we rob rave,bristol,album,SSBB006
716,130,2024-01-05 10:38:51+00:00,753,129,26,Belgium,we rob rave,rave,album,WE ROB RAVE - REMIXED AND REMASTERED
728,131,2024-01-05 10:38:51+00:00,753,130,26,Belgium,we rob rave,hel,album,WE ROB RAVE 7
738,132,2024-01-05 10:38:51+00:00,753,131,26,Belgium,we rob rave,hel,album,WE ROB RAVE 8
748,133,2024-01-05 10:38:51+00:00,753,132,26,Belgium,we rob rave,rave,album,WE ROB RAVE 9


In [21]:
# `data` repeats a sale once per genre; keep only the first row of each sale.
unique_sales = data.drop_duplicates(subset='sale_id', keep='first')
# Exclude track sales with a boolean mask rather than dropping by index label.
album_sales = unique_sales[unique_sales['item_type'] != 'track']
# Number of sales per item name. Only this last expression is displayed.
album_sales['item_name'].value_counts()


item_name
Driftveil City - Pok​​​é​​​mon / Toothless (Ringtone)    2
Candela                                                  1
The Inextricable Wandering                               1
AVESTA                                                   1
High Speed Tapes                                         1
                                                        ..
Process by EKANY                                         1
Future Nostalgia, Vol. 1 by EKANY                        1
NNMS61 — FREEDL                                          1
Last Call​,​Centry meets Manassah                        1
6 Anti​-​Odes To Life                                    1
Name: count, Length: 72, dtype: int64

In [28]:
# `data` repeats a sale once per genre; keep only the first row of each sale.
unique_sales = data.drop_duplicates(subset='sale_id', keep='first')

# Five non-track item names with the largest summed amount, highest first.
# Amounts are divided by 100 - presumably minor to major currency units;
# TODO confirm.
album_sales = (
    unique_sales[~(unique_sales['item_type'] == 'track')]
    .groupby('item_name')['amount']
    .sum()
    .div(100)
    .sort_values(ascending=False)
    .head(5)
    .reset_index()
)
album_sales


Unnamed: 0,item_name,amount
0,Process by EKANY,15.27
1,Gone,15.0
2,Radiant Records Palestine Comp,13.1
3,Radiant Records IWD Comp Vol 4,13.1
4,Everything Must Go!,12.66


In [None]:
def remove_duplicate_words(words: list) -> list:
    """Removes duplicate words in a list, keeping first-seen order.

    Args:
        words: hashable items, possibly containing repeats.

    Returns:
        A new list with each distinct item once, in the order it first
        appeared in `words`.
    """

    # dict keys are unique and keep insertion order, so the result is the
    # same on every run. A set would return the items in arbitrary order.
    return list(dict.fromkeys(words))

# Album rows only. `data` still has one row per (sale, genre) here, which is
# what lets the genres be collected per album below.
album_sales = data[data['item_type'] == 'album']

# NOTE(review): `selected` is not defined in the cells shown here; it needs
# to hold the artist names of interest before this cell runs.
filtered_album_sales = album_sales[album_sales['artist'].isin(selected)]

# One row per (artist, album) with every genre seen for it. Grouping by
# 'artist' twice made reset_index() fail on the duplicated column name.
# NOTE(review): 'item_name' is assumed to be the intended second key -
# confirm.
albums = filtered_album_sales.groupby(['artist', 'item_name'])[
        'genre'].agg(list).reset_index()

# An album sold more than once repeats its genres, so collapse the repeats.
albums['genre'] = albums['genre'].apply(
        remove_duplicate_words)