In [1]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
import pandas as pd

from sqlalchemy import Column, Integer, String, Date, SmallInteger, Table, Float, MetaData, ForeignKey
from sqlalchemy.orm import relationship

In [2]:
db_string = "postgresql://admin:123@127.0.0.1:5432/spotify"
engine = create_engine(db_string)
Base = declarative_base()

In [3]:
class Artist(Base):
    __tablename__ = 'artist'
    artist_id = Column(Integer, primary_key=True)
    name = Column(String(50))

    def __repr__(self):
        return "<artist(id='{0}', name={1})>".format(self.artist_id, self.name)


class Song(Base):
    __tablename__ = 'song'
    song_id = Column(Integer, primary_key=True)
    title = Column(String(50))
    artist_id = Column(Integer())

    def __repr__(self):
        return "<song(id='{0}', name={1})>".format(self.song_id, self.name)

class Trend(Base):
    __tablename__ = 'trend'
    trend_id = Column(Integer, primary_key=True)
    trend = Column(String(50))

    def __repr__(self):
        return "<trend(id='{0}', trend={1})>".format(self.trend_id, self.trend)

class Day(Base):
    __tablename__ = 'day'
    day_id = Column(Integer, primary_key=True)
    date = Column(Date)

    def __repr__(self):
        return "<date(id='{0}', date={1})>".format(self.day_id, self.date)

class Region(Base):
    __tablename__ = 'region'
    region_id = Column(Integer, primary_key=True)
    name = Column(String(50))

    def __repr__(self):
        return "<region(id='{0}', name={1})>".format(self.region_id, self.name)

class Category(Base):
    __tablename__ = 'category'
    category_id = Column(Integer, primary_key=True)
    name = Column(String(50))

    def __repr__(self):
        return "<category(id='{0}', name={1})>".format(self.category_id, self.name)

class Chart(Base):
    __tablename__ = 'chart'
    chart_id = Column(Integer, primary_key=True)
    position = Column(Integer)
    song_id = Column(Integer)
    day_id = Column(Integer)
    region_id = Column(Integer)
    category_id = Column(Integer)
    trend_id = Column(Integer)
    streams = Column(Integer)

    def __repr__(self):
        return "<chart(id='{0}', position={1}, song={2}, streams={3})>".format(self.chart_id, self.position, self.song_id, self.streams)

In [4]:
Base.metadata.create_all(engine)

In [50]:
data = pd.read_csv('..\data\charts.csv')
data.head()

Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams
0,Chantaje (feat. Maluma),1,2017-01-01,Shakira,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,Argentina,top200,SAME_POSITION,253019.0
1,Vente Pa' Ca (feat. Maluma),2,2017-01-01,Ricky Martin,https://open.spotify.com/track/7DM4BPaS7uofFul...,Argentina,top200,MOVE_UP,223988.0
2,Reggaetón Lento (Bailemos),3,2017-01-01,CNCO,https://open.spotify.com/track/3AEZUABDXNtecAO...,Argentina,top200,MOVE_DOWN,210943.0
3,Safari,4,2017-01-01,"J Balvin, Pharrell Williams, BIA, Sky",https://open.spotify.com/track/6rQSrBHf7HlZjtc...,Argentina,top200,SAME_POSITION,173865.0
4,Shaky Shaky,5,2017-01-01,Daddy Yankee,https://open.spotify.com/track/58IL315gMSTD37D...,Argentina,top200,MOVE_UP,153956.0


In [51]:
data = data.head(10000)

In [52]:
# Artist

df_artist = pd.DataFrame(data['artist'].unique(), columns=['name'])
df_artist.index.name = 'artist_id'
df_artist.index += 1

df_artist.head()

Unnamed: 0_level_0,name
artist_id,Unnamed: 1_level_1
1,Shakira
2,Ricky Martin
3,CNCO
4,"J Balvin, Pharrell Williams, BIA, Sky"
5,Daddy Yankee


In [53]:
# Song

df_song = data[['title','artist']].drop_duplicates().reset_index().drop(columns = ['index'])
df_song = df_song.rename(columns = {'artist':'artist_id'})
df_song.index.name = 'song_id'
df_song.index += 1
df_song['artist_id'] = df_song['artist_id'].map(lambda x:  df_artist[df_artist['name'] == x].index.values.astype(int)[0])

df_song.head()

Unnamed: 0_level_0,title,artist_id
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Chantaje (feat. Maluma),1
2,Vente Pa' Ca (feat. Maluma),2
3,Reggaetón Lento (Bailemos),3
4,Safari,4
5,Shaky Shaky,5


In [54]:
# Trend

df_trend = pd.DataFrame(data['trend'].unique(), columns=['trend'])
df_trend.index.name = 'trend_id'
df_trend.index += 1

df_trend.head()

Unnamed: 0_level_0,trend
trend_id,Unnamed: 1_level_1
1,SAME_POSITION
2,MOVE_UP
3,MOVE_DOWN
4,NEW_ENTRY


In [55]:
# Day

df_day = pd.DataFrame(data['date'].unique(), columns=['date'])
df_day.index.name = 'day_id'
df_day.index += 1

df_day.head()

Unnamed: 0_level_0,date
day_id,Unnamed: 1_level_1
1,2017-01-01
2,2017-01-02
3,2018-03-01
4,2018-03-02


In [56]:
# Region

df_region = pd.DataFrame(data['region'].unique(), columns=['name'])
df_region.index.name = 'region_id'
df_region.index += 1

df_region.head()

Unnamed: 0_level_0,name
region_id,Unnamed: 1_level_1
1,Argentina
2,Australia
3,Brazil
4,Austria
5,Belgium


In [57]:
# Category

df_category = pd.DataFrame(data['chart'].unique(), columns=['name'])
df_category.index.name = 'category_id'
df_category.index += 1

df_category.head()

Unnamed: 0_level_0,name
category_id,Unnamed: 1_level_1
1,top200


In [58]:
df_chart = data[['rank', 'title', 'date', 'region', 'chart', 'trend', 'streams']].drop_duplicates().reset_index().drop(columns = ['index'])
df_chart = df_chart.rename(columns = {'title':'song_id', 'rank':'position', 'date':'day_id', 'region':'region_id', 'chart':'category_id', 'trend':'trend_id'})
df_chart.index.name = 'chart_id'
df_chart.index += 1
df_chart['song_id'] = df_chart['song_id'].map(lambda x:  df_song[df_song['title'] == x].index.values.astype(int)[0])
df_chart['day_id'] = df_chart['day_id'].map(lambda x:  df_day[df_day['date'] == x].index.values.astype(int)[0])
df_chart['region_id'] = df_chart['region_id'].map(lambda x:  df_region[df_region['name'] == x].index.values.astype(int)[0])
df_chart['category_id'] = df_chart['category_id'].map(lambda x:  df_category[df_category['name'] == x].index.values.astype(int)[0])
df_chart['trend_id'] = df_chart['trend_id'].map(lambda x:  df_trend[df_trend['trend'] == x].index.values.astype(int)[0])
df_chart['streams'] = df_chart['streams'].astype('int')

df_chart.head()

Unnamed: 0_level_0,position,song_id,day_id,region_id,category_id,trend_id,streams
chart_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,1,1,1,1,1,253019
2,2,2,1,1,1,2,223988
3,3,3,1,1,1,3,210943
4,4,4,1,1,1,1,173865
5,5,5,1,1,1,2,153956


In [59]:
df_artist.to_sql('artist',engine, if_exists='append')
df_song.to_sql('song',engine, if_exists='append')
df_trend.to_sql('trend',engine, if_exists='append')
df_day.to_sql('day',engine, if_exists='append')
df_region.to_sql('region',engine, if_exists='append')
df_category.to_sql('category',engine, if_exists='append')
df_chart.to_sql('chart',engine, if_exists='append')

DataError: (psycopg2.errors.StringDataRightTruncation) value too long for type character varying(50)

[SQL: INSERT INTO artist (artist_id, name) VALUES (%(artist_id)s, %(name)s)]
[parameters: ({'artist_id': 1, 'name': 'Shakira'}, {'artist_id': 2, 'name': 'Ricky Martin'}, {'artist_id': 3, 'name': 'CNCO'}, {'artist_id': 4, 'name': 'J Balvin, Pharrell Williams, BIA, Sky'}, {'artist_id': 5, 'name': 'Daddy Yankee'}, {'artist_id': 6, 'name': 'Sebastian Yatra'}, {'artist_id': 7, 'name': 'Rombai'}, {'artist_id': 8, 'name': 'Zion & Lennox'}  ... displaying 10 of 1566 total bound parameter sets ...  {'artist_id': 1565, 'name': 'Post Malone, 2 Chainz'}, {'artist_id': 1566, 'name': 'Florida Georgia Line'})]
(Background on this error at: https://sqlalche.me/e/14/9h9h)