In [17]:
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL


# Load the database credentials (DB_USERNAME, DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME)
# from a local env file so they never appear in the notebook itself.
load_dotenv("../credentials.env")

db_username = os.getenv("DB_USERNAME")
db_password = os.getenv("DB_PASSWORD")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_name = os.getenv("DB_NAME")

# Build the URL from its components instead of an f-string: URL.create() escapes
# special characters (e.g. "@", ":" or "/" in the password) that would otherwise
# corrupt the connection string, and an unset variable stays None instead of
# being interpolated as the literal text "None".
db_url = URL.create(
    drivername="postgresql",
    username=db_username,
    password=db_password,
    host=db_host,
    port=int(db_port) if db_port else None,
    database=db_name,
)
engine = create_engine(db_url)

query = "SELECT * FROM clean_awards"

# Pull the whole table into memory and preview the first rows.
df = pd.read_sql_query(query, engine)
df.head()

Unnamed: 0,year,title,published_at,updated_at,category,nominee,artist,workers,img,winner
0,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Bad Guy,Billie Eilish,"Finneas O'Connell, producer; Rob Kinelski & Fi...",https://www.grammy.com/sites/com/files/styles/...,True
1,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,"Hey, Ma",Bon Iver,"BJ Burton, Brad Cook, Chris Messina & Justin V...",https://www.grammy.com/sites/com/files/styles/...,True
2,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,7 rings,Ariana Grande,"Charles Anderson, Tommy Brown, Michael Foster ...",https://www.grammy.com/sites/com/files/styles/...,True
3,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Hard Place,H.E.R.,"Rodney “Darkchild” Jerkins, producer; Joseph H...",https://www.grammy.com/sites/com/files/styles/...,True
4,2019,62nd Annual GRAMMY Awards (2019),2020-05-19T05:10:28-07:00,2020-05-19T05:10:28-07:00,Record Of The Year,Talk,Khalid,"Disclosure & Denis Kosiak, producers; Ingmar C...",https://www.grammy.com/sites/com/files/styles/...,True


In [18]:
# Column dtypes and non-null counts: used to spot missing values and columns that need casting
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4810 entries, 0 to 4809
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   year          4810 non-null   int64 
 1   title         4810 non-null   object
 2   published_at  4810 non-null   object
 3   updated_at    4810 non-null   object
 4   category      4810 non-null   object
 5   nominee       4804 non-null   object
 6   artist        2970 non-null   object
 7   workers       2620 non-null   object
 8   img           3443 non-null   object
 9   winner        4810 non-null   bool  
dtypes: bool(1), int64(1), object(8)
memory usage: 343.0+ KB


There are nulls in `nominee`, `artist`, `workers` and `img`.

Also, `published_at` and `updated_at` are stored as objects (strings); they need to be converted to a datetime type.

#### Summary

In [19]:
# Numerical summary: describe() covers numeric columns by default (only `year` in this table)
numerical_summary = df.describe()
numerical_summary

Unnamed: 0,year
count,4810.0
mean,1995.566944
std,17.14972
min,1958.0
25%,1983.0
50%,1998.0
75%,2010.0
max,2019.0


In [20]:
# Categorical summary: count / unique / top / freq for the object and bool columns
categorical_summary = df.describe(include=['object','bool'])
categorical_summary

Unnamed: 0,title,published_at,updated_at,category,nominee,artist,workers,img,winner
count,4810,4810,4810,4810,4804,2970,2620,3443,4810
unique,62,4,10,638,4131,1658,2366,1463,1
top,62nd Annual GRAMMY Awards (2019),2017-11-28T00:03:45-08:00,2019-09-10T01:08:19-07:00,Song Of The Year,Robert Woods,(Various Artists),"John Williams, composer (John Williams)",https://www.grammy.com/sites/com/files/styles/...,True
freq,433,4205,778,70,7,66,20,26,4810


## Column analysis

### Year

In [21]:
# Number of rows per ceremony year, most frequent first
df['year'].value_counts()

year
2019    433
2007    111
2008    111
2006    110
2009    109
       ... 
1968     40
1960     39
1962     39
1959     35
1958     28
Name: count, Length: 62, dtype: int64

### Title

In [22]:
# Number of rows per ceremony title, most frequent first
df['title'].value_counts()

title
62nd Annual GRAMMY Awards  (2019)    433
50th Annual GRAMMY Awards  (2007)    111
51st Annual GRAMMY Awards  (2008)    111
49th Annual GRAMMY Awards  (2006)    110
52nd Annual GRAMMY Awards  (2009)    109
                                    ... 
11th Annual GRAMMY Awards  (1968)     40
3rd Annual GRAMMY Awards  (1960)      39
5th Annual GRAMMY Awards  (1962)      39
2nd Annual GRAMMY Awards  (1959)      35
1st Annual GRAMMY Awards  (1958)      28
Name: count, Length: 62, dtype: int64

### Published/Updated at

In [23]:
# Distinct publication timestamps and how many rows share each one
df['published_at'].value_counts()

published_at
2017-11-28T00:03:45-08:00    4205
2020-05-19T05:10:28-07:00     433
2018-12-06T23:48:49-08:00      86
2018-05-22T03:08:24-07:00      86
Name: count, dtype: int64

In [24]:
# Distinct update timestamps and how many rows share each one
df['updated_at'].value_counts()

updated_at
2019-09-10T01:08:19-07:00    778
2019-09-10T01:06:11-07:00    754
2019-09-10T01:07:37-07:00    713
2019-09-10T01:06:59-07:00    681
2019-09-10T01:11:09-07:00    658
2019-09-10T01:09:02-07:00    554
2020-05-19T05:10:28-07:00    433
2017-11-28T00:03:45-08:00    108
2020-09-01T12:16:40-07:00     83
2019-09-10T01:11:48-07:00     48
Name: count, dtype: int64

### Category

In [25]:
# Number of rows per award category, most frequent first
df['category'].value_counts()

category
Song Of The Year                                                                          70
Record Of The Year                                                                        69
Album Of The Year                                                                         66
Best Opera Recording                                                                      64
Best Album Notes                                                                          63
                                                                                          ..
Best Album Cover (Other Than Classical)                                                    1
Best Classical Performance - Operatic Or Choral                                            1
Best Sound Track Album Or Recording Of Original Cast From Motion Picture Or Television     1
Best Sound Track Album Or Recording Of Score From Motion Picture Or Television             1
Best Performance By A "Top 40" Artist                        

### Nominee

In [26]:
# How often each nominee value appears (nulls are excluded by value_counts)
df['nominee'].value_counts()

nominee
Robert Woods                          7
Bridge Over Troubled Water            7
Berlioz: Requiem                      7
Steven Epstein                        7
Up, Up And Away                       6
                                     ..
The Best Of The Stan Freberg Shows    1
Gigi                                  1
The Music Man                         1
Cross Country Suite                   1
Only The Lonely                       1
Name: count, Length: 4131, dtype: int64

### Artist

In [27]:
# How often each artist value appears (nulls are excluded by value_counts)
df['artist'].value_counts()

artist
(Various Artists)                                          66
U2                                                         18
Aretha Franklin                                            16
Stevie Wonder                                              13
Beyoncé                                                    13
                                                           ..
Marty Robbins                                               1
Harry Belafonte                                             1
(David Seville And The Chipmunks) Ross Bagdasarian, Sr.     1
Bobby Darin                                                 1
David Rose And His Orchestra With Andre Previn              1
Name: count, Length: 1658, dtype: int64

### Workers

In [28]:
# Most common values of the free-text `workers` credits column
df['workers'].value_counts()

workers
John Williams, composer (John Williams)                                                                              20
Vladimir Horowitz, artist                                                                                            15
Henry Mancini, arranger (Henry Mancini)                                                                               8
Robert Shaw, conductor (Atlanta Symphony Chorus; Atlanta Symphony Orchestra)                                          7
(Chicago Symphony Orchestra)                                                                                          7
                                                                                                                     ..
Simon Green, producer; Simon Green & Frank Merritt, mixers                                                            1
Simone Giani, Luca De Gregorio & Mattia Vitale, producers; Simone Giani, Luca De Gregorio & Mattia Vitale, mixers     1
Jason Evigan & RÜFÜS DU SOL, pro

### Img

In [29]:
# How often each image URL is reused across rows
df['img'].value_counts()

img
https://www.grammy.com/sites/com/files/styles/artist_circle/public/muzooka/John%2BWilliams/John%2520Williams_1_1_1597170495.jpg?itok=GsnrkP98            26
https://www.grammy.com/sites/com/files/styles/artist_circle/public/muzooka/U2/U2_1_1_1578385236.jpg?itok=rQVhxxhL                                        22
https://www.grammy.com/sites/com/files/styles/artist_circle/public/muzooka/Vladimir%2BHorowitz/Vladimir%2520Horowitz_1_1_1597176026.jpg?itok=QiT9d29e    21
https://www.grammy.com/sites/com/files/styles/artist_circle/public/muzooka/Henry%2BMancini/Henry%2520Mancini_1_1_1581553583.jpg?itok=oMIyl-mj            20
https://www.grammy.com/sites/com/files/styles/artist_circle/public/georgsolti-spotlight-78824961.png?itok=rNrB7-r2                                       20
                                                                                                                                                         ..
https://www.grammy.com/sites/com/files/styles/artist_circle/

### Winner

In [30]:
# Distribution of the `winner` flag
df['winner'].value_counts()

winner
True    4810
Name: count, dtype: int64

## Transformations:
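Mainly we should drop the columns that don't add information, cast the timestamp and text columns to proper dtypes, fill the missing values, and normalize and rename the columns used in the later merge.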

In [31]:
# `img` adds nothing to the analysis, so it is dropped to save storage.
# `winner` is True for every row in this dataset; it is kept and renamed below.
drop_columns = ['img']
df = df.drop(columns=drop_columns)

# Object to datetime: the raw values carry a UTC offset, so everything is normalized to UTC
for ts_col in ('published_at', 'updated_at'):
    df[ts_col] = pd.to_datetime(df[ts_col], format='%Y-%m-%dT%H:%M:%S%z', utc=True)

# Object to string (runs after the datetime cast, so only the real text columns are affected)
text_cols = df.select_dtypes(include='object').columns
df = df.astype({col: 'string' for col in text_cols})

# Replace missing text with an explicit placeholder
for col in ('nominee', 'artist', 'workers'):
    df[col] = df[col].fillna('Unknown')


def normalize_text(values: pd.Series) -> pd.Series:
    """Lower-case, trim and collapse internal whitespace so join keys compare reliably."""
    return values.str.lower().str.strip().str.replace(r'\s+', ' ', regex=True)


# Normalize for the future merge
df['artist'] = normalize_text(df['artist'])
df['nominee'] = normalize_text(df['nominee'])
df = df.rename(columns={
    'artist': 'artists',
    'nominee': 'track_name',
    'winner': 'was_nominated',
})

# We check the transformations
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4810 entries, 0 to 4809
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   year           4810 non-null   int64              
 1   title          4810 non-null   string             
 2   published_at   4810 non-null   datetime64[ns, UTC]
 3   updated_at     4810 non-null   datetime64[ns, UTC]
 4   category       4810 non-null   string             
 5   track_name     4810 non-null   string             
 6   artists        4810 non-null   string             
 7   workers        4810 non-null   string             
 8   was_nominated  4810 non-null   bool               
dtypes: bool(1), datetime64[ns, UTC](2), int64(1), string(5)
memory usage: 305.5 KB
None


In [32]:
# Write the cleaned frame back to PostgreSQL, reusing the engine created in the
# first cell instead of rebuilding the connection URL a second time.
#
# NOTE: this replaces `clean_awards`, the same table the first cell reads from.
# After it runs, the table no longer has the `img`, `nominee`, `artist` or `winner`
# columns, so the transformation cell will fail if the notebook is run again
# against the migrated table.
try:
    df.to_sql("clean_awards", engine, if_exists="replace", index=False)
    print("Succesfull migration")
except Exception as e:
    print(f"Error in migration: {e}")
    # Re-raise so a failed write is not mistaken for a successful run
    raise

Succesfull migration
