In [299]:
import sys
!{sys.executable} -m pip install duckdb-engine
!{sys.executable} -m pip install ipython-sql



In [300]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import time

import seaborn
from matplotlib import pyplot

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import utils

In [301]:
%load_ext sql

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


Research Question: Does there exist correlation between the popularity of Pitbull’s music (measured through Billboard awards and spots on Billboard Hot 100) with American unemployment since his breakthrough in 2009 to 2021?

Source datasets:
https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs
https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Pitbull
https://www.kaggle.com/code/qusaybtoush1990/us-unemployment-data-1948-2021

Billboard Hot 100 Data Cleaning:

In [302]:
billboard_df = pd.read_csv('charts.csv')
print (billboard_df.head())

         date  rank           song                         artist  last-week  \
0  2021-11-06     1     Easy On Me                          Adele        1.0   
1  2021-11-06     2           Stay  The Kid LAROI & Justin Bieber        2.0   
2  2021-11-06     3  Industry Baby        Lil Nas X & Jack Harlow        3.0   
3  2021-11-06     4     Fancy Like                   Walker Hayes        4.0   
4  2021-11-06     5     Bad Habits                     Ed Sheeran        5.0   

   peak-rank  weeks-on-board  
0          1               3  
1          1              16  
2          1              14  
3          3              19  
4          2              18  


In [303]:
pitbull_df= billboard_df.loc[billboard_df['artist'] == 'Pitbull']

In [304]:
pitbull_df= pitbull_df.loc[pitbull_df['date'].between('2009-01-01', '2022-10-15')]
%sql SELECT * FROM pitbull_df ORDER BY date ASC

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2009-03-14,66,I Know You Want Me (Calle Ocho),Pitbull,,66,1
1,2009-03-21,54,I Know You Want Me (Calle Ocho),Pitbull,66.0,54,2
2,2009-03-28,50,I Know You Want Me (Calle Ocho),Pitbull,54.0,50,3
3,2009-04-04,44,I Know You Want Me (Calle Ocho),Pitbull,50.0,44,4
4,2009-04-11,37,I Know You Want Me (Calle Ocho),Pitbull,44.0,37,5
...,...,...,...,...,...,...,...
79,2012-08-04,50,Back In Time,Pitbull,38.0,11,16
80,2012-08-11,55,Back In Time,Pitbull,50.0,11,17
81,2012-08-18,66,Back In Time,Pitbull,55.0,11,18
82,2012-08-25,75,Back In Time,Pitbull,66.0,11,19


Webscraping and data cleaning Pitbull's awards

In [305]:
page= requests.get('https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Pitbull')
with open ('pitbull_awards.html', 'w', errors='replace') as writer:
    writer.write (page.text)
with open ('pitbull_awards.html', 'r') as reader:
    pitbull_awards_source = reader.read()
pitbull_awards_page= BeautifulSoup(pitbull_awards_source, 'html.parser')
table= pitbull_awards_page.find_all(['table'], class_= 'wikitable plainrowheaders')
pitbull_awards_df= pd.read_html(str(table))
pitbull_billboard_awards_df=pd.DataFrame(pitbull_awards_df[2])
print (pitbull_billboard_awards_df.head())
pitbull_latin_billboard_awards_df=pd.DataFrame(pitbull_awards_df[3])
print (pitbull_latin_billboard_awards_df.head())

      Year        Nominee / work             Award     Result
0  2011[4]               Pitbull  Top Latin Artist  Nominated
1  2011[4]            "Bon, Bon"    Top Latin Song        Won
2  2012[5]               Pitbull  Top Latin Artist  Nominated
3  2012[5]  "Give Me Everything"    Top Radio Song        Won
4  2012[5]  "Give Me Everything"      Top Rap Song  Nominated
      Year                     Nominee / work  \
0  2009[6]                            Pitbull   
1  2010[7]                            Pitbull   
2  2010[7]  "I Know You Want Me (Calle Ocho)"   
3     2011                            Pitbull   
4     2011                            Pitbull   

                                           Award     Result  
0      Latin Digital Download Artist of the Year        Won  
1  Latin Rhythm Airplay Artist of the Year, Solo  Nominated  
2          Latin Rhythm Airplay Song of the Year  Nominated  
3                                      Social 50  Nominated  
4                      

In [306]:
pitbull_latin_billboard_awards_df['Latin']= 'Yes'
pitbull_billboard_awards_df['Latin']= 'No'
pitbull_billboard_all_awards_df = pd.concat([pitbull_billboard_awards_df, pitbull_latin_billboard_awards_df], ignore_index=True)
pitbull_billboard_all_awards_df['Year'] = pitbull_billboard_all_awards_df['Year'].str[:4]
pd.to_datetime(pitbull_billboard_all_awards_df.Year, format='%Y')
pitbull_billboard_all_awards_df.sort_values(by='Year',ascending=True)
print (pitbull_billboard_all_awards_df)
#Still need to sort, but it's not working

    Year                                     Nominee / work  \
0   2011                                            Pitbull   
1   2011                                         "Bon, Bon"   
2   2012                                            Pitbull   
3   2012                               "Give Me Everything"   
4   2012                               "Give Me Everything"   
5   2012                               "Give Me Everything"   
6   2012                                         "Bon, Bon"   
7   2013                                            Pitbull   
8   2013          "Bailando Por El Mundo" (with Juan Magan)   
9   2014                                            Pitbull   
10  2014                                           "Timber"   
11  2009                                            Pitbull   
12  2010                                            Pitbull   
13  2010                  "I Know You Want Me (Calle Ocho)"   
14  2011                                            Pit

Data Cleaning Unemployment Data

In [307]:
unemployment_df = pd.read_csv('unemployment_rate_data.csv')
unemployment_df['date'] =  pd.to_datetime(unemployment_df['date'], infer_datetime_format= True)
unemployment_df.loc[unemployment_df['date'].between('2009-01-01', '2021-10-01')]

Unnamed: 0,date,unrate,unrate_men,unrate_women,unrate_16_to_17,unrate_18_to_19,unrate_20_to_24,unrate_25_to_34,unrate_35_to_44,unrate_45_to_54,unrate_55_over
732,2009-01-01,8.5,9.7,7.0,20.8,21.7,13.5,9.2,7.4,6.7,5.9
733,2009-02-01,8.9,10.4,7.2,22.8,22.7,13.8,9.8,7.5,7.2,6.1
734,2009-03-01,9.0,10.6,7.3,23.6,20.4,13.9,10.0,7.8,7.2,6.6
735,2009-04-01,8.6,9.9,7.1,21.8,20.4,13.7,9.6,7.5,6.6,6.2
736,2009-05-01,9.1,10.2,7.7,24.2,23.3,15.5,10.1,7.5,6.5,6.3
...,...,...,...,...,...,...,...,...,...,...,...
881,2021-06-01,6.1,6.1,6.1,15.0,12.3,9.9,6.5,5.2,4.6,5.0
882,2021-07-01,5.7,5.5,5.8,12.8,9.9,9.5,6.3,4.8,4.0,4.6
883,2021-08-01,5.3,5.1,5.5,10.7,11.0,9.1,5.8,4.4,4.2,4.1
884,2021-09-01,4.6,4.6,4.5,9.2,12.6,7.7,5.0,3.8,3.7,3.3
