In [1]:
import sys
!{sys.executable} -m pip install duckdb-engine
!{sys.executable} -m pip install ipython-sql



In [2]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import time

import seaborn
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn import utils
from scipy.stats import binom, poisson, norm

In [3]:
%load_ext sql

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

### Billboard Hot 100 Data Cleaning:

The intial data cleaning meant that we had to cut down the Kaggle unemployment data starting from 2009, because that's when Pitbull first appeared on the Hot100.

In [4]:
billboard_df = pd.read_csv('charts.csv')
print (billboard_df.head())

         date  rank           song                         artist  last-week  \
0  2021-11-06     1     Easy On Me                          Adele        1.0   
1  2021-11-06     2           Stay  The Kid LAROI & Justin Bieber        2.0   
2  2021-11-06     3  Industry Baby        Lil Nas X & Jack Harlow        3.0   
3  2021-11-06     4     Fancy Like                   Walker Hayes        4.0   
4  2021-11-06     5     Bad Habits                     Ed Sheeran        5.0   

   peak-rank  weeks-on-board  
0          1               3  
1          1              16  
2          1              14  
3          3              19  
4          2              18  


In [5]:
pitbull_df= billboard_df.loc[billboard_df['artist'] == 'Pitbull']

In [6]:
pitbull_df= pitbull_df.loc[pitbull_df['date'].between('2009-01-01', '2022-10-15')]
%sql SELECT * FROM pitbull_df ORDER BY date ASC

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2009-03-14,66,I Know You Want Me (Calle Ocho),Pitbull,,66,1
1,2009-03-21,54,I Know You Want Me (Calle Ocho),Pitbull,66.0,54,2
2,2009-03-28,50,I Know You Want Me (Calle Ocho),Pitbull,54.0,50,3
3,2009-04-04,44,I Know You Want Me (Calle Ocho),Pitbull,50.0,44,4
4,2009-04-11,37,I Know You Want Me (Calle Ocho),Pitbull,44.0,37,5
...,...,...,...,...,...,...,...
79,2012-08-04,50,Back In Time,Pitbull,38.0,11,16
80,2012-08-11,55,Back In Time,Pitbull,50.0,11,17
81,2012-08-18,66,Back In Time,Pitbull,55.0,11,18
82,2012-08-25,75,Back In Time,Pitbull,66.0,11,19


### Webscraping and data cleaning Pitbull's awards

This involved webscraping Pitbull's award page on Wikipedia. Because we're using the Hot100 as a popularity metric, it made sense to use Billboard Awards as a success metric. Here, we combined the Billboard Latin Awards with the Billboard America Awards to give a fuller picture of Pitbull as an artist. We implemented a binary to ask later down the line if his success in the Latin awards could have any impact on whether or not there was a correlation with unemployment.

In [7]:
page= requests.get('https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Pitbull')
with open ('pitbull_awards.html', 'w', errors='replace') as writer:
    writer.write (page.text)
with open ('pitbull_awards.html', 'r') as reader:
    pitbull_awards_source = reader.read()
pitbull_awards_page= BeautifulSoup(pitbull_awards_source, 'html.parser')
table= pitbull_awards_page.find_all(['table'], class_= 'wikitable plainrowheaders')
pitbull_awards_df= pd.read_html(str(table))
pitbull_billboard_awards_df=pd.DataFrame(pitbull_awards_df[2])
print (pitbull_billboard_awards_df.head())
pitbull_latin_billboard_awards_df=pd.DataFrame(pitbull_awards_df[3])
print (pitbull_latin_billboard_awards_df.head())

      Year        Nominee / work             Award     Result
0  2011[4]               Pitbull  Top Latin Artist  Nominated
1  2011[4]            "Bon, Bon"    Top Latin Song        Won
2  2012[5]               Pitbull  Top Latin Artist  Nominated
3  2012[5]  "Give Me Everything"    Top Radio Song        Won
4  2012[5]  "Give Me Everything"      Top Rap Song  Nominated
      Year                     Nominee / work  \
0  2009[6]                            Pitbull   
1  2010[7]                            Pitbull   
2  2010[7]  "I Know You Want Me (Calle Ocho)"   
3     2011                            Pitbull   
4     2011                            Pitbull   

                                           Award     Result  
0      Latin Digital Download Artist of the Year        Won  
1  Latin Rhythm Airplay Artist of the Year, Solo  Nominated  
2          Latin Rhythm Airplay Song of the Year  Nominated  
3                                      Social 50  Nominated  
4                      

In [8]:
pitbull_latin_billboard_awards_df['Latin']= 'Yes'
pitbull_latin_billboard_awards_df['Month']= '09'
pitbull_billboard_awards_df['Month']= '05'
pitbull_billboard_awards_df['Latin']= 'No'
pitbull_billboard_all_awards_df = pd.concat([pitbull_billboard_awards_df, pitbull_latin_billboard_awards_df], ignore_index=True)
pitbull_billboard_all_awards_df['Year'] = pitbull_billboard_all_awards_df['Year'].str[:4]
pd.to_datetime(pitbull_billboard_all_awards_df.Year, format='%Y')
pd.to_datetime(pitbull_billboard_all_awards_df.Month, format='%m')
pitbull_billboard_all_awards_df=pitbull_billboard_all_awards_df.sort_values(by='Year',ascending=True)
pitbull_billboard_all_awards_df.head(10)
#print(pitbull_billboard_all_awards_df)

Unnamed: 0,Year,Nominee / work,Award,Result,Month,Latin
11,2009,Pitbull,Latin Digital Download Artist of the Year,Won,9,Yes
13,2010,"""I Know You Want Me (Calle Ocho)""",Latin Rhythm Airplay Song of the Year,Nominated,9,Yes
12,2010,Pitbull,"Latin Rhythm Airplay Artist of the Year, Solo",Nominated,9,Yes
0,2011,Pitbull,Top Latin Artist,Nominated,5,No
18,2011,"""Bon, Bon""",Latin Rhythm Airplay,Nominated,9,Yes
17,2011,"""Bon, Bon""",Latin Digital Download of the Year,Nominated,9,Yes
16,2011,Pitbull,Latin Rhythm Albums,Nominated,9,Yes
15,2011,Pitbull,Latin Rhythm Airplay,Nominated,9,Yes
14,2011,Pitbull,Social 50,Nominated,9,Yes
19,2011,"""Armando""",Latin Rhythm Albums,Nominated,9,Yes


In [9]:
pairs = set(tuple(x) for x in pitbull_billboard_all_awards_df[['Year', 'Month']].values.tolist())
for year in range(2009, 2013):
    for month in range(1, 13):
        if (year, month) not in pairs:
            pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.append({'Year': int(year), 'Month': int(month)}, ignore_index=True)
pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.sort_values(by=['Year', 'Month'])
pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.reset_index(drop=True)
pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.fillna(0.0)
pitbull_billboard_all_awards_df['Year'] = pitbull_billboard_all_awards_df['Year'].astype(int)
pitbull_billboard_all_awards_df['Month'] = pitbull_billboard_all_awards_df['Month'].astype(int)
pitbull_billboard_all_awards_df=pitbull_billboard_all_awards_df.replace(to_replace=False,value=0)
pitbull_billboard_all_awards_df=pitbull_billboard_all_awards_df.replace(to_replace=True,value=1)
pitbull_billboard_all_awards_df['date']= pd.to_datetime(pitbull_billboard_all_awards_df[['Year', 'Month']].assign(DAY=1))
pitbull_billboard_all_awards_df=pitbull_billboard_all_awards_df
pitbull_billboard_all_awards_df

  pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.append({'Year': int(year), 'Month': int(month)}, ignore_index=True)
  pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.append({'Year': int(year), 'Month': int(month)}, ignore_index=True)
  pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.append({'Year': int(year), 'Month': int(month)}, ignore_index=True)
  pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.append({'Year': int(year), 'Month': int(month)}, ignore_index=True)
  pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.append({'Year': int(year), 'Month': int(month)}, ignore_index=True)
  pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.append({'Year': int(year), 'Month': int(month)}, ignore_index=True)
  pitbull_billboard_all_awards_df = pitbull_billboard_all_awards_df.append({'Year': int(year), 'Month': int(month)}, ignore_index=True)
  pitbull_billboard_all_awards_df = pitbull_bill

Unnamed: 0,Year,Nominee / work,Award,Result,Month,Latin,date
0,2009,0,0,0,1,0,2009-01-01
1,2009,0,0,0,2,0,2009-02-01
2,2009,0,0,0,3,0,2009-03-01
3,2009,0,0,0,4,0,2009-04-01
4,2009,0,0,0,5,0,2009-05-01
...,...,...,...,...,...,...,...
90,2016,Pitbull & Enrique Iglesias,Tour of the Year,Nominated,9,Yes,2016-09-01
91,2016,"""Dale""",Latin Rhythm Album of the Year,Won,9,Yes,2016-09-01
92,2017,"“El Taxi” (with Sensato, Lil Jon & Osmani Garcia)",Digital Song of the Year,Nominated,9,Yes,2017-09-01
93,2017,Pitbull,"Latin Rhythm Albums Artist of the Year, Solo",Nominated,9,Yes,2017-09-01


In [10]:
pitbull_df.sort_values(by='date')

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
66065,2009-03-14,66,I Know You Want Me (Calle Ocho),Pitbull,,66,1
65953,2009-03-21,54,I Know You Want Me (Calle Ocho),Pitbull,66.0,54,2
65849,2009-03-28,50,I Know You Want Me (Calle Ocho),Pitbull,54.0,50,3
65743,2009-04-04,44,I Know You Want Me (Calle Ocho),Pitbull,50.0,44,4
65636,2009-04-11,37,I Know You Want Me (Calle Ocho),Pitbull,44.0,37,5
...,...,...,...,...,...,...,...
48349,2012-08-04,50,Back In Time,Pitbull,38.0,11,16
48254,2012-08-11,55,Back In Time,Pitbull,50.0,11,17
48165,2012-08-18,66,Back In Time,Pitbull,55.0,11,18
48074,2012-08-25,75,Back In Time,Pitbull,66.0,11,19


### Data Cleaning Unemployment Data

Question is: As unemployment rate changes, what is the likelihood that Pitbull has a song on the Billboard 100?

Since unemployment is calculated on a monthly basis through household surveys, we will be creating a binary variable where 1 = Pitbull appeared on the BB100 chart that month, and 0 otherwise.


In [11]:
unemployment_df = pd.read_csv('unemployment_rate_data.csv')
unemployment_df['date'] =  pd.to_datetime(unemployment_df['date'], infer_datetime_format= True)
unemployment_df.loc[unemployment_df['date'].between('2009-01-01', '2012-12-01')]

Unnamed: 0,date,unrate,unrate_men,unrate_women,unrate_16_to_17,unrate_18_to_19,unrate_20_to_24,unrate_25_to_34,unrate_35_to_44,unrate_45_to_54,unrate_55_over
732,2009-01-01,8.5,9.7,7.0,20.8,21.7,13.5,9.2,7.4,6.7,5.9
733,2009-02-01,8.9,10.4,7.2,22.8,22.7,13.8,9.8,7.5,7.2,6.1
734,2009-03-01,9.0,10.6,7.3,23.6,20.4,13.9,10.0,7.8,7.2,6.6
735,2009-04-01,8.6,9.9,7.1,21.8,20.4,13.7,9.6,7.5,6.6,6.2
736,2009-05-01,9.1,10.2,7.7,24.2,23.3,15.5,10.1,7.5,6.5,6.3
737,2009-06-01,9.7,10.4,8.9,32.1,25.4,16.0,9.8,7.7,7.0,6.8
738,2009-07-01,9.7,10.2,9.2,27.5,23.2,15.4,10.1,7.9,7.4,7.2
739,2009-08-01,9.6,10.1,8.9,24.1,24.3,15.2,10.1,8.1,7.5,6.9
740,2009-09-01,9.5,10.2,8.6,26.2,25.7,15.1,10.1,8.3,7.4,6.5
741,2009-10-01,9.5,10.3,8.5,28.5,25.5,15.3,10.1,8.5,7.1,6.6


In [12]:
%sql up_df << SELECT date, unrate FROM unemployment_df ORDER BY date
up_df['year'] = pd.DatetimeIndex(up_df['date']).year
up_df['month'] = pd.DatetimeIndex(up_df['date']).month
%sql up_df << SELECT unrate, year, month FROM up_df WHERE year between 2009 and 2012
up_df['date']= pd.to_datetime(up_df[['year', 'month']].assign(DAY=1))
up_df

Returning data to local variable up_df
Returning data to local variable up_df


Unnamed: 0,unrate,year,month,date
0,8.5,2009,1,2009-01-01
1,8.9,2009,2,2009-02-01
2,9.0,2009,3,2009-03-01
3,8.6,2009,4,2009-04-01
4,9.1,2009,5,2009-05-01
5,9.7,2009,6,2009-06-01
6,9.7,2009,7,2009-07-01
7,9.6,2009,8,2009-08-01
8,9.5,2009,9,2009-09-01
9,9.5,2009,10,2009-10-01


In [13]:
m_df=pitbull_df
m_df['month'] = pd.DatetimeIndex(m_df['date']).month
m_df['year'] = pd.DatetimeIndex(m_df['date']).year
m_df['day'] = pd.DatetimeIndex(m_df['date']).day
%sql pab_df << SELECT year, month, song, COUNT(day) AS songfreq FROM m_df GROUP BY month, year, song ORDER BY year, month
pab_df

Returning data to local variable pab_df


Unnamed: 0,year,month,song,songfreq
0,2009,3,I Know You Want Me (Calle Ocho),3
1,2009,4,I Know You Want Me (Calle Ocho),4
2,2009,5,I Know You Want Me (Calle Ocho),5
3,2009,6,I Know You Want Me (Calle Ocho),4
4,2009,7,I Know You Want Me (Calle Ocho),4
5,2009,7,Hotel Room Service,4
6,2009,8,Hotel Room Service,5
7,2009,8,I Know You Want Me (Calle Ocho),5
8,2009,9,Hotel Room Service,4
9,2009,9,I Know You Want Me (Calle Ocho),4


We need to account for the months where he did not appear on the Billboard 100:

In [14]:
pab = pab_df
pairs = set(tuple(x) for x in pab[['year', 'month']].values.tolist())
#print(pairs)
for year in range(2009, 2013):
    for month in range(1, 13):
        if (year, month) not in pairs:
            pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
pab = pab.sort_values(by=['year', 'month'])
pab = pab.reset_index(drop=True)
pab = pab.fillna(0.0)
pab['year'] = pab['year'].astype(int)
pab['month'] = pab['month'].astype(int)
pab['appearance']=pab['songfreq']>0
pab=pab.replace(to_replace=False,value=0)
pab=pab.replace(to_replace=True,value=1)
pab['date']= pd.to_datetime(pab[['year', 'month']].assign(DAY=1))
pab_df=pab
pab_df

  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), 'month': int(month)}, ignore_index=True)
  pab = pab.append({'year': int(year), '

Unnamed: 0,year,month,song,songfreq,appearance,date
0,2009,1,0,0.0,0,2009-01-01
1,2009,2,0,0.0,0,2009-02-01
2,2009,3,I Know You Want Me (Calle Ocho),3.0,1,2009-03-01
3,2009,4,I Know You Want Me (Calle Ocho),4.0,1,2009-04-01
4,2009,5,I Know You Want Me (Calle Ocho),5.0,1,2009-05-01
5,2009,6,I Know You Want Me (Calle Ocho),4.0,1,2009-06-01
6,2009,7,I Know You Want Me (Calle Ocho),4.0,1,2009-07-01
7,2009,7,Hotel Room Service,4.0,1,2009-07-01
8,2009,8,Hotel Room Service,5.0,1,2009-08-01
9,2009,8,I Know You Want Me (Calle Ocho),5.0,1,2009-08-01


Now that we have two dataframes that include the relevant information, month and year, we can join them to see side-by-side what the unemployment rate was for a given month, and how many times Pitbull appeared on the billboard 100 in that month (if he appeared)

In [15]:
%sql simplecomp_df << SELECT up_df.year, up_df.date, pab_df.appearance, up_df.month, up_df.unrate, pab_df.songfreq FROM up_df INNER JOIN pab_df ON up_df.date=pab_df.date
simplecomp_df.head(10)

Returning data to local variable simplecomp_df


Unnamed: 0,year,date,appearance,month,unrate,songfreq
0,2009,2009-01-01,0,1,8.5,0.0
1,2009,2009-02-01,0,2,8.9,0.0
2,2009,2009-03-01,1,3,9.0,3.0
3,2009,2009-04-01,1,4,8.6,4.0
4,2009,2009-05-01,1,5,9.1,5.0
5,2009,2009-06-01,1,6,9.7,4.0
6,2009,2009-07-01,1,7,9.7,4.0
7,2009,2009-07-01,1,7,9.7,4.0
8,2009,2009-08-01,1,8,9.6,5.0
9,2009,2009-08-01,1,8,9.6,5.0


In [16]:
%sql unemployaward_df << SELECT up_df.year, up_df.month, up_df.unrate, pitbull_billboard_all_awards_df.Latin FROM up_df INNER JOIN pitbull_billboard_all_awards_df ON up_df.year=pitbull_billboard_all_awards_df.year
unemployaward_df.head(10)

Returning data to local variable unemployaward_df


Unnamed: 0,year,month,unrate,Latin
0,2009,12,9.7,0
1,2009,12,9.7,0
2,2009,12,9.7,0
3,2009,12,9.7,0
4,2009,12,9.7,0
5,2009,12,9.7,0
6,2009,12,9.7,0
7,2009,12,9.7,0
8,2009,12,9.7,0
9,2009,12,9.7,0
