In [263]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import time

import seaborn
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn import utils
import scipy.stats as stats
from scipy.stats import binom, poisson, norm
from sklearn.metrics import log_loss

In [264]:
%load_ext sql

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


## Phase IV

### Introduction

Keikei & Yasmine

### Data description

get data description from Phase 2

### Preregistration statement

From Phase 3

### Data Analysis

Keikei & Yasmine

### Evaluation of significance

Keikei & Yasmine

### Interpretation and conclusions

Interpretation:
Hypothesis 1: Yasmine
Hypothesis 2: Yasmine
Hypothesis 3: Keikei
Conclusions: Yasmine

### Limitations

Keikei

### Acknowledgments

### Questions for reviewers

Yasmine & Keikei

#### Hypothesis #1

There's a correlation between an award in a given year being Latin and the unemployment rate fluctuating by a certain amount from 2009 to 2012.
- We will perform a ppf on the probability that given a song's award is Latin, the unemployment rate would fluctuate by a certain amount.
- We will also utilize a correlation matrix.

In [265]:
unemployaward_df = pd.read_csv('unemployaward_df.csv')
print (unemployaward_df.head())

   Unnamed: 0  unrate  Year  Month        date Latin
0           0     8.5  2009      1  2009-01-01     0
1           1     8.9  2009      2  2009-02-01     0
2           2     9.0  2009      3  2009-03-01     0
3           3     8.6  2009      4  2009-04-01     0
4           4     9.1  2009      5  2009-05-01     0


In [266]:
print (unemployaward_df.loc[unemployaward_df.Latin == 'Yes', 'Latin'].count())
print (unemployaward_df.loc[unemployaward_df.Latin == 'No', 'Latin'].count())

20
7


In [267]:
style_dummies= pd.get_dummies(unemployaward_df, columns = ['Latin'])
frames = [unemployaward_df, style_dummies]
unemployaward_df= pd.concat(frames, axis=1)
unemployaward_df= unemployaward_df.rename(columns= {'Latin_0':'no_award'})
unemployaward_df = unemployaward_df.loc[:,~unemployaward_df.columns.duplicated()].copy()
unemployaward_df = unemployaward_df.drop(columns= ['Unnamed: 0'])
unemployaward_df.head()

Unnamed: 0,unrate,Year,Month,date,Latin,no_award,Latin_No,Latin_Yes
0,8.5,2009,1,2009-01-01,0,1,0,0
1,8.9,2009,2,2009-02-01,0,1,0,0
2,9.0,2009,3,2009-03-01,0,1,0,0
3,8.6,2009,4,2009-04-01,0,1,0,0
4,9.1,2009,5,2009-05-01,0,1,0,0


In [268]:
unemployaward_df[['unrate','Latin_Yes']]
y= unemployaward_df['Latin_Yes']
x= unemployaward_df['unrate'].values.reshape(-1,1)
pitbullaward_logreg = LogisticRegression()
pitbullaward_logreg.fit(x, y)
print (pitbullaward_logreg.coef_, pitbullaward_logreg.intercept_)
unemployaward_df['unrate'] = pitbullaward_logreg.predict_proba(x)[:,0]

[[-0.95830123]] [7.34098937]


In [269]:
pitbull_latin_year_df = unemployaward_df.groupby(["Year"]).Latin_Yes.sum().reset_index()
pitbull_us_year_df = unemployaward_df.groupby(["Year"]).Latin_No.sum().reset_index()

In [270]:
pitbull_latin_year_df['latin_award_zscore'] = stats.zscore(pitbull_latin_year_df['Latin_Yes'])
pitbull_us_year_df['us_award_zscore'] = stats.zscore(pitbull_us_year_df['Latin_No'])
print (pitbull_latin_year_df)
print (pitbull_us_year_df)

   Year  Latin_Yes  latin_award_zscore
0  2009          1           -1.088662
1  2010          2           -0.816497
2  2011          7            0.544331
3  2012         10            1.360828
   Year  Latin_No  us_award_zscore
0  2009         0        -0.855186
1  2010         0        -0.855186
2  2011         2         0.122169
3  2012         5         1.588203


In [271]:
year_frames= [pitbull_latin_year_df, pitbull_us_year_df]
both_years_df= pd.concat(year_frames, axis=1)
both_years_df['Award']= both_years_df['Latin_Yes']+both_years_df['Latin_No']
std= both_years_df['Award'].std()
mean= 20/27
print (std)
print(mean)


6.551081335677848
0.7407407407407407


In [272]:
probabilites= norm.cdf(pitbull_latin_year_df['latin_award_zscore'], loc= 0.7407407407407407, scale= 6.551081335677848)
probabilites= probabilites.reshape(-1,1)
print (probabilites)

[[0.3900257 ]
 [0.40605421]
 [0.488041  ]
 [0.53770522]]


In [273]:
pitbull_unrate_year_df = unemployaward_df.groupby(["Year"]).unrate.mean().reset_index()
pitbull_unrate_year_df
x_new= np.array(pitbull_unrate_year_df['unrate'].values).reshape(-1,1)

#### Hypothesis #2

Given a host of variables in our unemployment data-- age range of those unemployed, their gender, and the cumulative unemployment rate-- we can predict whether or not Pitbull would make it onto the Billboard Charts.
 - We will do a train-test split logistical regression to see if we can train the data set on unemployment-- given a host of variables-- to see if we can predict Pitbull's chart success, as well as understand the relationship and effect different variables have on Pitbull chart success
 - Per feedback, we will utilize K-Folds instead

In [274]:
unemployment_rate_df = pd.read_csv('unemployment_rate_data.csv')
unemployment_rate_df

Unnamed: 0,date,unrate,unrate_men,unrate_women,unrate_16_to_17,unrate_18_to_19,unrate_20_to_24,unrate_25_to_34,unrate_35_to_44,unrate_45_to_54,unrate_55_over
0,1/1/1948,4.0,4.2,3.5,10.8,9.6,6.6,3.6,2.6,2.7,3.6
1,2/1/1948,4.7,4.7,4.8,15.0,9.5,8.0,4.0,3.2,3.4,4.0
2,3/1/1948,4.5,4.5,4.4,13.2,9.3,8.6,3.5,3.2,2.9,3.5
3,4/1/1948,4.0,4.0,4.1,9.9,8.1,6.8,3.5,3.1,2.9,3.2
4,5/1/1948,3.4,3.3,3.4,6.4,7.2,6.3,2.8,2.5,2.3,2.9
...,...,...,...,...,...,...,...,...,...,...,...
882,7/1/2021,5.7,5.5,5.8,12.8,9.9,9.5,6.3,4.8,4.0,4.6
883,8/1/2021,5.3,5.1,5.5,10.7,11.0,9.1,5.8,4.4,4.2,4.1
884,9/1/2021,4.6,4.6,4.5,9.2,12.6,7.7,5.0,3.8,3.7,3.3
885,10/1/2021,4.3,4.2,4.4,8.6,12.7,6.8,4.5,3.6,3.5,3.3


In [275]:
estimator= LogisticRegression()
list_of_variables= ['unrate', 'unrate_20_to_24', 'unrate_men', 'unrate_55_over']
kf = KFold(n_splits= 3, shuffle= False)
x= cars[list_of_variables]
y= cars['body-style_sedan']
my_scorer = 'f1'
result = cross_val_score(estimator, x, y, cv= kf, scoring= my_scorer)
print (result)
print (np.mean(result))
print (np.std(result))

NameError: name 'KFold' is not defined

#### Hypothesis #3
We want to know the magnitude of the effect, if any, that the unemployment rate on a given month, both total average per month and by different categories, has on Pitbull popularity as measured by search trends. We do this by testing the hypothesis that the coefficients on the different unemployment measures are zero, having no effect on search term popularity. 
* We run a linear regression since the dependent variable, interest index, is discrete, ranging from 7 to 100

In [None]:
ucomplex_df=pd.read_csv('ucomplex_df.csv')
gtrends_df=pd.read_csv('gtrends_df.csv')

In [None]:
ucomplex_df.head(5)

Unnamed: 0.1,Unnamed: 0,date,unrate,u_men,u_women,u16_17,u18_19,u20_24,u25_34,u35_44,u45_54
0,0,1948-01-01,4.0,4.2,3.5,10.8,9.6,6.6,3.6,2.6,2.7
1,1,1948-02-01,4.7,4.7,4.8,15.0,9.5,8.0,4.0,3.2,3.4
2,2,1948-03-01,4.5,4.5,4.4,13.2,9.3,8.6,3.5,3.2,2.9
3,3,1948-04-01,4.0,4.0,4.1,9.9,8.1,6.8,3.5,3.1,2.9
4,4,1948-05-01,3.4,3.3,3.4,6.4,7.2,6.3,2.8,2.5,2.3


In [None]:
gtrends_df.head(5)

Unnamed: 0.1,Unnamed: 0,Interest Index,Date,Month,Year
0,1,7,2004-01-01,1,2004
1,2,9,2004-02-01,2,2004
2,3,10,2004-03-01,3,2004
3,4,13,2004-04-01,4,2004
4,5,20,2004-05-01,5,2004


In [None]:
%sql uandg_df << SELECT * FROM ucomplex_df INNER JOIN gtrends_df ON ucomplex_df.date=gtrends_df.Date
uandg_df.head(5)

Returning data to local variable uandg_df


Unnamed: 0.2,Unnamed: 0,date,unrate,u_men,u_women,u16_17,u18_19,u20_24,u25_34,u35_44,u45_54,Unnamed: 0.1,Interest Index,Date,Month,Year
0,672,2004-01-01,6.3,6.7,5.8,18.4,16.9,10.7,6.6,5.0,4.5,1,7,2004-01-01,1,2004
1,673,2004-02-01,6.0,6.4,5.5,17.9,16.6,10.1,6.2,4.9,4.3,2,9,2004-02-01,2,2004
2,674,2004-03-01,6.0,6.4,5.6,20.4,14.7,10.1,6.2,5.2,4.3,3,10,2004-03-01,3,2004
3,675,2004-04-01,5.4,5.6,5.1,21.1,13.3,8.7,5.4,4.3,4.0,4,13,2004-04-01,4,2004
4,676,2004-05-01,5.3,5.5,5.1,22.7,14.2,9.9,5.2,3.9,3.6,5,20,2004-05-01,5,2004


In [None]:
uandg_df.drop('Date',axis=1,inplace=True)

In [None]:
uandg_df.drop('Unnamed: 0',axis=1,inplace=True)

In [None]:
uandg_df.head(5)

Unnamed: 0,date,unrate,u_men,u_women,u16_17,u18_19,u20_24,u25_34,u35_44,u45_54,Interest Index,Month,Year
0,2004-01-01,6.3,6.7,5.8,18.4,16.9,10.7,6.6,5.0,4.5,7,1,2004
1,2004-02-01,6.0,6.4,5.5,17.9,16.6,10.1,6.2,4.9,4.3,9,2,2004
2,2004-03-01,6.0,6.4,5.6,20.4,14.7,10.1,6.2,5.2,4.3,10,3,2004
3,2004-04-01,5.4,5.6,5.1,21.1,13.3,8.7,5.4,4.3,4.0,13,4,2004
4,2004-05-01,5.3,5.5,5.1,22.7,14.2,9.9,5.2,3.9,3.6,20,5,2004


In [None]:
intcol=uandg_df.pop('Interest Index')
mcol=uandg_df.pop('Month')
ycol=uandg_df.pop('Year')
uandg_df.insert(0, 'Year', ycol)

In [None]:
uandg_df.insert(1, 'Month', mcol)
uandg_df.insert(2, 'Interest Index', intcol)

In [None]:
uandg_df.head(5)

Unnamed: 0,Year,Month,Interest Index,date,unrate,u_men,u_women,u16_17,u18_19,u20_24,u25_34,u35_44,u45_54
0,2004,1,7,2004-01-01,6.3,6.7,5.8,18.4,16.9,10.7,6.6,5.0,4.5
1,2004,2,9,2004-02-01,6.0,6.4,5.5,17.9,16.6,10.1,6.2,4.9,4.3
2,2004,3,10,2004-03-01,6.0,6.4,5.6,20.4,14.7,10.1,6.2,5.2,4.3
3,2004,4,13,2004-04-01,5.4,5.6,5.1,21.1,13.3,8.7,5.4,4.3,4.0
4,2004,5,20,2004-05-01,5.3,5.5,5.1,22.7,14.2,9.9,5.2,3.9,3.6
