# Notebook #3: Exploratory Data Analysis

In [None]:
#!pip install numpy pandas numerize lets_plot

In [8]:
import numpy as np
import pandas as pd
import sqlite3

from numerize import numerize as nz

from lets_plot import * 
LetsPlot.setup_html()


# 3.1 Connect to the SQLite database

In [9]:

# Open the cleaned Spotify artist database and read both tables. The
# try/finally guarantees the connection is released even if a query raises.
conn = sqlite3.connect('../data/clean/Spotifyartist.db')
try:
    df_artists = pd.read_sql('SELECT * FROM artists', conn)
    genres_df = pd.read_sql('SELECT * FROM genres', conn)
finally:
    conn.close()

# Join genres onto artists. No `on=` is given, so pandas joins on the column
# names the two frames have in common (inner join by default).
# NOTE(review): the shared key column(s) are not visible here -- confirm the
# tables share only the intended key.
df_merged = genres_df.merge(df_artists)


# 3.2 Create necessary dataframes and filter data

In [10]:

# Split the ', '-separated `genres` string into a list, then expand it so each
# list element gets its own row. `df_merged` is rebound to the expanded frame.
df_merged = (
    df_merged
    .assign(genres=df_merged['genres'].str.split(', '))
    .explode('genres')
)

# Total followers per genre; `as_index=False` keeps `genres` as a column.
df_genre_followers = (
    df_merged
    .groupby('genres', as_index=False)['followers_total']
    .sum()
)

# The ten genres with the largest follower totals.
df_top_10_genres = df_genre_followers.nlargest(n=10, columns='followers_total')

print(df_top_10_genres)

                      genres  followers_total
9       contemporary country        119267500
10                   country        100522978
14              country road         75512918
21       modern country rock         59825683
6   classic oklahoma country         20119238
24            outlaw country         19638533
11              country dawn         13520456
26                      rock         13041752
20        modern country pop          9741905
15              country rock          8970417


In [11]:

# Keep only rows whose genre is exactly 'country' (compared in lower case);
# other variations such as 'country rock' do not match.
is_plain_country = df_merged['genres'].str.lower().eq('country')
df_country_artists = df_merged.loc[is_plain_country]

# The ten rows with the highest `popularity` among those artists.
df_top_10_country_artists = df_country_artists.nlargest(n=10, columns='popularity')

summary_columns = ['name', 'popularity', 'followers_total']
print(df_top_10_country_artists[summary_columns])


                    name  popularity  followers_total
38            Luke Combs          86         10911264
20            Luke Bryan          76          8697929
27            Kane Brown          75          5467138
2           Thomas Rhett          74          4708139
18  Florida Georgia Line          74          5422352
24             Jon Pardi          74          2030314
46         Blake Shelton          74          7172304
49         George Strait          74          4380567
6           Alan Jackson          73          2759900
16          Dolly Parton          73          2534262


# 3.3 Who is the most popular "country" artist?

In [26]:
# Graph 1: bar chart of follower totals for the rows in
# df_top_10_country_artists (the top 10 "country" artists by popularity).
most_popular_artists = (
    ggplot(data=df_top_10_country_artists, mapping=aes(
                    x='name',
                    y='followers_total',
                    fill='name')
          ) +
    # stat='identity' plots the y values as given instead of counting rows.
    geom_bar(stat='identity') +
    scale_fill_viridis() +
    theme_minimal() +
    theme(axis_text_x=element_text(angle=90, hjust=1)) +
    labs(
        title='Top 10 Most Popular Country Artists',
        x='Artist Name',
        y='Number of followers',
        # The bars are coloured through the `fill` aesthetic, so the legend
        # title has to be keyed on `fill`; `color` is not mapped in this plot.
        fill='Artist Name'
    )
)
most_popular_artists.show()

# Save the plot to the figures folder
ggsave(plot=most_popular_artists, filename='most_popular_artists.svg', path='../docs/figures/')


'/Users/janekakalec/Desktop/Final/me204-2024-project-janekakalec/docs/figures/most_popular_artists.svg'

# 3.4 What division of country music has the most followers?

In [23]:
# Bar chart of follower totals for the rows in df_top_10_genres.

# Aesthetic mapping: one bar per genre, bar height and fill driven by the data.
genre_mapping = aes(x='genres', y='followers_total', fill='genres')

# Title and axis captions.
genre_labels = labs(
    title='Top 10 Country Genres by Total Followers',
    x='Country Genre',
    y='Number of Followers',
)

# Rotate the x tick labels so long genre names do not overlap.
rotated_x_labels = theme(axis_text_x=element_text(angle=90, hjust=1))

top_genres_chart = (
    ggplot(df_top_10_genres, mapping=genre_mapping)
    + geom_bar(stat='identity')
    + scale_fill_viridis()
    + rotated_x_labels
    + genre_labels
)

# Render the chart in the notebook.
top_genres_chart.show()

# Write the chart to the figures folder as SVG.
ggsave(plot=top_genres_chart, filename='top_genres_chart.svg', path='../docs/figures/')


'/Users/janekakalec/Desktop/Final/me204-2024-project-janekakalec/docs/figures/top_genres_chart.svg'