In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as graph_objects
import seaborn as sns
from plotly.subplots import make_subplots
from wordcloud import WordCloud

### Top words word clouds

In [None]:
# Load the four pre-computed "top 20 words" tables (plain and filtered,
# grouped by genre and by artist). Paths are relative to the working directory.
(
    top_20_words_genre_df,
    top_20_filtered_words_genre_df,
    top_20_words_artist_df,
    top_20_filtered_words_artist_df,
) = map(
    pd.read_csv,
    (
        'dataframes/top_20_words_by_genre.csv',
        'dataframes/top_20_filtered_words_by_genre.csv',
        'dataframes/top_20_words_by_artist.csv',
        'dataframes/top_20_filtered_words_by_artist.csv',
    ),
)

In [None]:
# Names of the word / count column pairs: word0..word19 and word0_count..word19_count.
n = 20
word_cols = [f'word{ind}' for ind in range(n)]
count_cols = [f'word{ind}_count' for ind in range(n)]

In [None]:
# Inspect the row index of the per-genre top-words table.
top_20_words_genre_df.index

In [None]:
# Largest word count in the row labelled 0 of the per-genre table.
max(top_20_words_genre_df.loc[0, count_cols])

In [None]:
# Show only the count columns of the per-genre table.
top_20_words_genre_df.loc[:, count_cols]

In [None]:
# Both tables were read without `index_col`, so their first column is named
# 'Unnamed: 0'. Give it the name 'genre', which later cells filter on.
# Rebinding (instead of inplace=True) keeps the step explicit; rename ignores
# a missing column, so re-running the cell is harmless.
top_20_words_genre_df = top_20_words_genre_df.rename(columns={'Unnamed: 0': 'genre'})
top_20_filtered_words_genre_df = top_20_filtered_words_genre_df.rename(
    columns={'Unnamed: 0': 'genre'}
)

In [None]:
# Preview the first rows of the per-genre table.
top_20_words_genre_df.head()

In [None]:
# Distinct values of the 'genre' column, in order of first appearance.
genres = top_20_words_genre_df['genre'].unique()

In [None]:
# Rows of the first genre only; used below to prototype a single word cloud.
first_genre_mask = top_20_words_genre_df['genre'] == genres[0]
test_df = top_20_words_genre_df[first_genre_mask]

In [None]:
# Sanity check of the sample's (rows, columns).
test_df.shape

In [None]:
# Value from the first row of test_df for every word column.
test_words = [test_df[column_name].iat[0] for column_name in word_cols]

In [None]:
# Value from the first row of test_df for every count column.
test_counts = [test_df[column_name].iat[0] for column_name in count_cols]

In [None]:
# Echo the extracted words for a visual check.
test_words

In [None]:
# word -> count mapping consumed by WordCloud.generate_from_frequencies.
d = dict(zip(test_words, test_counts))

In [None]:
# Build a word cloud (white background, width=800, height=400) from the
# word -> count mapping `d`.
wordcloud = WordCloud(background_color = "white", width=800, height=400)
# Last expression of the cell, so the notebook echoes whatever this call returns.
wordcloud.generate_from_frequencies(frequencies=d)

In [None]:
# Show the prototype word cloud as an image via Plotly Express.
fig = px.imshow(wordcloud)
fig.show()

In [None]:
# Collects one WordCloud per genre; filled by the loop in the next cell.
genre_wordclouds = []

In [None]:
# Build one word cloud per genre from the *filtered* top-words table.
# The list is (re)created here so that re-running this cell does not append
# a second copy of every word cloud to the existing list.
genre_wordclouds = []
for genre in genres:
    genre_df = top_20_filtered_words_genre_df[top_20_filtered_words_genre_df['genre'] == genre]
    # First matching row supplies the word / count pairs for this genre.
    genre_words = [genre_df[word].values[0] for word in word_cols]
    genre_counts = [genre_df[count].values[0] for count in count_cols]

    # word -> count mapping expected by generate_from_frequencies.
    d = dict(zip(genre_words, genre_counts))

    wordcloud = WordCloud(background_color = "white", width=800, height=400)
    wordcloud.generate_from_frequencies(frequencies=d)
    genre_wordclouds.append(wordcloud)

In [None]:
# Stack the genre word clouds vertically, one subplot row per genre.
# The row count follows `genres` instead of a hard-coded 4, and the titles
# are handed over as a plain Python list.
genre_titles = list(genres)
fig = make_subplots(rows=len(genre_titles), cols=1, subplot_titles=genre_titles)
for i in range(len(genre_titles)):
    # Subplot rows are 1-based, while genre_wordclouds is indexed from 0.
    fig.add_trace(graph_objects.Image(z=genre_wordclouds[i]), row=i + 1, col=1)

In [None]:
# 400 units of figure height per genre row (matches the height=400 used for
# each word cloud), derived from the number of genres rather than a fixed 4.
fig.update_layout(height=len(genres) * 400)
fig.show()

### Top n-grams word clouds

In [None]:
# Load the per-artist and per-genre n-gram tables; the first CSV column is the index.
artist_ngrams_df, genre_ngrams_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'dataframes/artist_ngrams.csv',
        'dataframes/genre_ngrams.csv',
    )
)

In [None]:
# Distinct values of the 'Artist' column as a plain list, then how many there are.
artists = list(artist_ngrams_df['Artist'].unique())
len(artists)

In [None]:
# Rows of the first artist only; rebinds `test_df`, which the n-gram
# word-cloud cell below reads.
first_artist_mask = artist_ngrams_df['Artist'] == artists[0]
test_df = artist_ngrams_df[first_artist_mask]

In [None]:
# Echo the single-artist sample.
test_df

In [None]:
# Word clouds of the top n-grams for the sample in `test_df`, one subplot row
# per n-gram length.
ngram_lens = [2, 3, 4]
ngrams_n = 20
# NOTE(review): `n` is not read again in this cell; it is kept because other
# cells may rely on it. It overwrites the `n = 20` set earlier in the notebook.
n = len(ngram_lens) * ngrams_n

# Rows and figure height both follow len(ngram_lens). Previously 4 rows were
# requested while only 3 were filled, leaving an empty subplot at the bottom.
test_gram_wordclouds_fig = make_subplots(
    rows=len(ngram_lens),
    cols=1,
    subplot_titles=[str(gram_len) for gram_len in ngram_lens],
)

for row_number, gram_len in enumerate(ngram_lens, start=1):
    # Column names follow the pattern ngram_<len>_<rank> / count_<len>_<rank>.
    gram_cols = [f'ngram_{gram_len}_{ind}' for ind in range(ngrams_n)]
    gram_count_cols = [f'count_{gram_len}_{ind}' for ind in range(ngrams_n)]

    # Values are taken from the first row of test_df.
    gram_words = [test_df[word].values[0] for word in gram_cols]
    gram_counts = [test_df[count].values[0] for count in gram_count_cols]

    # n-gram -> count mapping expected by generate_from_frequencies.
    d = dict(zip(gram_words, gram_counts))

    wordcloud = WordCloud(background_color = "white", width=800, height=400)
    wordcloud.generate_from_frequencies(frequencies=d)
    test_gram_wordclouds_fig.add_trace(graph_objects.Image(z=wordcloud), row=row_number, col=1)

test_gram_wordclouds_fig.update_layout(height=len(ngram_lens) * 400)
test_gram_wordclouds_fig.show()

In [None]:
# Show the working directory: the relative 'dataframes/...' paths passed to
# pd.read_csv resolve against it. (`os` is imported in the top import cell.)
os.getcwd()

In [None]:
# Per-artist statistics from the 'sum' folder; the first CSV column is the index.
artist_sum_df = pd.read_csv('dataframes/group_stats/sum/combined_artists.csv', index_col=0)

In [None]:
# Per-artist statistics from the 'mean' folder; the first CSV column is the index.
artist_mean_df = pd.read_csv('dataframes/group_stats/mean/combined_artists.csv', index_col=0)

In [None]:
# Column names used for the scatter plot: x axis, y axis and colour grouping.
topic_X = 'manual_joy_count'
topic_Y = 'manual_sadness_count'
colorby = 'genre'

In [None]:
# Scatter of the two selected columns of artist_sum_df, coloured by `colorby`.
# The figure is only built here; it is not displayed by this cell.
scatter_fig = px.scatter(artist_sum_df, x=topic_X, y=topic_Y, color=colorby)

In [None]:
# scatter_fig.show()

In [None]:
# Preview the first rows of the per-artist mean table.
artist_mean_df.head()

In [None]:
# Names of all columns whose name ends with 'percent'.
cols = [name for name in artist_mean_df.columns if name.endswith('percent')]

In [None]:
# Echo the selected percent columns.
cols

In [None]:
# Percent columns whose name does not contain 'gendered'.
cols_no_gendered = [name for name in cols if 'gendered' not in name]

In [None]:
# Mean of every percent column per value of the 'gender' column.
gender_and_percents = artist_mean_df[['gender', *cols]]
gender_grouped = gender_and_percents.groupby(['gender']).mean()

In [None]:
# Echo the per-gender means.
gender_grouped

In [None]:
# Per-column means of the 'female' group (a Series indexed by column name).
female_row = gender_grouped.loc['female', :]

In [None]:
# Negate the 'female' means so their bars extend to the left of zero in the
# spine chart. The value is recomputed from gender_grouped rather than from
# female_row itself, so re-running this cell does not flip the sign back.
female_row = gender_grouped.loc['female', :] * -1

In [None]:
# Echo the negated 'female' values.
female_row

In [None]:
# Per-column means of the 'male' group (a Series indexed by column name).
male_row = gender_grouped.loc['male', :]

In [None]:
# Horizontal "spine" chart: one bar per non-gendered percent column for each
# gender. female_row holds negated values, so its bars point left of zero.
#
# male_row / female_row are indexed by every column in `cols`, while the y
# labels are `cols_no_gendered`. Each row is therefore subset by label so that
# every value is paired with its own column name and both lists have equal length.
bar_outline = dict(color='rgba(0,0,0,1.0)', width=0.5)
spine_fig = graph_objects.Figure(
    data=[
        graph_objects.Bar(
            name=gender_name,
            y=cols_no_gendered,
            x=gender_row[cols_no_gendered].tolist(),
            orientation='h',
            marker=dict(color=bar_color, line=bar_outline),
            hoverinfo='none',
        )
        for gender_name, gender_row, bar_color in (
            ('male', male_row, '#DD5555'),
            ('female', female_row, '#5555DD'),
        )
    ],
)

In [None]:
# Switch the bar mode to 'relative'. As the last expression of the cell, the
# value returned by update_layout is what the notebook displays.
spine_fig.update_layout(barmode='relative')

### Artist metadata explorations

In [None]:
# Combined table from 'with_counts'; the first CSV column is the index.
df = pd.read_csv('dataframes/with_counts/combined_count.csv', index_col=0)

In [None]:
# Preview the first rows of the combined table.
df.head()

In [None]:
# Column names of interest for the metadata exploration.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
columns = ['unique_words', 'total_words', 'featured_count', 'producer_count', 'writer_count']

In [None]:
# Table read from genre_mean_df.csv; the first CSV column is the index.
mean_df = pd.read_csv('dataframes/group_stats/genre_mean_df.csv', index_col=0)

In [None]:
# Bar chart of the 'unique_words' column against the frame's index
# (presumably one bar per genre, given the file name — confirm).
px.bar(mean_df, x=mean_df.index, y=mean_df['unique_words'])