In [7]:
import pandas as pd
import matplotlib.pyplot as plt

def select_data(tracks_csv, albums_csv):
    """Read the tracks and albums CSV files and keep only the columns of interest.

    Parameters
    ----------
    tracks_csv, albums_csv
        Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns
    -------
    tuple
        ``(df_tracks, df_albums)``, each restricted to a fixed list of columns,
        in the order given below.

    Raises
    ------
    KeyError
        If one of the listed columns is missing from the corresponding file.
    """
    track_columns = ['album_id', 'artists_id', 'duration_ms', 'id', 'name', 'popularity']
    album_columns = ['album_type', 'id', 'name', 'release_date', 'total_tracks']

    tracks_raw = pd.read_csv(tracks_csv)
    albums_raw = pd.read_csv(albums_csv)

    return tracks_raw.loc[:, track_columns], albums_raw.loc[:, album_columns]


def get_time_popularity(df_tracks):
    """Average track popularity per duration bin.

    Bins are half-open ``[lower, upper)`` intervals in milliseconds: one bin
    for 0-120 s, 10-second bins from 120 s to 360 s, and a final bin for
    360-1000 s. Tracks of 1,000,000 ms or longer fall into no bin.

    Parameters
    ----------
    df_tracks : DataFrame with ``duration_ms`` and ``popularity`` columns.

    Returns
    -------
    tuple of (list of str, list of float)
        Bin labels formatted as ``"<lower>-<upper>"`` in seconds, and the mean
        popularity of each bin (NaN for an empty bin), in the same order.
    """
    # 0 and 1000000 are sentinel outer edges so every bin has a lower and an upper bound.
    edges_ms = [0, *range(120000, 360001, 10000), 1000000]

    durations = df_tracks['duration_ms']
    time_sequences = []
    time_popularity = []

    # Walk over consecutive pairs of edges: (edge[0], edge[1]), (edge[1], edge[2]), ...
    for lower, upper in zip(edges_ms, edges_ms[1:]):
        in_bin = (durations >= lower) & (durations < upper)
        time_popularity.append(df_tracks.loc[in_bin, 'popularity'].mean())
        time_sequences.append(f"{int(lower/1000)}-{int(upper/1000)}")

    return time_sequences, time_popularity


def create_df_time(df_tracks):
    """Tabulate the output of ``get_time_popularity`` as a DataFrame.

    Returns a frame indexed by the duration-bin labels (index name
    "Track length in seconds") with a single column "Average popularity",
    rounded to two decimals.
    """
    bin_labels, bin_means = get_time_popularity(df_tracks)

    df_time = pd.DataFrame(
        {"Average popularity": bin_means},
        index=pd.Index(bin_labels, name="Track length in seconds"),
    )
    # Two decimals are enough for display purposes.
    df_time["Average popularity"] = df_time["Average popularity"].round(2)
    return df_time


def create_bar_plot(df_time):
    """Show a bar chart with one bar per row of ``df_time``.

    The y-axis is clipped to 57-66 so that the differences between the bar
    tops are visible; x tick labels are rotated 45 degrees.
    """
    fig, ax = plt.subplots(figsize=(11,4))  # wide, short canvas
    df_time.plot.bar(ax=ax)
    ax.set_ylim(57, 66)  # zoom in on the tops of the bars
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
    plt.show()

In [1]:
from langdetect import detect 

def safe_detect(title):
    """Return the language code ``detect`` assigns to ``title``.

    Some titles consist only of digits or symbols and cannot be classified;
    ``detect`` raises for those. In that case the sentinel string
    "not english" is returned. Since it differs from 'en', such titles are
    dropped by the ``== 'en'`` filter in ``get_english_tracks``.
    """
    try:
        return detect(title)
    except Exception:
        # Deliberately not a bare ``except``: that would also swallow
        # KeyboardInterrupt/SystemExit and make the long-running language
        # scan impossible to interrupt.
        return "not english"

# NOTE: get_english_tracks takes about 45 seconds to run
def get_english_tracks(df_tracks):
    """Keep only the tracks whose ``name`` is detected as English.

    Each title is classified with ``safe_detect``; rows whose result is 'en'
    are kept. The returned frame has a fresh 0..n-1 index, which later
    label-based lookups rely on.
    """
    detected_language = df_tracks['name'].apply(safe_detect)
    return df_tracks[detected_language == 'en'].reset_index(drop=True)

In [14]:
from wordcloud import STOPWORDS
import string
import statistics


def select_only_title(list_of_words):
    """Cut a tokenised title at the first token containing '(', '[' or '-'.

    Returns the tokens before that token (the token itself is dropped). If no
    token contains one of the markers, the input list is returned unchanged.
    """
    markers = ('(', '[', '-')
    for position, token in enumerate(list_of_words):
        if any(marker in token for marker in markers):
            return list_of_words[:position]
    return list_of_words


def correct_words(word):
    """Lower-case ``word`` and strip every character that is not a-z.

    Digits, punctuation, whitespace and non-ASCII letters are all removed.
    """
    return "".join(ch for ch in word.lower() if ch in string.ascii_lowercase)


def fill_dict_word_pop(word, index, word_popularity, stopwords, df_english):
    """Record the popularity of track ``index`` under the cleaned form of ``word``.

    The word is normalised with ``correct_words``. Stopwords and words of
    fewer than two characters are ignored. Otherwise the value of
    ``df_english.loc[index, 'popularity']`` is appended to the list stored
    under the cleaned word in ``word_popularity`` (created on first use).

    The dictionary is modified in place and also returned.
    """
    cleaned = correct_words(word)
    if len(cleaned) <= 1 or cleaned in stopwords:
        return word_popularity

    # Look the value up before touching the dict so a failed lookup leaves it unchanged.
    track_popularity = df_english.loc[index, 'popularity']
    word_popularity.setdefault(cleaned, []).append(track_popularity)
    return word_popularity


def get_word_popularity(df_english):
    """Compute mean popularity and frequency for every word used in track titles.

    For each row of ``df_english`` the ``name`` is split on whitespace, cut at
    the first bracket/dash token by ``select_only_title``, and every remaining
    word is recorded through ``fill_dict_word_pop``.

    Parameters
    ----------
    df_english : DataFrame with ``name`` and ``popularity`` columns. The index
        is used for label-based lookups (``.loc``).

    Returns
    -------
    dict
        ``{word: [mean_popularity, number_of_occurrences]}``.
    """
    # A set gives constant-time membership tests; the check runs once per title word.
    stopwords = set(STOPWORDS)
    # Seasonal words are excluded in addition to the generic stopwords.
    stopwords.update(['christmas', 'snow', 'snowman', 'white', 'mistletoe', 'xmas', 'santa', 'claus', 'tree', 'merry'])

    word_popularity = {}

    for index in df_english.index:
        words = select_only_title(df_english.loc[index, 'name'].split())
        for word in words:
            word_popularity = fill_dict_word_pop(word, index, word_popularity, stopwords, df_english)

    # statistics.mean converts its result back to the element type. With NumPy
    # integer scalars (what ``.loc`` yields for an integer column) that
    # conversion drops the fractional part, so convert to float first.
    return {
        word: [statistics.mean([float(value) for value in values]), len(values)]
        for word, values in word_popularity.items()
    }


def get_df_word_popularity(df_english):
    """Return the word statistics as a DataFrame with one row per word.

    The result of ``get_word_popularity`` is laid out with the words as the
    index and two columns, "Popularity" and "Frequency".
    """
    stats_by_word = get_word_popularity(df_english)

    # Built with one column per word and two labelled rows, then flipped so words become rows.
    stats_wide = pd.DataFrame(stats_by_word, index=["Popularity", "Frequency"])
    return stats_wide.T

In [15]:
def get_worst_best_dfs(df_freq_word_pop):
    """Pick the 20 most and 20 least popular rows.

    Parameters
    ----------
    df_freq_word_pop : DataFrame with a ``Popularity`` column; the index
        holds the row labels (the words).

    Returns
    -------
    tuple
        ``(df_best, df_worst, best_words, worst_words)``. The two frames have
        their index reset to 0..n-1; the two lists hold the original index
        labels in the same order as the frame rows.
    """
    top_n = 20

    best_rows = df_freq_word_pop.sort_values('Popularity', ascending = False).head(top_n)
    worst_rows = df_freq_word_pop.sort_values('Popularity').head(top_n)

    # Capture the labels before the index is replaced by a positional one.
    best_words = list(best_rows.index)
    worst_words = list(worst_rows.index)

    return (
        best_rows.reset_index(drop=True),
        worst_rows.reset_index(drop=True),
        best_words,
        worst_words,
    )
    

def show_worst_best_words(df_freq_word_pop):
    """Build a side-by-side table of the best and worst words.

    Uses ``get_worst_best_dfs`` and returns a frame with six columns:
    word, popularity and frequency for the best rows, followed by the same
    three for the worst rows.
    """
    best, worst, best_labels, worst_labels = get_worst_best_dfs(df_freq_word_pop)

    # Insertion order of the dict fixes the column order of the result.
    table = {}
    table["Best words"] = best_labels
    table["Best popularity"] = best["Popularity"]
    table["Best frequency"] = best["Frequency"]
    table["Worst words"] = worst_labels
    table['Worst popularity'] = worst['Popularity']
    table['Worst frequency'] = worst['Frequency']

    return pd.DataFrame(table)

def show_result(df_word_popularity):
    """Return the best/worst word table, restricted to words seen at least 10 times."""
    frequent_enough = df_word_popularity['Frequency'] >= 10
    return show_worst_best_words(df_word_popularity[frequent_enough])

In [4]:
import plotly.express as px

def merge_df_album_track(df_tracks, df_albums):
    """Join every track with its album and keep the columns used downstream.

    Tracks are matched to albums on ``df_tracks.album_id == df_albums.id``
    (default inner join). Column names present in both frames get the
    suffixes "_track" / "_album".

    Returns a frame with the columns ``album_id``, ``popularity``,
    ``album_type``, ``id_track``, ``total_tracks`` and ``release_date``.
    """
    kept_columns = ["album_id", "popularity","album_type", "id_track", "total_tracks", "release_date"]

    joined = df_tracks.merge(
        df_albums,
        left_on="album_id",
        right_on="id",
        suffixes=("_track", "_album"),
    )
    return joined.loc[:, kept_columns]


def get_album_type_pop(df_album_track):
    """Summarise popularity and album size per album type.

    Groups by ``album_type`` and returns three columns:
    ``popularity`` (mean), ``total_tracks_mean`` (mean) and
    ``total_tracks_freq`` (non-null count of ``total_tracks`` per type).
    """
    means_by_type = (
        df_album_track.loc[:, ["popularity", "album_type", "total_tracks"]]
        .groupby("album_type")
        .mean()
    )
    counts_by_type = (
        df_album_track.loc[:, ["album_type", "total_tracks"]]
        .groupby("album_type")
        .count()
    )

    # Both frames share the column name total_tracks, hence the suffixes.
    return means_by_type.merge(
        counts_by_type,
        on = "album_type",
        suffixes = ["_mean", "_freq"],
    )


def get_px_scatter(df_album_type):
    """Render the album-type scatter plot to a PNG and return its file name.

    x is ``total_tracks_mean``, y is ``popularity``, marker size is
    ``total_tracks_freq`` and each point is labelled with its index value.
    The image is written to "px_scatter_pic.png" in the working directory.
    """
    output_path = "px_scatter_pic.png"

    scatter = px.scatter(
        df_album_type,
        x = "total_tracks_mean",
        y = "popularity",
        size = "total_tracks_freq",
        text = df_album_type.index,
    )
    scatter.update_traces(textposition="top center")
    scatter.write_image(output_path)
    return output_path

In [None]:
import seaborn as sns

def get_album_size_pop(df_album_track):
    """Return ``popularity`` and ``total_tracks`` for rows whose album_type is "album"."""
    is_album = df_album_track["album_type"] == "album"
    return df_album_track.loc[is_album, ["popularity","total_tracks"]]

def get_regplot(df_album_size):
    """Draw a regression scatter plot of popularity against total_tracks.

    A new 12x5 figure is created, points are drawn small (size 5), and the
    x-axis is limited to 0-50.
    """
    plt.figure(figsize=(12,5))
    ax = sns.regplot(
        data=df_album_size,
        x = "total_tracks",
        y = "popularity",
        scatter_kws = {"s":5},
    )
    ax.set_xlim(0, 50)

In [None]:
def extract_year(date):
    """Return the year of a release-date string as an integer.

    A four-character string is taken to be the year itself; anything else is
    parsed with ``pd.to_datetime`` and its ``year`` is returned.
    """
    return int(date) if len(date) == 4 else pd.to_datetime(date).year
        
def add_release_year(df_album_track):
    """Add a ``release_year`` column derived from ``release_date``.

    The input frame is modified in place (the new column is written onto it)
    and the same object is returned.
    """
    release_years = df_album_track["release_date"].apply(extract_year)
    df_album_track["release_year"] = release_years
    return df_album_track

def get_year_seq_pop(df_album_track):
    """Average popularity per release-year bucket.

    Buckets are ``(lower, upper]`` intervals: everything up to 1960, then
    five-year buckets up to 2010, then one bucket per year from 2011 to 2024.
    Rows with a release year after 2024 fall into no bucket.

    Parameters
    ----------
    df_album_track : DataFrame with ``release_year`` and ``popularity`` columns.

    Returns
    -------
    tuple of (list of str, list of float)
        Labels ``"<first year>-<last year>"`` and the mean popularity of each
        bucket (NaN for an empty bucket), in the same order.
    """
    # -1 is a sentinel lower edge so the first bucket is labelled "0-1960".
    edges = [-1, *range(1960, 2011, 5), *range(2011, 2025, 1)]

    release_years = df_album_track['release_year']
    year_sequences = []
    years_popularity = []

    for lower, upper in zip(edges, edges[1:]):
        in_bucket = (release_years > lower) & (release_years <= upper)
        years_popularity.append(df_album_track.loc[in_bucket, 'popularity'].mean())
        # The lower edge is exclusive, so the first year in the bucket is lower + 1.
        year_sequences.append(f"{int(lower+1)}-{int(upper)}")

    return year_sequences, years_popularity

def get_year_popularity(df_album_track):
    """Tabulate the output of ``get_year_seq_pop`` as a DataFrame.

    Returns a frame indexed by the release-year bucket labels (index name
    "Release year") with a single column "Average popularity".
    """
    bucket_labels, bucket_means = get_year_seq_pop(df_album_track)

    return pd.DataFrame(
        {"Average popularity": bucket_means},
        index=pd.Index(bucket_labels, name="Release year"),
    )

def get_barplot(df_year):
    """Show a bar chart with one bar per row of ``df_year``.

    The y-axis is clipped to 59-68 and the x tick labels are rotated
    45 degrees.
    """
    fig, ax = plt.subplots(figsize=(11,4))
    df_year.plot.bar(ax=ax)
    ax.set_ylim(59, 68)
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
    plt.show()

In [1]:
def get_freq_artists_pop(df_tracks):
    """Return each track's popularity together with its number of artists.

    ``artists_id`` is a string of the form ``['id1', 'id2', ...]`` where every
    id is 22 characters long. Each artist therefore contributes 26 characters
    (id, two quotes, and either the ", " separator or the two brackets), so
    the string length divided by 26 is the number of artists.

    The input frame is left untouched; a new two-column frame
    (``popularity``, ``num_of_artists``) is returned.
    """
    artist_counts = df_tracks['artists_id'].map(len) / 26
    return df_tracks[["popularity"]].assign(num_of_artists=artist_counts)

def get_freq_pop(df_song_artists):
    """Mean popularity and track count per number of artists.

    Tracks with fewer than 5 artists are grouped by their exact
    ``num_of_artists``. All tracks with 5 or more artists are pooled into one
    extra row labelled "5.0-10.0" (no upper bound is actually applied).

    Returns a frame with the columns ``popularity_mean`` and
    ``popularity_freq``.
    """
    has_few_artists = df_song_artists["num_of_artists"] < 5
    has_many_artists = df_song_artists["num_of_artists"] >= 5

    grouped_few = df_song_artists[has_few_artists].groupby("num_of_artists")
    df_freq_pop = grouped_few.mean().merge(
        grouped_few.count(),
        on = "num_of_artists",
        suffixes = ["_mean", "_freq"],
    )

    # Append the pooled row for the rarely occurring large collaborations.
    pooled_popularity = df_song_artists.loc[has_many_artists, "popularity"]
    df_freq_pop.loc["5.0-10.0"] = [pooled_popularity.mean(), pooled_popularity.count()]
    return df_freq_pop

def get_px_scatter_reg(df_freq_pop):
    """Render the artists-vs-popularity scatter with an OLS trendline to a PNG.

    A column ``x_values_plot`` with the values 1, 2, 3, 4 and 7.5 is added to
    ``df_freq_pop`` in place, so the frame must have exactly five rows. The
    image is written to "px_scatter_reg_pic.png" and that file name is
    returned.
    """
    output_path = "px_scatter_reg_pic.png"

    # Numeric x positions for the five rows; 7.5 stands in for the pooled last row.
    df_freq_pop["x_values_plot"] = [1,2,3,4,7.5]

    scatter = px.scatter(
        df_freq_pop,
        x = "x_values_plot",
        y = "popularity_mean",
        size = "popularity_freq",
        text = df_freq_pop.index,
        trendline = "ols",
    )
    scatter.update_traces(textposition="middle left")
    scatter.write_image(output_path)
    return output_path