In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
dataset = pd.read_csv('Emotify/data.csv')
dataset.head()

Unnamed: 0,track id,genre,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful_activation,tension,sadness,mood,liked,disliked,age,gender,mother tongue
0,1,classical,0,1,0,0,0,0,1,1,0,3,1,0,21,1,English
1,1,classical,0,0,0,1,0,0,0,0,0,3,0,1,41,1,Dutch
2,1,classical,0,0,0,1,0,0,0,0,1,3,0,0,24,1,English
3,1,classical,0,0,0,0,1,0,0,0,0,3,0,0,32,0,Spanish
4,1,classical,0,0,0,1,1,0,0,0,0,4,0,1,21,0,English


In [9]:
dataset.columns

Index(['track id', ' genre', ' amazement', ' solemnity', ' tenderness',
       ' nostalgia', ' calmness', ' power', ' joyful_activation', ' tension',
       ' sadness', ' mood', ' liked', ' disliked', ' age', ' gender',
       ' mother tongue'],
      dtype='object')

In [10]:
dataset.columns = dataset.columns.str.strip()

In [11]:
gems_annotations = ['amazement', 'solemnity', 'tenderness', 
                    'nostalgia', 'calmness', 'power', 
                    'joyful_activation', 'tension', 'sadness']
print(dataset['genre'].unique())
print(dataset[gems_annotations])

['classical' 'rock' 'electronic' 'pop']
      amazement  solemnity  tenderness  nostalgia  calmness  power  \
0             0          1           0          0         0      0   
1             0          0           0          1         0      0   
2             0          0           0          1         0      0   
3             0          0           0          0         1      0   
4             0          0           0          1         1      0   
...         ...        ...         ...        ...       ...    ...   
8402          1          1           0          0         0      0   
8403          0          0           0          1         0      0   
8404          0          0           0          0         0      0   
8405          1          0           0          0         0      0   
8406          1          0           0          0         0      0   

      joyful_activation  tension  sadness  
0                     1        1        0  
1                     0        

In [13]:
dataset[gems_annotations] = dataset[gems_annotations].astype(float)

dataset[gems_annotations].dtypes

amazement            float64
solemnity            float64
tenderness           float64
nostalgia            float64
calmness             float64
power                float64
joyful_activation    float64
tension              float64
sadness              float64
dtype: object

In [14]:
# Group by track ID and genre, summing up the GEMS values
data = dataset.groupby(["track id", "genre"])[gems_annotations].sum().reset_index()

# Compute the count per group
count = dataset.groupby(["track id", "genre"]).size().reset_index(name="count")

# Merge count into the summed data
data = data.merge(count, on=["track id", "genre"], how="left")

data.to_csv("Emotify/summed_emotify.csv", index=False)  # Saves without the index column

In [15]:
def softmax(x):
    exp_x = np.exp(x - np.max(x))  # Subtract max for numerical stability
    return exp_x / np.sum(exp_x)
data.head()

Unnamed: 0,track id,genre,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful_activation,tension,sadness,count
0,1,classical,7.0,16.0,10.0,14.0,30.0,1.0,4.0,3.0,15.0,48
1,2,classical,5.0,8.0,20.0,16.0,35.0,2.0,1.0,1.0,5.0,47
2,3,classical,8.0,16.0,6.0,8.0,3.0,13.0,19.0,11.0,7.0,46
3,4,classical,5.0,6.0,22.0,19.0,32.0,1.0,0.0,1.0,6.0,42
4,5,classical,2.0,13.0,14.0,23.0,14.0,2.0,3.0,8.0,14.0,44


In [16]:
for index, row in data.iterrows():
    print(f"Row {index}: {row['track id']} - {row['genre']} - {row[gems_annotations].values}")
    # Using np.float64 ensures higher numerical stability in the softmax function, especially when dealing with exponentiation
    data_row = np.array(row[gems_annotations].values, dtype=np.float64) / row["count"]
    softmax_values = softmax(data_row)
    print("Data Divided", data_row)
    threshold = int(np.ceil((np.std(data_row) * 10)))
    print("Standard Deviation", threshold)
    # print("Data Softmax", softmax_values)
    
    # Find the threshold value (the Nth largest value)
    top_nth_value = np.partition(data_row, -threshold)[-threshold]  # Gets the Nth largest value

    # Create a binary mask: 1 for values ≥ top_nth_value, 0 otherwise
    binary_arr = (data_row >= top_nth_value).astype(int)

    print(binary_arr)  # Example output: [0 0 0 1 1 1]
    
    data.loc[index, gems_annotations] = binary_arr
    
    print("-" * 100)
    # for col, value in row[gems_annotations].items():
    #     print(data.at[index, col])

Row 0: 1 - classical - [7.0 16.0 10.0 14.0 30.0 1.0 4.0 3.0 15.0]
Data Divided [0.14583333 0.33333333 0.20833333 0.29166667 0.625      0.02083333
 0.08333333 0.0625     0.3125    ]
Standard Deviation 2
[0 1 0 0 1 0 0 0 0]
----------------------------------------------------------------------------------------------------
Row 1: 2 - classical - [5.0 8.0 20.0 16.0 35.0 2.0 1.0 1.0 5.0]
Data Divided [0.10638298 0.17021277 0.42553191 0.34042553 0.74468085 0.04255319
 0.0212766  0.0212766  0.10638298]
Standard Deviation 3
[0 0 1 1 1 0 0 0 0]
----------------------------------------------------------------------------------------------------
Row 2: 3 - classical - [8.0 16.0 6.0 8.0 3.0 13.0 19.0 11.0 7.0]
Data Divided [0.17391304 0.34782609 0.13043478 0.17391304 0.06521739 0.2826087
 0.41304348 0.23913043 0.15217391]
Standard Deviation 2
[0 1 0 0 0 0 1 0 0]
----------------------------------------------------------------------------------------------------
Row 3: 4 - classical - [5.0 6.0 22.

In [17]:
data.to_csv("Emotify/multi_hot_encoded_emotify.csv", index=False)  # Saves without the index column

In [18]:
for index, row in data.iterrows():
    # print(f"Row {index}: {row['track id']} - {row['genre']} - {row[gems_annotations].values}")
    # Using np.float64 ensures higher numerical stability in the softmax function, especially when dealing with exponentiation
    data_row = np.array(row[gems_annotations].values, dtype=np.float64) / row["count"]
    # softmax_values = softmax(data_row)
    print("Data Divided", data_row)
    
    data.loc[index, gems_annotations] = data_row
    
    print("-" * 100)
    # for col, value in row[gems_annotations].items():
    #     print(data.at[index, col])

Data Divided [0.         0.02083333 0.         0.         0.02083333 0.
 0.         0.         0.        ]
----------------------------------------------------------------------------------------------------
Data Divided [0.        0.        0.0212766 0.0212766 0.0212766 0.        0.
 0.        0.       ]
----------------------------------------------------------------------------------------------------
Data Divided [0.         0.02173913 0.         0.         0.         0.
 0.02173913 0.         0.        ]
----------------------------------------------------------------------------------------------------
Data Divided [0.         0.         0.02380952 0.02380952 0.02380952 0.
 0.         0.         0.        ]
----------------------------------------------------------------------------------------------------
Data Divided [0.         0.         0.02272727 0.02272727 0.02272727 0.
 0.         0.         0.02272727]
---------------------------------------------------------------------

In [19]:
data.to_csv("Emotify/divided_emotify.csv", index=False)  # Saves without the index column