#### Feature Engineering  
Dataset: _music_clean_   
Author: Luis Sergio Pastrana Lemus  
Date: 2025-04-23

## __1. Libraries__

In [1]:
from IPython.display import display, HTML
import os
import pandas as pd
from pathlib import Path
import sys

# Resolve the project root dynamically: take the current working directory
# (expected to be the notebook's folder) and move one level up
project_root = Path.cwd().parent

# Register the project root on sys.path (only once) so the local `src` package is importable
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

# Wildcard import of the project helpers; the cells below rely on it for
# load_dataset_from_csv, format_notebook and normalize_datetime
from src import *

## __2. Path to Data file__

In [2]:
# Directory that holds the processed data files
data_file_path = project_root / "data" / "processed"

# Load the cleaned music dataset through the project helper.
# NOTE(review): keep_default_na=False is presumably forwarded to pandas so the default
# NA strings are not converted to NaN — confirm in src
df_music = load_dataset_from_csv(
    data_file_path,
    "music_clean.csv",
    sep=',',
    header='infer',
    keep_default_na=False,
)


In [3]:
# Format notebook output
# NOTE(review): format_notebook comes from the wildcard `src` import; presumably it
# adjusts the notebook's display settings — confirm in src
format_notebook()

## __3. Casting to data types__

### 3.1 Casting to category data type

In [4]:
# Cast genre, city and day to the pandas category dtype in a single call
df_music = df_music.astype({
    'genre': 'category',
    'city': 'category',
    'day': 'category',
})

In [5]:
# Confirm the dtypes produced by the category cast (this step casts, it does not impute)
display(HTML("> Data types after casting:\n"))
print(df_music.loc[:, ['genre', 'city', 'day']].dtypes)

genre    category
city     category
day      category
dtype: object


### 3.2 Casting to datetime data type

In [6]:
# Normalize the 'time' column with the project helper, using the HH:MM:SS format
df_music = normalize_datetime(
    df_music,
    include=["time"],
    frmt="%H:%M:%S",
)

# Sanity check: inspect the Python type of the first 'time' value
type(df_music['time'].iloc[0])

datetime.time

In [7]:
# Derive the hour of day from each time value; missing times are kept as None
df_music['hour'] = df_music['time'].apply(lambda t: None if pd.isnull(t) else t.hour)
df_music

Unnamed: 0,userid,track,artist,genre,city,time,day,hour
0,FFB692EC,kamigata_to_boots,the_mass_missile,rock,shelbyville,20:28:33,wednesday,20
1,55204538,delayed_because_of_accident,andreas_rönnberg,rock,springfield,14:07:09,friday,14
2,20EC38,funiculì_funiculà,mario_lanza,pop,shelbyville,20:58:07,wednesday,20
3,A3DD03C9,dragons_in_the_sunset,fire_ice,folk,shelbyville,08:37:09,monday,8
4,E2DC1FAE,soul_people,space_echo,dance,springfield,08:34:34,monday,8
...,...,...,...,...,...,...,...,...
59986,729CBB09,my_name,mclean,rnb,springfield,13:32:28,wednesday,13
59987,D08D4A55,maybe_one_day_feat_black_spade_,blu_exile,hiphop,shelbyville,10:00:00,monday,10
59988,C5E3A0D5,jalopiina,unknown,industrial,springfield,20:09:26,friday,20
59989,321D0506,freight_train,chas_mcdevitt,rock,springfield,21:43:59,friday,21


In [8]:
# Treat the hour of day as a categorical feature
df_music['hour'] = df_music['hour'].astype('category')

# Confirm the dtype produced by the cast (this step casts, it does not impute)
display(HTML("> Data types after casting:\n"))
print(df_music.loc[:, ['hour']].dtypes)

hour    category
dtype: object


## 3. Feature Engineering

### 3.1 User activity variation by day and city

#### 3.1.1 User activity variation by city

In [9]:
# Preview the dataset before aggregating (the notebook renders a truncated view of the frame)
df_music

Unnamed: 0,userid,track,artist,genre,city,time,day,hour
0,FFB692EC,kamigata_to_boots,the_mass_missile,rock,shelbyville,20:28:33,wednesday,20
1,55204538,delayed_because_of_accident,andreas_rönnberg,rock,springfield,14:07:09,friday,14
2,20EC38,funiculì_funiculà,mario_lanza,pop,shelbyville,20:58:07,wednesday,20
3,A3DD03C9,dragons_in_the_sunset,fire_ice,folk,shelbyville,08:37:09,monday,8
4,E2DC1FAE,soul_people,space_echo,dance,springfield,08:34:34,monday,8
...,...,...,...,...,...,...,...,...
59986,729CBB09,my_name,mclean,rnb,springfield,13:32:28,wednesday,13
59987,D08D4A55,maybe_one_day_feat_black_spade_,blu_exile,hiphop,shelbyville,10:00:00,monday,10
59988,C5E3A0D5,jalopiina,unknown,industrial,springfield,20:09:26,friday,20
59989,321D0506,freight_train,chas_mcdevitt,rock,springfield,21:43:59,friday,21


In [10]:
# Aggregate activity per city: track rows (count), distinct tracks and distinct users
df_music_city = df_music.pivot_table(
    index='city',
    values=['userid', 'track'],
    aggfunc={
        'userid': pd.Series.nunique,
        'track': [pd.Series.nunique, 'count'],
    },
    observed=False,
)
df_music_city


Unnamed: 0_level_0,track,track,userid
Unnamed: 0_level_1,count,nunique,nunique
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
shelbyville,18118,14406,12293
springfield,41873,29208,29069


In [11]:
# Flatten the two-level pivot header into descriptive names (assigned by position).
# The original chained assignment set `.columns` twice with the same list; once is enough.
df_music_city.columns = [
    'total_tracks',  # ('track', 'count')
    'tracks',        # ('track', 'nunique')
    'users',         # ('userid', 'nunique')
]
df_music_city


Unnamed: 0_level_0,total_tracks,tracks,users
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
shelbyville,18118,14406,12293
springfield,41873,29208,29069


In [12]:
# Persist the per-city aggregation. `project_root` is already defined in the setup
# cell, so it is not recomputed here.
# The index is written on purpose: it carries the city labels.
processed_path = project_root / "data" / "processed" / "music_activity_city.csv"
df_music_city.to_csv(processed_path)


#### 3.1.2 User activity variation by day

In [13]:
# Aggregate activity per day: track rows (count), distinct tracks and distinct users
df_music_day = df_music.pivot_table(
    index='day',
    values=['userid', 'track'],
    aggfunc={
        'userid': pd.Series.nunique,
        'track': [pd.Series.nunique, 'count'],
    },
    observed=False,
)
df_music_day

Unnamed: 0_level_0,track,track,userid
Unnamed: 0_level_1,count,nunique,nunique
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
friday,21475,17141,16473
monday,20799,16570,15925
wednesday,17717,14370,13490


In [14]:
# Flatten the two-level pivot header into descriptive names (assigned by position)
df_music_day.columns = [
    'total_tracks',  # ('track', 'count')
    'tracks',        # ('track', 'nunique')
    'users',         # ('userid', 'nunique')
]
df_music_day

Unnamed: 0_level_0,total_tracks,tracks,users
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
friday,21475,17141,16473
monday,20799,16570,15925
wednesday,17717,14370,13490


In [16]:
# Persist the per-day aggregation. `project_root` is already defined in the setup
# cell, so it is not recomputed here.
# The index is written on purpose: it carries the day labels.
processed_path = project_root / "data" / "processed" / "music_activity_day.csv"
df_music_day.to_csv(processed_path)

#### 3.1.3 User activity variation by city and day

In [17]:
# Aggregate activity per (city, day): track rows (count), distinct tracks and distinct users
df_music_city_day = df_music.pivot_table(
    index=['city', 'day'],
    values=['userid', 'track'],
    aggfunc={
        'userid': pd.Series.nunique,
        'track': [pd.Series.nunique, 'count'],
    },
    observed=False,
)
df_music_city_day

Unnamed: 0_level_0,Unnamed: 1_level_0,track,track,userid
Unnamed: 0_level_1,Unnamed: 1_level_1,count,nunique,nunique
city,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
shelbyville,friday,5786,5208,4447
shelbyville,monday,5462,4896,4186
shelbyville,wednesday,6870,6055,5177
springfield,friday,15689,12908,12030
springfield,monday,15337,12593,11747
springfield,wednesday,10847,9236,8314


In [18]:
# Turn the (city, day) index levels into regular columns
df_music_city_day = df_music_city_day.reset_index()
df_music_city_day

Unnamed: 0_level_0,city,day,track,track,userid
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,nunique,nunique
0,shelbyville,friday,5786,5208,4447
1,shelbyville,monday,5462,4896,4186
2,shelbyville,wednesday,6870,6055,5177
3,springfield,friday,15689,12908,12030
4,springfield,monday,15337,12593,11747
5,springfield,wednesday,10847,9236,8314


In [19]:
# Replace the two-level header with flat, descriptive names (assigned by position)
df_music_city_day.columns = [
    'city',
    'day',
    'total_tracks',  # ('track', 'count')
    'tracks',        # ('track', 'nunique')
    'users',         # ('userid', 'nunique')
]
df_music_city_day

Unnamed: 0,city,day,total_tracks,tracks,users
0,shelbyville,friday,5786,5208,4447
1,shelbyville,monday,5462,4896,4186
2,shelbyville,wednesday,6870,6055,5177
3,springfield,friday,15689,12908,12030
4,springfield,monday,15337,12593,11747
5,springfield,wednesday,10847,9236,8314


In [21]:
# Persist the per-(city, day) aggregation. `project_root` is already defined in the
# setup cell, so it is not recomputed here.
# index=False: city and day are regular columns at this point, the index is only a row counter.
processed_path = project_root / "data" / "processed" / "music_activity_city_day.csv"

df_music_city_day.to_csv(processed_path, index=False)

### 3.2 User activity variation by time, day and city

#### 3.2.1 User activity variation by time and city

In [22]:
# Aggregate activity per (city, hour): track rows (count), distinct tracks and distinct users
df_music_time_city = df_music.pivot_table(
    index=['city', 'hour'],
    values=['userid', 'track'],
    aggfunc={
        'userid': pd.Series.nunique,
        'track': [pd.Series.nunique, 'count'],
    },
    observed=False,
)
df_music_time_city

Unnamed: 0_level_0,Unnamed: 1_level_0,track,track,userid
Unnamed: 0_level_1,Unnamed: 1_level_1,count,nunique,nunique
city,hour,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
shelbyville,8,2553,2335,2030
shelbyville,9,2661,2435,2111
shelbyville,10,41,41,41
shelbyville,13,3120,2912,2522
shelbyville,14,3341,3066,2706
shelbyville,15,48,48,47
shelbyville,20,3223,2996,2576
shelbyville,21,3074,2848,2402
shelbyville,22,57,57,56
springfield,8,6690,5718,5264


In [23]:
# Turn the (city, hour) index levels into regular columns
df_music_time_city = df_music_time_city.reset_index()
df_music_time_city

Unnamed: 0_level_0,city,hour,track,track,userid
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,nunique,nunique
0,shelbyville,8,2553,2335,2030
1,shelbyville,9,2661,2435,2111
2,shelbyville,10,41,41,41
3,shelbyville,13,3120,2912,2522
4,shelbyville,14,3341,3066,2706
5,shelbyville,15,48,48,47
6,shelbyville,20,3223,2996,2576
7,shelbyville,21,3074,2848,2402
8,shelbyville,22,57,57,56
9,springfield,8,6690,5718,5264


In [24]:
# Replace the two-level header with flat, descriptive names (assigned by position)
df_music_time_city.columns = [
    'city',
    'hour',
    'total_tracks',  # ('track', 'count')
    'tracks',        # ('track', 'nunique')
    'users',         # ('userid', 'nunique')
]
df_music_time_city

Unnamed: 0,city,hour,total_tracks,tracks,users
0,shelbyville,8,2553,2335,2030
1,shelbyville,9,2661,2435,2111
2,shelbyville,10,41,41,41
3,shelbyville,13,3120,2912,2522
4,shelbyville,14,3341,3066,2706
5,shelbyville,15,48,48,47
6,shelbyville,20,3223,2996,2576
7,shelbyville,21,3074,2848,2402
8,shelbyville,22,57,57,56
9,springfield,8,6690,5718,5264


In [25]:
# Persist the per-(city, hour) aggregation. `project_root` is already defined in the
# setup cell, so it is not recomputed here.
# index=False: city and hour are regular columns at this point, the index is only a row counter.
processed_path = project_root / "data" / "processed" / "music_activity_time_city.csv"

df_music_time_city.to_csv(processed_path, index=False)

#### 3.2.2 User activity variation by time and day

In [26]:
# Aggregate activity per (day, hour): track rows (count), distinct tracks and distinct users
df_music_time_day = df_music.pivot_table(
    index=['day', 'hour'],
    values=['userid', 'track'],
    aggfunc={
        'userid': pd.Series.nunique,
        'track': [pd.Series.nunique, 'count'],
    },
    observed=False,
)
df_music_time_day

Unnamed: 0_level_0,Unnamed: 1_level_0,track,track,userid
Unnamed: 0_level_1,Unnamed: 1_level_1,count,nunique,nunique
day,hour,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
friday,8,3508,3170,2886
friday,9,3523,3230,2947
friday,10,70,70,70
friday,13,3456,3212,2943
friday,14,3639,3333,3077
...,...,...,...,...
wednesday,14,3291,3045,2789
wednesday,15,49,49,49
wednesday,20,3218,2989,2649
wednesday,21,2942,2711,2346


In [27]:
# Turn the (day, hour) index levels into regular columns
df_music_time_day = df_music_time_day.reset_index()
df_music_time_day

Unnamed: 0_level_0,day,hour,track,track,userid
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,nunique,nunique
0,friday,8,3508,3170,2886
1,friday,9,3523,3230,2947
2,friday,10,70,70,70
3,friday,13,3456,3212,2943
4,friday,14,3639,3333,3077
...,...,...,...,...,...
22,wednesday,14,3291,3045,2789
23,wednesday,15,49,49,49
24,wednesday,20,3218,2989,2649
25,wednesday,21,2942,2711,2346


In [28]:
# Replace the two-level header with flat, descriptive names (assigned by position)
df_music_time_day.columns = [
    'day',
    'hour',
    'total_tracks',  # ('track', 'count')
    'tracks',        # ('track', 'nunique')
    'users',         # ('userid', 'nunique')
]
df_music_time_day

Unnamed: 0,day,hour,total_tracks,tracks,users
0,friday,8,3508,3170,2886
1,friday,9,3523,3230,2947
2,friday,10,70,70,70
3,friday,13,3456,3212,2943
4,friday,14,3639,3333,3077
...,...,...,...,...,...
22,wednesday,14,3291,3045,2789
23,wednesday,15,49,49,49
24,wednesday,20,3218,2989,2649
25,wednesday,21,2942,2711,2346


In [29]:
# Persist the per-(day, hour) aggregation. `project_root` is already defined in the
# setup cell, so it is not recomputed here.
# index=False: day and hour are regular columns at this point, the index is only a row counter.
processed_path = project_root / "data" / "processed" / "music_activity_time_day.csv"

df_music_time_day.to_csv(processed_path, index=False)

#### 3.2.3 User activity variation by time, city and day

In [30]:
# Aggregate activity per (city, day, hour): track rows (count), distinct tracks and distinct users
df_music_time_city_day = df_music.pivot_table(
    index=['city', 'day', 'hour'],
    values=['userid', 'track'],
    aggfunc={
        'userid': pd.Series.nunique,
        'track': [pd.Series.nunique, 'count'],
    },
    observed=False,
)
df_music_time_city_day

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,track,track,userid
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,nunique,nunique
city,day,hour,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
shelbyville,friday,8,832,809,689
shelbyville,friday,9,848,823,730
shelbyville,friday,10,18,18,18
shelbyville,friday,13,1020,987,853
shelbyville,friday,14,1033,990,871
...,...,...,...,...,...
springfield,wednesday,14,2019,1903,1715
springfield,wednesday,15,38,38,38
springfield,wednesday,20,1987,1885,1642
springfield,wednesday,21,1794,1676,1426


In [31]:
# Turn the (city, day, hour) index levels into regular columns
df_music_time_city_day = df_music_time_city_day.reset_index()
df_music_time_city_day

Unnamed: 0_level_0,city,day,hour,track,track,userid
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,nunique,nunique
0,shelbyville,friday,8,832,809,689
1,shelbyville,friday,9,848,823,730
2,shelbyville,friday,10,18,18,18
3,shelbyville,friday,13,1020,987,853
4,shelbyville,friday,14,1033,990,871
...,...,...,...,...,...,...
49,springfield,wednesday,14,2019,1903,1715
50,springfield,wednesday,15,38,38,38
51,springfield,wednesday,20,1987,1885,1642
52,springfield,wednesday,21,1794,1676,1426


In [32]:
# Replace the two-level header with flat, descriptive names (assigned by position)
df_music_time_city_day.columns = [
    'city',
    'day',
    'hour',
    'total_tracks',  # ('track', 'count')
    'tracks',        # ('track', 'nunique')
    'users',         # ('userid', 'nunique')
]
df_music_time_city_day

Unnamed: 0,city,day,hour,total_tracks,tracks,users
0,shelbyville,friday,8,832,809,689
1,shelbyville,friday,9,848,823,730
2,shelbyville,friday,10,18,18,18
3,shelbyville,friday,13,1020,987,853
4,shelbyville,friday,14,1033,990,871
...,...,...,...,...,...,...
49,springfield,wednesday,14,2019,1903,1715
50,springfield,wednesday,15,38,38,38
51,springfield,wednesday,20,1987,1885,1642
52,springfield,wednesday,21,1794,1676,1426


In [33]:
# Persist the per-(city, day, hour) aggregation. `project_root` is already defined in
# the setup cell, so it is not recomputed here.
# index=False: city, day and hour are regular columns at this point, the index is only a row counter.
processed_path = project_root / "data" / "processed" / "music_activity_time_city_day.csv"

df_music_time_city_day.to_csv(processed_path, index=False)