# Import Required Libraries
Import the necessary libraries, including pandas.

In [3]:
import pandas as pd
import numpy as np
import os

# Load the Dataset
Load the Spotify streaming history dataset into a pandas DataFrame.

In [5]:
# Path of datasets
path = 'datasets/streaming_history/'

# List of datasets
datasets = []

# Load the dataset into a pandas DataFrame
spotify_data = pd.read_json(path)
spotify_data['ts'] = pd.to_datetime(spotify_data['ts'])

# Display the first few rows of the DataFrame to verify it was loaded correctly
print(spotify_data['ts'].dt.year.value_counts())



PermissionError: [Errno 13] Permission denied: 'datasets/streaming_history/'

# Inspect the Dataset
Inspect the dataset using methods like info() and describe().

In [3]:
# Use the info() method to get a concise summary of the DataFrame
spotify_data.info()

# Use the describe() method to generate descriptive statistics of the DataFrame
spotify_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16097 entries, 0 to 16096
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype              
---  ------                             --------------  -----              
 0   ts                                 16097 non-null  datetime64[ns, UTC]
 1   username                           16097 non-null  object             
 2   platform                           16097 non-null  object             
 3   ms_played                          16097 non-null  int64              
 4   conn_country                       16097 non-null  object             
 5   ip_addr_decrypted                  8255 non-null   object             
 6   user_agent_decrypted               8255 non-null   object             
 7   master_metadata_track_name         15969 non-null  object             
 8   master_metadata_album_artist_name  15969 non-null  object             
 9   master_metadata_album_album_name   15969 non-null 

Unnamed: 0,ms_played,skipped,offline_timestamp
count,16097.0,10.0,16097.0
mean,116160.007703,0.0,1432568000000.0
std,106978.649011,0.0,358419000000.0
min,0.0,0.0,0.0
25%,4979.0,0.0,1508683000000.0
50%,102663.0,0.0,1529621000000.0
75%,214425.0,0.0,1534514000000.0
max,861083.0,0.0,1538428000000.0


# Handle Missing Data
Handle any missing data in the dataset, either by removing it or filling it in.

In [4]:
# Count the missing values 
sum_missing_values = spotify_data.isna().sum()
print(sum_missing_values)

ts                                       0
username                                 0
platform                                 0
ms_played                                0
conn_country                             0
ip_addr_decrypted                     7842
user_agent_decrypted                  7842
master_metadata_track_name             128
master_metadata_album_artist_name      128
master_metadata_album_album_name       128
spotify_track_uri                      128
episode_name                         16095
episode_show_name                    16095
spotify_episode_uri                  16095
reason_start                             0
reason_end                            7842
shuffle                                  0
skipped                              16087
offline                                  0
offline_timestamp                        0
incognito_mode                           0
dtype: int64


# Convert Data Types
Convert the data types of any columns that are not correctly typed.

In [5]:
spotify_data.head()

spotify_data = spotify_data.astype({'offline':'bool','shuffle':'bool'})

tempo_total_2017 = spotify_data['ms_played'].sum()

tempo_total_2017_horas = round(tempo_total_2017 / 3600000,2)

print(tempo_total_2017_horas)

519.4


# Normalize Text Data
Normalize the text data by converting it to lowercase and removing any special characters.

In [9]:
# Normalize Text Data

# Importing the re library for regular expressions
import re

# Function to normalize text data
def normalize_text(text):
    if text is None:
        return ''
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    return text

# Apply the normalize_text function to all text columns
for col in spotify_data.select_dtypes(include=[object]).columns:
    spotify_data[col] = spotify_data[col].apply(normalize_text)

# Verify the changes
spotify_data.head()


Unnamed: 0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,...,episode_name,episode_show_name,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
0,2017-06-17 07:45:29+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,170640,br,,,goya no machiawase,hello sleepwalkers,masked monkey awakening,...,,,,playbtn,,True,,False,1497688921675,False
1,2017-06-17 07:47:14+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,103057,br,,,夏の夜 mindless ver,the salovers,いざ サラバーズ,...,,,,trackdone,,True,,False,1497689091506,False
2,2017-06-17 07:50:35+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,202160,br,,,catchy,sakanamon,cue,...,,,,fwdbtn,,True,,False,1497689196284,False
3,2017-06-17 07:51:11+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,34252,br,,,忘却,unsuspected monogram,the mass,...,,,,trackdone,,True,,False,1497689397548,False
4,2017-06-17 07:53:01+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,19598,br,,,odoru rollschach album mix,owarikara,saihate songs,...,,,,fwdbtn,,True,,False,1497689433323,False


# Extract Date and Time Features
Extract useful features from any date and time columns, such as the day of the week or the hour of the day.

In [11]:
# Extract Date and Time Features

# Convert the 'ts' column to datetime format
spotify_data['ts'] = pd.to_datetime(spotify_data['ts'])

# Extract the day of the week from the 'ts' column
# The day of the week with Monday=0, Sunday=6
spotify_data['day_of_week'] = spotify_data['ts'].dt.dayofweek

# Extract the hour of the day from the 'ts' column
spotify_data['hour_of_day'] = spotify_data['ts'].dt.hour

# Verify the changes
spotify_data.head()

Unnamed: 0,ts,username,platform,ms_played,conn_country,ip_addr_decrypted,user_agent_decrypted,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,...,spotify_episode_uri,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode,day_of_week,hour_of_day
0,2017-06-17 07:45:29+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,170640,br,,,goya no machiawase,hello sleepwalkers,masked monkey awakening,...,,playbtn,,True,,False,1497688921675,False,5,7
1,2017-06-17 07:47:14+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,103057,br,,,夏の夜 mindless ver,the salovers,いざ サラバーズ,...,,trackdone,,True,,False,1497689091506,False,5,7
2,2017-06-17 07:50:35+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,202160,br,,,catchy,sakanamon,cue,...,,fwdbtn,,True,,False,1497689196284,False,5,7
3,2017-06-17 07:51:11+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,34252,br,,,忘却,unsuspected monogram,the mass,...,,trackdone,,True,,False,1497689397548,False,5,7
4,2017-06-17 07:53:01+00:00,ox2t5jgl3cz2w3gbmcne91ay7,android os 5 1 1 api 22 asus zb500kg,19598,br,,,odoru rollschach album mix,owarikara,saihate songs,...,,fwdbtn,,True,,False,1497689433323,False,5,7


# Save Cleaned Data
Save the cleaned data to a new CSV file for further analysis.

In [None]:
# Save Cleaned Data

```python
# Define the path to the new CSV file where the cleaned data will be saved
cleaned_file_path = 'cleaned_spotify_streaming_history.csv'

# Save the cleaned data to the new CSV file
# The index=False parameter prevents pandas from writing row indices into the CSV file
spotify_data.to_csv(cleaned_file_path, index=False)
```