In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from scipy import stats
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import json
import ast

In [None]:
# Load the cleaned Last.fm user table.
# `index` is a DataFrame.to_csv argument, not a read_csv one; passing it to
# read_csv raises TypeError, so it is dropped here.
# NOTE(review): if the CSV was written together with its index, pass
# index_col=0 instead so the index is not loaded as an extra column — confirm.
df = pd.read_csv('lastfm_user_clean.csv')

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" to the output.
df.info()
print(df.nunique())

## EDA

In [None]:
# Summary statistics for the numerical user features
numerical_cols = ['playcount', 'artist_count', 'track_count', 'album_count', 'registered_year']

# Start from describe(): keep only the rows of interest, one row per feature,
# and relabel the 50th percentile as the median.
descriptive_stats = (
    df[numerical_cols]
    .describe()
    .loc[['min', 'max', 'mean', 'std', '50%']]
    .T
    .rename(columns={'50%': 'median'})
)

# First mode of every feature (row 0 of the mode table).
descriptive_stats['mode'] = df[numerical_cols].mode().T[0]

# Robust spread and shape measures, computed column by column with scipy.
descriptive_stats['mad'] = [stats.median_abs_deviation(df[col]) for col in numerical_cols]
# fisher=False -> Pearson's definition of kurtosis (a normal distribution gives 3.0)
descriptive_stats['kurtosis'] = [stats.kurtosis(df[col], fisher=False) for col in numerical_cols]
descriptive_stats['skewness'] = [stats.skew(df[col]) for col in numerical_cols]

descriptive_stats

As we can observe from the descriptive statistics, the users' music playing counts are strongly positively skewed. This positive skewness suggests that most users have relatively low playcounts, while a few users have extremely high playcounts, resulting in a right-skewed distribution.

Out of these, `playcount` and `album_count` have a mode of 0. This could mean that a significant number of users have not played any songs or albums, respectively — for example, inactive accounts or new users who have not started using the platform yet. However, it could also be due to users who listen to music but do not scrobble their plays to Last.fm, since these users still have non-empty `Top_50` song lists.

The users' registration years span a wide range, from 2002 to 2024. Next, we plot histograms to visualize the distributions of these numerical columns.

In [None]:
# One histogram per numerical column of df, to inspect how the values are distributed
ax = df.hist(
    bins=100,
    figsize=(12, 10),
)

From the above analysis and visualization, we find `track_count` to be the most representative of the four music playing counts for measuring users' activeness on the platform. Because this value contains extreme outliers, we have decided to transform it into a categorical variable, at the risk of losing some information.

In [None]:
# Transform track_count into an ordered categorical activeness level.
# pd.cut bins are right-closed by default, so without include_lowest=True the
# first bin is (0, 4600] and users with track_count == 0 would become NaN
# instead of "most inactive". The open-ended upper edge likewise keeps any
# user above the former 450000 cap in "most active" rather than dropping them.
categories = pd.cut(df.track_count,
                    bins = [0, 4600, 9500, 16000, 30000, np.inf],
                    labels = ["most inactive", "inactive", "medium", "active", "most active"],
                    include_lowest = True)
# Share of users in each activeness level
categories.value_counts(normalize = True)

In [None]:
# Attach the activeness level to every user and plot how many users fall into each level
df['active_category'] = categories

plt.figure(figsize=(8, 8))
count_ax = sns.countplot(data=df, x='active_category')
count_ax.set_xlabel('Number of Users' if False else 'Activeness Category')
count_ax.set_ylabel('Number of Users')
count_ax.set_title(
    'Distribution of Users for Different Activeness Level',
    fontsize=12,
    fontweight='bold',
)
plt.show()

In [None]:
# Number of users for each distinct value of the `type` column
df['type'].value_counts()

In [None]:
# Observe where users come from.
# A missing country can show up either as the literal string 'None' or as NaN
# (pandas >= 2.0 treats 'None' in a CSV as a missing value by default).
# Relabel both, otherwise the NaN rows are silently dropped by groupby below
# and the "Not Indicated" bar disappears from the chart.
nocountry = df['country'].isna() | (df['country'] == 'None')
df.loc[nocountry, 'country'] = 'Not Indicated'

# Count users per country and keep the 15 largest groups.
countries = df.groupby(['country']).agg({"Username":'count'}).reset_index()
countries = countries.sort_values(by = 'Username', ascending = False).head(15)

# Horizontal bar chart: one bar per country, bar length = number of users.
sns.barplot(y = countries.country, x = countries.Username).set(title = 'Top 15 Countries Where Users Come From')
plt.xlabel('Number of Users')
plt.show()


We can see that except for users who did not indicate their countries, most users come from the US, Brazil, UK, Canada and Australia.

In [None]:
plt.figure(figsize=(8, 8))

# Cast registration year to a categorical so each year is handled as a discrete
# group on the y-axis rather than as a continuous number.
# NOTE(review): this changes the dtype of df['registered_year'] for every cell
# executed afterwards (e.g. re-running the descriptive statistics cell).
df['registered_year'] = df['registered_year'].astype('category')

# Distribution of track_count per registration year.
# (The former bare `type(df['registered_year'][0])` line was a leftover debug
# expression: it displayed nothing and could raise KeyError when label 0 is
# absent from the index, so it has been removed.)
sns.boxplot(y = 'registered_year', x = 'track_count', data = df)