# Spotify Data: Popular Hip-hop Artists and Tracks

1. Read the dataframe, check for null values and handle them if present, then check for duplicate rows and
remove them if present.

In [None]:
import pandas as pd

# Step 1: load the raw data, report missing values and duplicate rows,
# clean both, and print the counts again to confirm the result.
df = pd.read_csv('spotify.csv')

# Per-column count of missing values before any cleaning
null_values = df.isnull().sum()
print("Null Values:")
print(null_values)

# Impute only when at least one column actually has missing values
if null_values.any():
    # 'number' matches every numeric dtype (int32, float32, ...), not only
    # int64/float64, so no numeric column is silently left with NaNs.
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    # Everything non-numeric is treated as categorical; selecting by
    # exclusion avoids depending on how pandas stores text columns.
    categorical_cols = df.select_dtypes(exclude='number').columns
    df[categorical_cols] = df[categorical_cols].fillna('Unknown')

# Count rows that are exact copies of an earlier row
duplicate_rows = df.duplicated().sum()
print(f"Duplicate Rows: {duplicate_rows}")

# Keep the first occurrence of each duplicated row
if duplicate_rows > 0:
    df = df.drop_duplicates()

# Verify the changes
print("Updated Null Values:")
print(df.isnull().sum())
print(f"Updated Duplicate Rows: {df.duplicated().sum()}")


2. What is the distribution of popularity among the tracks in the dataset? Visualize it using a histogram.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: histogram (with a KDE overlay) of track popularity.
# Exact duplicate rows are dropped on load so a repeated record is not
# counted more than once in a bin.
df = pd.read_csv('spotify.csv').drop_duplicates()

plt.figure(figsize=(8, 6))
sns.histplot(df['Popularity'], bins=20, kde=True)
plt.title('Distribution of Track Popularity')
plt.xlabel('Popularity')
plt.ylabel('Count')
plt.show()


3. Is there any relationship between the popularity and the duration of tracks? Explore this using a scatter plot.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 3: scatter plot of duration against popularity, followed by the
# correlation coefficient between the two columns.
# Exact duplicate rows are dropped on load so a repeated record does not
# carry extra weight in the correlation.
df = pd.read_csv('spotify.csv').drop_duplicates()

# Milliseconds -> minutes (60,000 ms per minute) for a readable axis
df['Duration (minutes)'] = df['Duration (ms)'] / 60000

plt.figure(figsize=(8, 6))
sns.scatterplot(x='Duration (minutes)', y='Popularity', data=df)
plt.title('Relationship between Track Duration and Popularity')
plt.xlabel('Duration (minutes)')
plt.ylabel('Popularity')
plt.show()

# Series.corr defaults to the Pearson coefficient
correlation = df['Duration (minutes)'].corr(df['Popularity'])
print(f'Correlation coefficient: {correlation:.2f}')



4. Which artist has the highest number of tracks in the dataset? Display the count of tracks for each artist using
a countplot.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 4: find the artist with the most tracks and plot track counts for
# the ten most frequent artists.
# Exact duplicate rows are dropped on load; otherwise a repeated record
# would inflate that artist's track count.
df = pd.read_csv('spotify.csv').drop_duplicates()

# Count tracks per artist once and reuse the result for both the summary
# and the plot order. Naming the axis and the value column explicitly keeps
# the column names independent of the pandas version's reset_index defaults.
artist_track_count = (
    df['Artist']
    .value_counts()
    .rename_axis('Artist')
    .reset_index(name='Track Count')
)

# value_counts sorts by count in descending order, so the first row is the
# artist with the most tracks.
top_artist = artist_track_count.iloc[0]

print(f"The artist with the highest number of tracks is {top_artist['Artist']} with {top_artist['Track Count']} tracks.")

# Count plot restricted to (and ordered by) the ten most frequent artists
plt.figure(figsize=(10, 6))
sns.countplot(x='Artist', data=df, order=artist_track_count['Artist'].head(10))
plt.title('Count of Tracks for Top 10 Artists')
plt.xlabel('Artist')
plt.ylabel('Track Count')
plt.xticks(rotation=90)
plt.show()



5. What are the top 5 least popular tracks in the dataset? Provide the artist name and track name for each.

In [None]:
import pandas as pd

# Step 5: list the five least popular tracks with artist and track name.
# Exact duplicate rows are dropped on load; otherwise one repeated record
# could occupy several of the five positions.
df = pd.read_csv('spotify.csv').drop_duplicates()

# nsmallest returns the five rows with the lowest popularity, in ascending order
least_popular_tracks = df.nsmallest(5, 'Popularity')

print(least_popular_tracks[['Artist', 'Track Name', 'Popularity']])


6. Among the top 5 most popular artists, which artist has the highest popularity on average? Calculate and
display the average popularity for each artist.

In [None]:
import pandas as pd

# Step 6: average popularity per artist, the five highest averages, and the
# single artist with the highest average.
# Exact duplicate rows are dropped on load so a repeated record is not
# averaged in more than once.
df = pd.read_csv('spotify.csv').drop_duplicates()

# Mean popularity of each artist's tracks
avg_popularity_by_artist = df.groupby('Artist')['Popularity'].mean().reset_index()

# Five artists with the highest mean popularity, best first
top_artists = avg_popularity_by_artist.sort_values(by='Popularity', ascending=False).head(5)

print(top_artists)

# top_artists is sorted in descending order, so its first row already holds
# the highest average; no second search is needed.
top_artist = top_artists.iloc[0]

print(f"The artist with the highest average popularity is {top_artist['Artist']} with an average popularity of {top_artist['Popularity']:.2f}.")



7. For the top 5 most popular artists, what are their most popular tracks? List the track name for each artist.

In [None]:
import pandas as pd

# Step 7: for the five artists with the highest average popularity, print
# each artist's single most popular track.
# Exact duplicate rows are dropped on load so a repeated record does not
# distort the averages that decide which artists are in the top five.
df = pd.read_csv('spotify.csv').drop_duplicates()

# Mean popularity of each artist's tracks
avg_popularity_by_artist = df.groupby('Artist')['Popularity'].mean().reset_index()

# Five artists with the highest mean popularity, best first
top_artists = avg_popularity_by_artist.sort_values(by='Popularity', ascending=False).head(5)

print(top_artists)

# Iterate in ranking order so the output lines follow the table above
for artist in top_artists['Artist']:
    artist_tracks = df[df['Artist'] == artist]
    # idxmax returns the row label of the first maximum, so ties resolve to
    # the track that appears first in the data.
    most_popular_track = artist_tracks.loc[artist_tracks['Popularity'].idxmax()]
    print(f"Most popular track for {artist}: {most_popular_track['Track Name']} (Popularity: {most_popular_track['Popularity']})")



8. Visualize relationships between multiple numerical variables simultaneously using a pair plot.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Step 8: pair plot of the numerical columns.
# Exact duplicate rows are dropped on load so a repeated record is not
# plotted more than once.
df = pd.read_csv('spotify.csv').drop_duplicates()

# Numerical columns to compare against each other
numerical_cols = ['Popularity', 'Duration (ms)']
df_numerical = df[numerical_cols]

# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it: one would only produce an extra, empty
# figure. Use pairplot's height/aspect arguments to change the size.
sns.pairplot(df_numerical)
plt.show()


9. Does the duration of tracks vary significantly across different artists? Explore this visually using a box plot or
violin plot.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 9: compare track-duration distributions across the five artists with
# the most tracks, first as a box plot and then as a violin plot.
# Exact duplicate rows are dropped on load; otherwise repeated records would
# affect both which artists are selected and the shape of each distribution.
df = pd.read_csv('spotify.csv').drop_duplicates()

# Milliseconds -> minutes (60,000 ms per minute) for a readable axis
df['Duration (minutes)'] = df['Duration (ms)'] / 60000

# Five artists with the most tracks, most frequent first
top_artists = df['Artist'].value_counts().head(5).index

# Keep only the rows belonging to those artists
df_top_artists = df[df['Artist'].isin(top_artists)]

# Box plot; `order` fixes the x-axis to the track-count ranking so both
# plots list the artists identically.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Artist', y='Duration (minutes)', data=df_top_artists, order=top_artists)
plt.title('Track Duration Distribution Across Top Artists')
plt.xlabel('Artist')
plt.ylabel('Duration (minutes)')
plt.show()

# Violin plot of the same data as an alternative view
plt.figure(figsize=(10, 6))
sns.violinplot(x='Artist', y='Duration (minutes)', data=df_top_artists, order=top_artists)
plt.title('Track Duration Distribution Across Top Artists')
plt.xlabel('Artist')
plt.ylabel('Duration (minutes)')
plt.show()




10. How does the distribution of track popularity vary for different artists? Visualize this using a swarm plot or a
violin plot.



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 10: compare track-popularity distributions across the five artists
# with the most tracks, first as a swarm plot and then as a violin plot.
# Exact duplicate rows are dropped on load; otherwise repeated records would
# affect which artists are selected and add identical points to the swarm.
df = pd.read_csv('spotify.csv').drop_duplicates()

# Five artists with the most tracks, most frequent first
top_artists = df['Artist'].value_counts().head(5).index

# Keep only the rows belonging to those artists
df_top_artists = df[df['Artist'].isin(top_artists)]

# Swarm plot; `order` fixes the x-axis to the track-count ranking so both
# plots list the artists identically.
plt.figure(figsize=(10, 6))
sns.swarmplot(x='Artist', y='Popularity', data=df_top_artists, order=top_artists)
plt.title('Track Popularity Distribution Across Top Artists')
plt.xlabel('Artist')
plt.ylabel('Popularity')
plt.show()

# Violin plot of the same data as an alternative view
plt.figure(figsize=(10, 6))
sns.violinplot(x='Artist', y='Popularity', data=df_top_artists, order=top_artists)
plt.title('Track Popularity Distribution Across Top Artists')
plt.xlabel('Artist')
plt.ylabel('Popularity')
plt.show()

