# DataPrep

The dataset for this project needs at least 10 columns, and the Tableau visuals built from it should be attractive and help tell a story.

# Library Imports

In [5]:
# Standard Library Imports
import pandas as pd

In [6]:
# Concert history: the full export plus the ranked 2024 subset
concerts = pd.read_csv("data/ConcertArchivesExportFeb22-25.csv")
twentyfour = pd.read_csv("data/2024concertsranked.csv")

# Year-end "best of" album/song lists, one CSV per year (oldest first)
bestof21 = pd.read_csv("data/bestof21.csv")
bestof22 = pd.read_csv("data/bestof22.csv")
bestof23 = pd.read_csv("data/bestof23.csv")
bestof24 = pd.read_csv("data/bestof24.csv")

# View Data

In [8]:
# Display the first few rows of each dataset
twentyfour.head()

Unnamed: 0,Ranking,Start Date,Concert Name,Bands,Venue,Location,Rating,Paid,Discount
0,1,02/08/2024,,Willi Carlisle / Golden Shoals,Cactus Club,"Milwaukee, Wisconsin, United States",5.0,1,
1,2,01/13/2024,Pool Sounds v3,Lady Bird / Max Niemann / Jacob Slade / Maximi...,Pool Studios,"Milwaukee, Wisconsin, United States",5.0,1,
2,3,03/31/2024,"Barely Civil ""I'd Say I'm Not Fine"" Record Rel...",Barely Civil / Magazine Beach / Overhand / Kno...,Cactus Club,"Milwaukee, Wisconsin, United States",5.0,1,
3,4,04/20/2024,Record Store Day 2024,Friko,Liliput Records,"Milwaukee, Wisconsin, United States",5.0,0,Record Store Day
4,5,08/02/2024,,J.E. Sunde / Caley Conway,Cactus Club,"Milwaukee, Wisconsin, United States",5.0,1,


In [9]:
twentyfour.shape

(57, 9)

In [10]:
concerts.head()

Unnamed: 0,Start Date,End Date,Concert Name,Bands,Venue,Location
0,02/22/2025,,Riverwest Radio Spotlight Series,Barely Civil / Wave Chapelle / Caley Conway / 2hi,Amorphic Beer,"Milwaukee, Wisconsin, United States"
1,01/29/2025,,the DIVE tour,Almost Monday / Adrian Lyles,Vivarium,"Milwaukee, Wisconsin, United States"
2,01/24/2025,,,Beach Bunny / Delaney Bailey,Turner Hall Ballroom,"Milwaukee, Wisconsin, United States"
3,01/23/2025,,,Orillia / Poor Ridley / Maximiano,Sugar Maple,"Milwaukee, Wisconsin, United States"
4,12/26/2024,,,Johanna Rose / Ellie Jackson / Bitch Creek,Cactus Club,"Milwaukee, Wisconsin, United States"


In [11]:
concerts.shape

(221, 6)

In [12]:
# Left-join the 2024 ranking columns onto the full concert history so every
# row of `concerts` is kept; rows without a match get NaN in the new columns.
# NOTE(review): the recorded outputs show 57 rows in `twentyfour` but only
# 221 - 173 = 48 non-null Rankings after this join, so some ranked concerts
# apparently do not match on all five keys -- confirm.
df_concerts = concerts.merge(
    twentyfour,
    how='left',
    on=['Start Date', 'Concert Name', 'Bands', 'Venue', 'Location'],
)

# Preview the joined frame
df_concerts.head()

Unnamed: 0,Start Date,End Date,Concert Name,Bands,Venue,Location,Ranking,Rating,Paid,Discount
0,02/22/2025,,Riverwest Radio Spotlight Series,Barely Civil / Wave Chapelle / Caley Conway / 2hi,Amorphic Beer,"Milwaukee, Wisconsin, United States",,,,
1,01/29/2025,,the DIVE tour,Almost Monday / Adrian Lyles,Vivarium,"Milwaukee, Wisconsin, United States",,,,
2,01/24/2025,,,Beach Bunny / Delaney Bailey,Turner Hall Ballroom,"Milwaukee, Wisconsin, United States",,,,
3,01/23/2025,,,Orillia / Poor Ridley / Maximiano,Sugar Maple,"Milwaukee, Wisconsin, United States",,,,
4,12/26/2024,,,Johanna Rose / Ellie Jackson / Bitch Creek,Cactus Club,"Milwaukee, Wisconsin, United States",26.0,4.5,1.0,


In [13]:
df_concerts.shape

(221, 10)

In [14]:
# Count the nulls per column of the merged frame
print(df_concerts.isna().sum())

Start Date        0
End Date        216
Concert Name    145
Bands             0
Venue             0
Location          0
Ranking         173
Rating          173
Paid            173
Discount        188
dtype: int64


## Drop Unnecessary Columns

In [16]:
# End Date and Concert Name are mostly empty (216 and 145 nulls out of 221
# rows in the null check), so remove both.
df_concerts = df_concerts.drop(['End Date', 'Concert Name'], axis=1)
df_concerts

Unnamed: 0,Start Date,Bands,Venue,Location,Ranking,Rating,Paid,Discount
0,02/22/2025,Barely Civil / Wave Chapelle / Caley Conway / 2hi,Amorphic Beer,"Milwaukee, Wisconsin, United States",,,,
1,01/29/2025,Almost Monday / Adrian Lyles,Vivarium,"Milwaukee, Wisconsin, United States",,,,
2,01/24/2025,Beach Bunny / Delaney Bailey,Turner Hall Ballroom,"Milwaukee, Wisconsin, United States",,,,
3,01/23/2025,Orillia / Poor Ridley / Maximiano,Sugar Maple,"Milwaukee, Wisconsin, United States",,,,
4,12/26/2024,Johanna Rose / Ellie Jackson / Bitch Creek,Cactus Club,"Milwaukee, Wisconsin, United States",26.0,4.5,1.0,
...,...,...,...,...,...,...,...,...
216,11/18/2011,The Avett Brothers,The Riverside Theater,"Milwaukee, Wisconsin, United States",,,,
217,07/26/2011,Blitzen Trapper / Ages and Ages,Slowdown,"Omaha, Nebraska, United States",,,,
218,07/11/2010,The Avett Brothers / Elephant Revival,Lake Superior Big Top Chautauqua,"Bayfield, WI",,,,
219,03/06/2010,The Low Anthem / The Avett Brothers,The Riverside Theater,"Milwaukee, Wisconsin, United States",,,,


## Fill Remaining Nulls

In [18]:
# Replace the remaining nulls, one default per column.
# NOTE(review): 0 is used as the "no ranking / no rating" sentinel, so unranked
# concerts are indistinguishable from a real 0 and will pull down averages --
# confirm this is what the Tableau views expect.
df_concerts = df_concerts.fillna({
    'Ranking': 0,
    'Rating': 0,
    'Paid': 1,  # a missing value is assumed to mean the ticket was paid for
    'Discount': "Unknown",
})

# Preview the filled frame
df_concerts.head()

Unnamed: 0,Start Date,Bands,Venue,Location,Ranking,Rating,Paid,Discount
0,02/22/2025,Barely Civil / Wave Chapelle / Caley Conway / 2hi,Amorphic Beer,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown
1,01/29/2025,Almost Monday / Adrian Lyles,Vivarium,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown
2,01/24/2025,Beach Bunny / Delaney Bailey,Turner Hall Ballroom,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown
3,01/23/2025,Orillia / Poor Ridley / Maximiano,Sugar Maple,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown
4,12/26/2024,Johanna Rose / Ellie Jackson / Bitch Creek,Cactus Club,"Milwaukee, Wisconsin, United States",26.0,4.5,1.0,Unknown


In [19]:
# Parse the MM/DD/YYYY strings into real datetimes
df_concerts['Start Date'] = pd.to_datetime(df_concerts['Start Date'], format='%m/%d/%Y')

# Keep only the concerts whose start date falls in 2024
df_2024 = df_concerts.loc[df_concerts['Start Date'].dt.year.eq(2024)]

# Preview the 2024 subset
df_2024.head()

Unnamed: 0,Start Date,Bands,Venue,Location,Ranking,Rating,Paid,Discount
4,2024-12-26,Johanna Rose / Ellie Jackson / Bitch Creek,Cactus Club,"Milwaukee, Wisconsin, United States",26.0,4.5,1.0,Unknown
5,2024-11-13,Modest Mouse / The Black Heart Procession,The Riverside Theater,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown
6,2024-11-09,Haley Heynderickx / Lily Breshears,Vivarium,"Milwaukee, Wisconsin, United States",7.0,5.0,0.0,Street Team
7,2024-11-02,The Menzingers / Direct Hit! / Holy Pinto,The Rave,"Milwaukee, Wisconsin, United States",15.0,4.5,0.0,Rave Contest
8,2024-10-24,The Nunnery / Dosh / Luke Callen / Maximiano /...,Cactus Club,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown


## Extract a list of unique venues visited in 2024

In [21]:
# Distinct venue names, in order of first appearance
# NOTE(review): the recorded output lists both 'The Rave' and
# 'The Rave/Eagles Club'; venue names are only standardised in a later cell.
unique_venues = df_2024['Venue'].unique()

# Wrap them in a one-column frame so they can be exported
df_unique_venues = pd.DataFrame({'Venue': unique_venues})

# Show the venue list
df_unique_venues

Unnamed: 0,Venue
0,Cactus Club
1,The Riverside Theater
2,Vivarium
3,The Rave
4,Riverwalk Commons
5,Pabst Theater
6,88Nine Radio Milwaukee Studios
7,X-Ray Arcade
8,The Rave/Eagles Club
9,Miller High Life Theatre


In [22]:
# Save to CSV
df_unique_venues.to_csv("data/unique_venues_2024.csv", index=False)

#### I manually entered each venue's distance from home, travel time, and capacity in Google Sheets

In [24]:
# Load the venue details CSV
venue_info = pd.read_csv("data/venue_info_2024.csv")

# Display the first few rows
venue_info.head()

Unnamed: 0,Venue,dist_from_home,drive_time,capacity
0,Cactus Club,9.5,15,150
1,The Riverside Theater,5.5,12,2450
2,Vivarium,5.5,16,450
3,The Rave,4.9,10,1000
4,Riverwalk Commons,6.4,12,100


In [25]:
## Merge Back with df_2024

In [26]:
# Attach the hand-entered venue details to each 2024 concert.
# A left join keeps every concert even when its venue has no details row.
df_2024 = df_2024.merge(venue_info, how='left', on='Venue')

# Preview the enriched frame
df_2024.head()

Unnamed: 0,Start Date,Bands,Venue,Location,Ranking,Rating,Paid,Discount,dist_from_home,drive_time,capacity
0,2024-12-26,Johanna Rose / Ellie Jackson / Bitch Creek,Cactus Club,"Milwaukee, Wisconsin, United States",26.0,4.5,1.0,Unknown,9.5,15,150
1,2024-11-13,Modest Mouse / The Black Heart Procession,The Riverside Theater,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown,5.5,12,2450
2,2024-11-09,Haley Heynderickx / Lily Breshears,Vivarium,"Milwaukee, Wisconsin, United States",7.0,5.0,0.0,Street Team,5.5,16,450
3,2024-11-02,The Menzingers / Direct Hit! / Holy Pinto,The Rave,"Milwaukee, Wisconsin, United States",15.0,4.5,0.0,Rave Contest,4.9,10,1000
4,2024-10-24,The Nunnery / Dosh / Luke Callen / Maximiano /...,Cactus Club,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown,9.5,15,150


In [27]:
df_2024.head()

Unnamed: 0,Start Date,Bands,Venue,Location,Ranking,Rating,Paid,Discount,dist_from_home,drive_time,capacity
0,2024-12-26,Johanna Rose / Ellie Jackson / Bitch Creek,Cactus Club,"Milwaukee, Wisconsin, United States",26.0,4.5,1.0,Unknown,9.5,15,150
1,2024-11-13,Modest Mouse / The Black Heart Procession,The Riverside Theater,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown,5.5,12,2450
2,2024-11-09,Haley Heynderickx / Lily Breshears,Vivarium,"Milwaukee, Wisconsin, United States",7.0,5.0,0.0,Street Team,5.5,16,450
3,2024-11-02,The Menzingers / Direct Hit! / Holy Pinto,The Rave,"Milwaukee, Wisconsin, United States",15.0,4.5,0.0,Rave Contest,4.9,10,1000
4,2024-10-24,The Nunnery / Dosh / Luke Callen / Maximiano /...,Cactus Club,"Milwaukee, Wisconsin, United States",0.0,0.0,1.0,Unknown,9.5,15,150


In [28]:
# Alternate venue spellings -> the canonical name to use everywhere
venue_name_mapping = {
    'The Rave/Eagles Club': 'The Rave',
    'Marcus Performing Arts Center': 'Marcus Center for the Performing Arts',
    'Summerfest Grounds at Henry Maier Festival Park-M & I Bank Classic Rock Stage': 'Summerfest Grounds at Henry Maier Festival Park',
}

# Apply the same renaming to the concert rows and to the venue details.
# NOTE(review): this runs after df_2024 was merged with venue_info, so that
# join used the original names; if venue_info has rows for both spellings of a
# venue it will now contain duplicate names -- confirm the intended order.
for frame in (df_2024, venue_info):
    frame['Venue'] = frame['Venue'].replace(venue_name_mapping)


In [29]:
# Rank follows row order within each yearly list: the index starts at 0,
# so the first row becomes rank 1.
for yearly_list in (bestof24, bestof23, bestof22, bestof21):
    yearly_list['Rank'] = yearly_list.index + 1

# Preview the 2024 list with its new Rank column
bestof24.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Rank
0,24TyIHRNtcNihfFoWKkqzP,Where We've Been,"Where we've been, Where we go from here",Friko,2024-02-16,315633,32,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:12:53Z,,...,-10.981,1,0.0437,0.787,0.0412,0.0876,0.116,146.426,4,1
1,78SjYKRXdivtTDHBzaehnm,The Architect,Deeper Well,Kacey Musgraves,2024-03-15,177293,55,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:13:50Z,,...,-8.603,1,0.032,0.842,0.0,0.0858,0.441,123.285,4,2
2,42gvftYbB8lU8B6pP7Hp6k,Jaybird,Critterland,Willi Carlisle,2024-01-26,233091,22,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:14:05Z,"alt country,americana,bluegrass",...,-10.13,1,0.0303,0.647,4.4e-05,0.156,0.527,122.981,4,3
3,0nj9Bq5sHDiTxSHunhgkFb,squabble up,GNX,Kendrick Lamar,2024-11-22,157992,88,mmr4r23xnc6oh1c77lysfbqg4,2024-12-03T14:02:59Z,"hip hop,west coast hip hop",...,-5.568,1,0.198,0.0206,0.0,0.0783,0.711,103.921,4,4
4,1Ov33kwQ8c0ZnKIiHo7yl6,Summer Bodies,No Souvenirs,Fightmilk,2024-11-15,228506,5,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:14:40Z,,...,-5.166,1,0.0563,0.0182,1.5e-05,0.13,0.395,127.861,4,5


In [30]:
# Tag every row with the year of the list it came from
for list_year, yearly_list in ((2024, bestof24), (2023, bestof23), (2022, bestof22), (2021, bestof21)):
    yearly_list['Year'] = list_year

# Stack the four lists (newest first) under a fresh 0..n-1 index
bestof_all = pd.concat([bestof24, bestof23, bestof22, bestof21], ignore_index=True)

# Preview the combined frame
bestof_all.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Rank,Year
0,24TyIHRNtcNihfFoWKkqzP,Where We've Been,"Where we've been, Where we go from here",Friko,2024-02-16,315633,32,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:12:53Z,,...,1,0.0437,0.787,0.0412,0.0876,0.116,146.426,4,1,2024
1,78SjYKRXdivtTDHBzaehnm,The Architect,Deeper Well,Kacey Musgraves,2024-03-15,177293,55,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:13:50Z,,...,1,0.032,0.842,0.0,0.0858,0.441,123.285,4,2,2024
2,42gvftYbB8lU8B6pP7Hp6k,Jaybird,Critterland,Willi Carlisle,2024-01-26,233091,22,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:14:05Z,"alt country,americana,bluegrass",...,1,0.0303,0.647,4.4e-05,0.156,0.527,122.981,4,3,2024
3,0nj9Bq5sHDiTxSHunhgkFb,squabble up,GNX,Kendrick Lamar,2024-11-22,157992,88,mmr4r23xnc6oh1c77lysfbqg4,2024-12-03T14:02:59Z,"hip hop,west coast hip hop",...,1,0.198,0.0206,0.0,0.0783,0.711,103.921,4,4,2024
4,1Ov33kwQ8c0ZnKIiHo7yl6,Summer Bodies,No Souvenirs,Fightmilk,2024-11-15,228506,5,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:14:40Z,,...,1,0.0563,0.0182,1.5e-05,0.13,0.395,127.861,4,5,2024


In [31]:
# Define a function to categorize albums
def categorize_album(rank):
    """Return the tier label for a rank.

    The cut-offs are checked from smallest to largest and the label of the
    first cut-off that `rank` does not exceed is returned. A rank above 100,
    or a value for which every comparison is false (such as NaN), yields
    'Not in Top 100'.
    """
    tier_cutoffs = (
        (5, 'Top 5'),
        (10, 'Top 10'),
        (25, 'Top 25'),
        (50, 'Top 50'),
        (100, 'Top 100'),
    )
    for cutoff, label in tier_cutoffs:
        if rank <= cutoff:
            return label
    return 'Not in Top 100'

# Label every row with the tier its Rank falls into
bestof_all['Tier'] = bestof_all['Rank'].map(categorize_album)

# Preview the frame with the new Tier column
bestof_all.head()

Unnamed: 0,Track ID,Track Name,Album Name,Artist Name(s),Release Date,Duration (ms),Popularity,Added By,Added At,Genres,...,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Rank,Year,Tier
0,24TyIHRNtcNihfFoWKkqzP,Where We've Been,"Where we've been, Where we go from here",Friko,2024-02-16,315633,32,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:12:53Z,,...,0.0437,0.787,0.0412,0.0876,0.116,146.426,4,1,2024,Top 5
1,78SjYKRXdivtTDHBzaehnm,The Architect,Deeper Well,Kacey Musgraves,2024-03-15,177293,55,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:13:50Z,,...,0.032,0.842,0.0,0.0858,0.441,123.285,4,2,2024,Top 5
2,42gvftYbB8lU8B6pP7Hp6k,Jaybird,Critterland,Willi Carlisle,2024-01-26,233091,22,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:14:05Z,"alt country,americana,bluegrass",...,0.0303,0.647,4.4e-05,0.156,0.527,122.981,4,3,2024,Top 5
3,0nj9Bq5sHDiTxSHunhgkFb,squabble up,GNX,Kendrick Lamar,2024-11-22,157992,88,mmr4r23xnc6oh1c77lysfbqg4,2024-12-03T14:02:59Z,"hip hop,west coast hip hop",...,0.198,0.0206,0.0,0.0783,0.711,103.921,4,4,2024,Top 5
4,1Ov33kwQ8c0ZnKIiHo7yl6,Summer Bodies,No Souvenirs,Fightmilk,2024-11-15,228506,5,mmr4r23xnc6oh1c77lysfbqg4,2024-12-02T20:14:40Z,,...,0.0563,0.0182,1.5e-05,0.13,0.395,127.861,4,5,2024,Top 5


In [32]:
bestof_all.columns

Index(['Track ID', 'Track Name', 'Album Name', 'Artist Name(s)',
       'Release Date', 'Duration (ms)', 'Popularity', 'Added By', 'Added At',
       'Genres', 'Record Label', 'Danceability', 'Energy', 'Key', 'Loudness',
       'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo', 'Time Signature', 'Rank', 'Year', 'Tier'],
      dtype='object')

## Drop Unnecessary Columns

In [34]:
# Keep only the columns needed downstream.
# 'Popularity' is included because the recorded output of this cell shows it
# and the later "keep the most popular song from each album" step needs it.
# .copy() makes the result an independent frame, so adding columns to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
bestof_all = bestof_all[
    ['Artist Name(s)', 'Album Name', 'Release Date', 'Rank', 'Year', 'Tier', 'Popularity']
].copy()

# Display the updated dataset
bestof_all.head()

Unnamed: 0,Artist Name(s),Album Name,Release Date,Rank,Year,Tier,Popularity
0,Friko,"Where we've been, Where we go from here",2024-02-16,1,2024,Top 5,32
1,Kacey Musgraves,Deeper Well,2024-03-15,2,2024,Top 5,55
2,Willi Carlisle,Critterland,2024-01-26,3,2024,Top 5,22
3,Kendrick Lamar,GNX,2024-11-22,4,2024,Top 5,88
4,Fightmilk,No Souvenirs,2024-11-15,5,2024,Top 5,5


In [35]:
# Define a function to extract the primary artist name
def extract_primary_artist(artist_name, comma_names=("Tyler, The Creator",)):
    """Return the first-listed artist from a comma-separated artist string.

    Parameters
    ----------
    artist_name : str or missing value
        Raw artist field, e.g. "Artist A,Artist B".
    comma_names : iterable of str, optional
        Artist names that contain a comma themselves and therefore must not
        be split. Defaults to ("Tyler, The Creator",).

    Returns
    -------
    str or None
        The primary (first-listed) artist with surrounding whitespace
        removed, or None when `artist_name` is missing.
    """
    if pd.isna(artist_name):
        return None

    cleaned = artist_name.strip()

    # A comma-containing name only counts as the primary artist when it is
    # listed first. Checking with `in` would wrongly return it for strings
    # such as "Someone Else,Tyler, The Creator" where it is a featured artist.
    for name in comma_names:
        if cleaned.startswith(name):
            remainder = cleaned[len(name):].lstrip()
            # The name must be the whole string or be followed by a separator.
            if not remainder or remainder.startswith(','):
                return name

    # Otherwise the primary artist is everything before the first comma.
    return cleaned.split(',')[0].strip()

# Derive the primary artist for every row from the raw artist field
bestof_all['Primary Artist'] = bestof_all['Artist Name(s)'].map(extract_primary_artist)

# The raw field is no longer needed once the primary artist is extracted
bestof_all = bestof_all.drop('Artist Name(s)', axis=1)

# Preview the frame with the new Primary Artist column
bestof_all.head()

Unnamed: 0,Album Name,Release Date,Rank,Year,Tier,Popularity,Primary Artist
0,"Where we've been, Where we go from here",2024-02-16,1,2024,Top 5,32,Friko
1,Deeper Well,2024-03-15,2,2024,Top 5,55,Kacey Musgraves
2,Critterland,2024-01-26,3,2024,Top 5,22,Willi Carlisle
3,GNX,2024-11-22,4,2024,Top 5,88,Kendrick Lamar
4,No Souvenirs,2024-11-15,5,2024,Top 5,5,Fightmilk


#### 2025 albums include every track on the album; keep the most popular song from each, and then change the tier name

Unnamed: 0,Album Name,Release Date,Rank,Year,Tier,Popularity,Primary Artist
