In [68]:
import pandas as pd
import json
import time
import numpy as np
from IPython.display import display
import ipywidgets as widgets

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

In [70]:
# youtube
import os

from googleapiclient.discovery import build

# The API key is read from the environment instead of being written into the
# notebook, so it is not leaked through the file, its outputs or version control.
# Export YOUTUBE_API_KEY before starting the kernel.
API_KEY = os.environ.get('YOUTUBE_API_KEY')
if not API_KEY:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set; export it before running this notebook."
    )

# Initialize the YouTube Data API client
api_service_name = "youtube"
api_version = "v3"
youtube = build(api_service_name, api_version, developerKey=API_KEY)

In [71]:
# spotify
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Credentials come from the environment (the variable names spotipy itself
# recognises) rather than being hardcoded in the notebook.
client_id = os.environ.get("SPOTIPY_CLIENT_ID")
client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET")
if not (client_id and client_secret):
    raise RuntimeError(
        "SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET must be set before running this notebook."
    )

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [72]:
## Step 1: Import Pollstar Data

In [73]:
# Read the Pollstar export into a DataFrame and preview its first three rows
data = pd.read_csv(filepath_or_buffer='a-round-pollstar.csv')
data.iloc[:3]

Unnamed: 0.1,Unnamed: 0,Event Date,# Shows,Headliner,Support,Venue,City,State,Country,Market,...,Promoter,Genre,Avg. Tickets Sold,Avg. Gross USD,Avg. Event Capacity,Avg. Capacity Sold,Ticket Price Min,Ticket Price Max,Ticket Price Avg. USD,Unnamed: 21
0,1,2023-11-04,1,xikers,,Warfield Theatre,San Francisco,California,United States,San Francisco-Oakland-San Jose,...,Sean Healy Presents,Asian Pop,1261.0,152175.02,1780.0,71.0,50.0,175.0,120.68,
1,2,2023-11-03,1,EVERGLOW,,Kings Theatre,Brooklyn,New York,United States,New York,...,Sean Healy Presents,Asian Pop,941.0,100970.0,2312.0,41.0,60.0,195.0,107.3,
2,3,2023-11-01,1,Xikers,,Orpheum Theatre,Los Angeles,California,United States,Los Angeles,...,Sean Healy Presents,Asian Pop,1812.0,209839.65,1924.0,94.0,50.0,175.0,115.81,


In [74]:
# Data preprocessing for pollstar data

# Parse 'Event Date'; values that cannot be parsed are coerced to NaT
data['Event Date'] = pd.to_datetime(data['Event Date'], errors='coerce')

# Discard the rows whose date became NaT and renumber the index from zero
data = data.dropna(subset=['Event Date']).reset_index(drop=True)

In [75]:
# Extract the month from 'Event Date' to represent seasonality.
# The column was already converted to datetime by the preprocessing cell, so it
# is not re-parsed here; the previous re-parse passed format='%m/%d/%Y', which
# does not match the ISO (YYYY-MM-DD) dates in the file and was only harmless
# because the column was no longer made of strings.
data['Month'] = data['Event Date'].dt.month

In [76]:
# Remove commas and convert to float.
# A numeric column is converted directly; for a non-numeric column, thousands
# separators are stripped from string cells first (e.g. "152,175.02") so the
# float conversion does not fail on them. Non-string cells pass through untouched.
gross_usd = data['Avg. Gross USD']
if not pd.api.types.is_numeric_dtype(gross_usd):
    gross_usd = gross_usd.map(
        lambda cell: cell.replace(',', '') if isinstance(cell, str) else cell
    )
data['Avg. Gross USD'] = gross_usd.astype(float)

In [77]:
# Build a single "City_State_Country" key per event from the three columns
data['Location'] = (
    data['City']
    .add('_')
    .add(data['State'])
    .add('_')
    .add(data['Country'])
)

In [78]:
## Step 2: Extract artist names

In [79]:
# Distinct headliner values, in order of first appearance
artist_names = pd.unique(data['Headliner'])

# Show the distinct headliner names
print(artist_names)

['xikers' 'EVERGLOW' 'Xikers' 'The Rose' 'ENHYPEN'
 '"Mcountdown In France"' 'tripleS' 'Stay-C' 'Radwimps'
 'BABYMETAL, Metalocalypse: Dethklok' 'P1harmony' 'Loona' 'Twice'
 'Jonathan Lee' 'SEVENTEEN' 'ATEEZ' 'TOMORROW X TOGETHER'
 '"Lollapalooza Aftershows", The Rose' 'Suga' 'Rain' 'Enhypen' 'KARD'
 'NCT Dream' 'BLACKPINK' 'Red Velvet' 'Nmixx' 'Mayday' 'JJ Lin' 'Sabaton'
 'Baek Z Young' 'Stray Kids' 'CIX' 'NCT 127' 'Seventeen'
 'DPR Live, DPR IAN, DPR Cream' 'ITZY' 'MUSTB' 'G I-DLE'
 'Tomorrow X Together' 'Monsta X' 'The Boyz' 'BTS' 'BABYMETAL' 'Day6'
 'Got7' 'Little Simz' 'Ateez' '"KCon", Mamamoo' 'Band Of Horses'
 '"Billboard Music Awards"' 'Fei Yu Ching' 'Winner' 'Ssingssing'
 'Jacky Cheung' 'Rene Liu' 'B.A.P.' 'Jane Zhang' 'Wanna One' 'Up10tion'
 'Leehom Wang' 'Zhang Jun' 'Nine Percent' 'Chris Lee (Li Yuchun)'
 'Yu-Ching Fei' 'Snh48' 'A-Mei' 'Kuang Program' 'FTIsland' 'Yu-Rong Yang'
 'Lala Hsu' 'G-Dragon' 'Silence (Wang Su Long)' 'Bibi Zhou' 'Taeyang'
 'Ronghao Li' 'Lion' 'Pu Shu'

In [80]:
## Step 3: Pull and Merge Youtube Data

In [81]:
def get_channel_basic_info_name(channel_names, client=None):
    """Look up basic YouTube channel metadata for each given name.

    For every name a channel search is issued and the first hit is taken as the
    matching channel; that channel's snippet and statistics are then fetched.
    The first hit is not necessarily the artist's official channel, so the
    result should be reviewed before it is relied on.

    Parameters
    ----------
    channel_names : iterable
        Names to search for. Entries that are not strings (e.g. NaN) or are
        blank are skipped without calling the API.
    client : optional
        Object exposing ``search()`` and ``channels()`` like the YouTube Data
        API client. Defaults to the module-level ``youtube`` client.

    Returns
    -------
    pandas.DataFrame
        One row per resolved name with the columns "yt name", "yt Channel ID",
        "yt Title", "yt Description", "yt Published At", "yt View Count",
        "yt Subscriber Count" and "yt Video Count". The three counts are
        integers (0 when the API omits the field). The columns are present even
        when no name could be resolved, so a later merge on "yt name" still works.

    Notes
    -----
    A failure for one name is printed and the loop continues with the next
    name (best effort); nothing is raised to the caller.
    """
    columns = [
        "yt name",
        "yt Channel ID",
        "yt Title",
        "yt Description",
        "yt Published At",
        "yt View Count",
        "yt Subscriber Count",
        "yt Video Count",
    ]
    if client is None:
        client = youtube

    channel_info_list = []

    for channel_name in channel_names:
        # Missing headliners show up as NaN; there is nothing to search for.
        if not isinstance(channel_name, str) or not channel_name.strip():
            continue

        try:
            # Search for channels with the specified name
            search_response = client.search().list(
                part='snippet',
                type='channel',
                q=channel_name
            ).execute()

            hits = search_response.get('items')
            if not hits:
                print(f"No channel found with the name: {channel_name}")
                continue

            # Select the most relevant channel from the search results
            channel_id = hits[0]['id']['channelId']

            # Retrieve snippet and statistics for that channel
            response = client.channels().list(
                part="snippet,statistics",
                id=channel_id
            ).execute()

            details = response.get('items')
            if not details:
                # The search returned an id that the channels endpoint did not resolve.
                print(f"No channel found with the name: {channel_name}")
                continue

            snippet = details[0]['snippet']
            statistics = details[0]['statistics']

            channel_info_list.append({
                "yt name": channel_name,
                "yt Channel ID": channel_id,
                "yt Title": snippet['title'],
                "yt Description": snippet['description'],
                "yt Published At": snippet['publishedAt'],
                # The API returns counts as strings and may omit a field;
                # normalise to int so the column has a single numeric type.
                "yt View Count": int(statistics.get('viewCount', 0)),
                "yt Subscriber Count": int(statistics.get('subscriberCount', 0)),
                "yt Video Count": int(statistics.get('videoCount', 0)),
            })

        except Exception as e:
            # Deliberate best effort: report and move on to the next name.
            print(f"Error occurred: {e}")

    # Passing the column list keeps the schema stable for an empty result.
    channel_df = pd.DataFrame(channel_info_list, columns=columns)
    return channel_df

In [83]:
# Resolve a YouTube channel for every headliner, then print the resulting table
yt_channel_info_df = get_channel_basic_info_name(channel_names=artist_names)
print(yt_channel_info_df)

{'yt name': 'xikers', 'yt Channel ID': 'UCPtC0MXSW40Qq7RkhsCXjUQ', 'yt Title': 'xikers', 'yt Description': 'xikers(싸이커스) Official YouTube Channel\n', 'yt Published At': '2022-11-15T14:21:56.347452Z', 'yt View Count': '37578254', 'yt Subscriber Count': '375000', 'yt Video Count': '846'}
{'yt name': 'EVERGLOW', 'yt Channel ID': 'UCg3scYhWcPmyfJcSkulJcow', 'yt Title': 'Everglow', 'yt Description': "Welcome to my channel! \nMy name is Marco, I'm from Italy and as you can see by a quick look, I focused all of my channel to the Kingdom Hearts video game saga.\nOn this channel I upload videos that I have filmed and edited myself. They're not just ORIGINAL videos but I also try and help with video guides and translations, all of them realized by me. Every video shows step-by-step the ways to beat the KH games.\n\nI also work on translations for the Japanese exclusive story updates of Kingdom Hearts Union x [Cross]. \nIt's a very hard work that you can't find anywhere else, since every dialogue

{'yt name': 'BLACKPINK', 'yt Channel ID': 'UCOmHUn--16B90oW2L6FRR3A', 'yt Title': 'BLACKPINK', 'yt Description': 'BLACKPINK Official YouTube Channel\n블랙핑크 공식 유튜브 채널입니다.\n\nJISOO, JENNIE, ROSÉ, LISA\n지수, 제니, 로제, 리사', 'yt Published At': '2016-06-29T03:15:23Z', 'yt View Count': '34947503963', 'yt Subscriber Count': '93100000', 'yt Video Count': '592'}
{'yt name': 'Red Velvet', 'yt Channel ID': 'UCk9GmdlDTBfgGRb7vXeRMoQ', 'yt Title': 'Red Velvet', 'yt Description': 'WENDY 【Wish You Hell - The 2nd Mini Album】 ➫ 2024.03.12 6PM KST\n\nWelcome to Red Velvet official YouTube Channel', 'yt Published At': '2014-07-23T13:12:45Z', 'yt View Count': '1584351799', 'yt Subscriber Count': '5420000', 'yt Video Count': '383'}
{'yt name': 'Nmixx', 'yt Channel ID': 'UCnUAyD4t2LkvW68YrDh7fDg', 'yt Title': 'NMIXX', 'yt Description': 'NMIXX Official YouTube\n', 'yt Published At': '2021-07-12T03:58:53.536982Z', 'yt View Count': '864209571', 'yt Subscriber Count': '2490000', 'yt Video Count': '770'}
{'yt name': 

{'yt name': 'Little Simz', 'yt Channel ID': 'UC89IYKXyf15qNezA7IuFG1A', 'yt Title': 'Little Simz', 'yt Description': 'Drop 7 - out now! https://littlesimz.ffm.to/drop7\n\nTo keep up to date on brand new tracks, tour information and exclusive content, remember to hit the "Subscribe" button!\n\nFollow Little Simz:\nWebsite - https://littlesimz.ffm.to/website.oyd\nInstagram - https://littlesimz.ffm.to/instagram.oyd\nFacebook - https://littlesimz.ffm.to/facebook.oyd\nTwitter - https://littlesimz.ffm.to/twitter.oyd\nApple - https://littlesimz.ffm.to/applemusic.oyd\nSpotify - https://littlesimz.ffm.to/spotify.oyd\n', 'yt Published At': '2012-12-18T17:34:41Z', 'yt View Count': '86535467', 'yt Subscriber Count': '466000', 'yt Video Count': '118'}
{'yt name': 'Ateez', 'yt Channel ID': 'UC2e4Ukj5Pfr7cb3KpJAFBdQ', 'yt Title': 'ATEEZ', 'yt Description': 'ATEEZ(에이티즈) Official YouTube Channel', 'yt Published At': '2018-06-27T01:57:58Z', 'yt View Count': '856070112', 'yt Subscriber Count': '3760000',

{'yt name': 'FTIsland', 'yt Channel ID': 'UCi00aT1kuoR7QNa45Xnwd7Q', 'yt Title': 'FTISLAND (FT아일랜드)', 'yt Description': 'FTISLAND Official YouTube Channel', 'yt Published At': '2007-06-02T00:24:39Z', 'yt View Count': '272102669', 'yt Subscriber Count': '634000', 'yt Video Count': '287'}
{'yt name': 'Yu-Rong Yang', 'yt Channel ID': 'UCxRoQbrTWX24AgpNLGSp3kQ', 'yt Title': 'yu rong yang', 'yt Description': '', 'yt Published At': '2017-04-16T07:08:18Z', 'yt View Count': '18', 'yt Subscriber Count': '0', 'yt Video Count': '5'}
{'yt name': 'Lala Hsu', 'yt Channel ID': 'UCJpFZyuCDfkif5Qt74Re0kA', 'yt Title': 'LaLa 徐佳瑩', 'yt Description': '', 'yt Published At': '2012-01-19T09:09:51Z', 'yt View Count': '203568588', 'yt Subscriber Count': '174000', 'yt Video Count': '78'}
{'yt name': 'G-Dragon', 'yt Channel ID': 'UCRfVrZpe3TA-LB4rutWAtkg', 'yt Title': 'G-DRAGON - Topic', 'yt Description': '', 'yt Published At': '2013-09-27T04:41:07Z', 'yt View Count': '247347058', 'yt Subscriber Count': '236000'

{'yt name': 'EXO', 'yt Channel ID': 'UCEUX9tUYqTFfPQdAgVNsKTA', 'yt Title': 'EXO - Topic', 'yt Description': 'EXO, which is derived from ‘EXOPLANET’, a term denoting the planets outside the solar system, is a 9-member K-pop boy band. After making their anticipated debut in April 2012, EXO is now one of the “biggest K-POP boy band in the world”. Forbes noted their immense popularity on its “Korea Power Celebrity List”, which named them one of the Top 5 K-pop groups in 2014, 2017, and 2018. \n \nEXO’s first album XOXO (2013), containing the breakthrough hit “Growl” was a critical and commercial success. The album sold over 1 million copies, making them the first million-selling Korean artist in 12 years. Their subsequent releases also had strong sales, with all full-length albums each selling more than 1 million copies. EXO’s album Don’t Mess Up My Tempo (2018) is their highest charting-album on the US Billboard 200, debuting at number 23. \n\nSince their first headlining tour in 2014, E

{'yt name': 'Zhang Hang', 'yt Channel ID': 'UCVQdENY_YMQJ5Wuy2AOhGCw', 'yt Title': 'zhang hang', 'yt Description': '', 'yt Published At': '2018-02-27T08:24:39Z', 'yt View Count': '6458', 'yt Subscriber Count': '17', 'yt Video Count': '33'}
Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/channels?part=snippet%2Cstatistics&id=UCZ7B1oCEDT_d4c9XTtChX7w&key=AIzaSyBwuVXHMSMJB5XPoGcbeRuL8rBpKf5Cp80&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
{'yt name': 'Bigbang (Korea)', 'yt Channel ID': 'UCn5Fol5NEcyotSwqrxsedtA', 'yt Title': 'Bigbear Korea channel', 'yt Description': 'ทางช่องเป็นเพียงสื่อกลางในการนำเสนอแนวทางตัวเลขสำหรับซื้อสลากกินแบ่งรัฐบาลที่ถูกกฎหมายเท่านั

Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?part=snippet&type=channel&q=FTIsland%2C+CNBlue&key=AIzaSyBwuVXHMSMJB5XPoGcbeRuL8rBpKf5Cp80&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
Error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/channels?part=snippet%2Cstatistics&id=UCsPtPer9uD6zxGiWTk95-9Q&key=AIzaSyBwuVXHMSMJB5XPoGcbeRuL8rBpKf5Cp80&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</

In [84]:
# Attach the YouTube channel columns to each event, matching the event's
# headliner against the searched name; events without a match are kept
merged_yt = data.merge(
    yt_channel_info_df,
    left_on='Headliner',
    right_on='yt name',
    how='left',
)

## Step 4: Pull and Merge Spotify Data

In [85]:
def get_artist_basic_info_by_name(artist_names, sp):
    """Look up basic Spotify artist metadata for each given name.

    Each name is searched as an artist and the first hit is used. Genres and
    popularity are read from that search hit; the follower total is read from
    a follow-up ``sp.artist`` call on the hit's id.

    Parameters
    ----------
    artist_names : iterable
        Names to search for. Entries that are not strings (e.g. NaN) or are
        blank are skipped without calling the API.
    sp : object
        Spotify client exposing ``search(name, type='artist')`` and
        ``artist(artist_id)``.

    Returns
    -------
    pandas.DataFrame
        One row per resolved name with the columns "sp artist_name" (the name
        that was searched, not Spotify's spelling), "sp artist_genre",
        "sp followers" and "sp popularity". The columns are present even when
        no name could be resolved.
    """
    columns = ['sp artist_name', 'sp artist_genre', 'sp followers', 'sp popularity']
    records = []

    for artist_name in artist_names:
        # Missing headliners show up as NaN; there is nothing to search for.
        if not isinstance(artist_name, str) or not artist_name.strip():
            continue

        result = sp.search(artist_name, type='artist')  # Search for the artist

        # A search response is a non-empty dict even when nothing matched, so
        # the hit list itself has to be checked before indexing into it.
        hits = ((result or {}).get('artists') or {}).get('items') or []
        if not hits:
            print(f"No artist found for {artist_name}")
            continue

        artist = hits[0]
        artist_info = sp.artist(artist['id'])

        records.append({
            'sp artist_name': artist_name,
            'sp artist_genre': artist['genres'],
            'sp followers': artist_info['followers']['total'],
            'sp popularity': artist['popularity'],
        })

    df_basic_spotify_records = pd.DataFrame(records, columns=columns)
    return df_basic_spotify_records


In [86]:
# Resolve a Spotify artist for every headliner, then print the resulting table
basic_spotify_records_df = get_artist_basic_info_by_name(artist_names=artist_names, sp=sp)
print(basic_spotify_records_df)

             sp artist_name  \
0                    xikers   
1                  EVERGLOW   
2                    Xikers   
3                  The Rose   
4                   ENHYPEN   
..                      ...   
131         Chang Cheng-Yue   
132  Chang Cheng-Yue, Free9   
133               Box-O-Car   
134                  Danzig   
135                 Anthrax   

                                       sp artist_genre  sp followers  \
0                                    [k-pop boy group]        332220   
1                            [k-pop, k-pop girl group]       2098681   
2                                    [k-pop boy group]        332220   
3                             [k-pop, k-pop boy group]       2605594   
4                             [anime, k-pop boy group]       8062354   
..                                                 ...           ...   
131                                                 []         51741   
132                                                

In [87]:
# Attach the Spotify columns to the YouTube-enriched events, matching the
# event's headliner against the searched name; unmatched events are kept
merged_yt_sp = merged_yt.merge(
    basic_spotify_records_df,
    left_on='Headliner',
    right_on='sp artist_name',
    how='left',
)

In [88]:
## Step 5: Save Data

In [89]:
# Write the fully merged table to a single-sheet workbook; the context manager
# saves and closes the file when the block exits
with pd.ExcelWriter('Merged_data.xlsx', engine='xlsxwriter') as excel_out:
    merged_yt_sp.to_excel(excel_out, sheet_name='merged_yt_sp', index=False)