# Spotify API

## Get the Spotify client ID and client secret from the .env file

In [None]:
!pip install load_dotenv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting load_dotenv
  Downloading load_dotenv-0.1.0-py3-none-any.whl (7.2 kB)
Collecting python-dotenv (from load_dotenv)
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, load_dotenv
Successfully installed load_dotenv-0.1.0 python-dotenv-1.0.0


In [None]:
from dotenv import load_dotenv
import os
import base64
from requests import post, get
import json
import time

In [None]:
# Load the .env file into the process environment, then read the two Spotify
# credentials from it. Either value is None when its variable is not set.
load_dotenv()
client_id, client_secret = (os.getenv(key) for key in ("CLIENT_ID", "CLIENT_SECRET"))

## API functions

In [None]:
def get_token():
    """Request an access token from Spotify using the client-credentials grant.

    Reads the module-level ``client_id`` and ``client_secret`` and sends them
    as a Basic authorization header to the Spotify accounts token endpoint.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        RuntimeError: If ``client_id`` or ``client_secret`` is missing or empty.
        requests.HTTPError: If the token endpoint returns a 4xx/5xx status.
    """
    # Fail early with a clear message; otherwise a missing credential shows up
    # as a TypeError from concatenating None with a string.
    if not client_id or not client_secret:
        raise RuntimeError(
            "CLIENT_ID and CLIENT_SECRET must be set (for example in the .env file)"
        )

    auth_bytes = f"{client_id}:{client_secret}".encode("utf-8")
    auth_base64 = base64.b64encode(auth_bytes).decode("utf-8")

    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {"grant_type": "client_credentials"}

    response = post(url, headers=headers, data=data)
    # Surface authentication/server errors here instead of as a KeyError on
    # "access_token" below.
    response.raise_for_status()
    return response.json()["access_token"]

In [None]:
def get_auth_headers(token):
    """Return the Authorization header dict carrying ``token`` as a bearer token."""
    bearer_value = "Bearer " + token
    return {"Authorization": bearer_value}

In [None]:
def search_for_podcast(token, podcast_name):
    """Search Spotify for a show by name and return the first match.

    Args:
        token: Access token passed to ``get_auth_headers``.
        podcast_name: Free-text show name used as the search query.

    Returns:
        The first entry of ``shows.items`` from the search response, or
        ``None`` (after printing "No podcast") when there are no matches.

    Raises:
        requests.HTTPError: If the search endpoint returns a 4xx/5xx status.
    """
    url = "https://api.spotify.com/v1/search"
    headers = get_auth_headers(token)
    # Pass the query as params so requests URL-encodes it; a name containing
    # characters such as "&" or "#" would otherwise corrupt the query string.
    params = {"q": podcast_name, "type": "show", "market": "ID"}

    response = get(url, headers=headers, params=params)
    response.raise_for_status()
    shows = response.json()["shows"]["items"]
    if not shows:
        print("No podcast")
        return None
    return shows[0]

In [None]:
def get_podcast_by_id(token, podcast_id):
    """Fetch a show's name and first image URL by its Spotify show ID.

    Args:
        token: Access token passed to ``get_auth_headers``.
        podcast_id: Show ID inserted into the ``/v1/shows/{id}`` path.

    Returns:
        A ``(name, image_url)`` tuple. ``image_url`` is the ``url`` of the
        first entry in ``images``, or ``None`` when that list is empty.

    Raises:
        requests.HTTPError: If the endpoint returns a 4xx/5xx status.
    """
    url = f"https://api.spotify.com/v1/shows/{podcast_id}"
    response = get(url, headers=get_auth_headers(token), params={"market": "ID"})
    # Report HTTP failures directly rather than as a KeyError on "name".
    response.raise_for_status()
    show = response.json()

    podcasts_name = show["name"]
    images = show["images"]
    # An empty image list would otherwise raise IndexError on images[0].
    podcasts_img = images[0]["url"] if images else None
    return podcasts_name, podcasts_img

In [None]:
def get_episodes_by_id(token, podcast_id):
    """List the episodes of a show as three parallel lists.

    Args:
        token: Access token passed to ``get_auth_headers``.
        podcast_id: Show ID inserted into the ``/v1/shows/{id}/episodes`` path.

    Returns:
        A ``(episode_ids, episodes_names, episodes_urls)`` tuple built from the
        ``id``, ``name`` and ``href`` fields of each entry in ``items``.

    Raises:
        requests.HTTPError: If the endpoint returns a 4xx/5xx status.

    Note:
        No ``limit``/``offset`` is sent and no further pages are requested, so
        only the page the API returns by default is included.
        TODO(review): confirm whether every episode of the show is needed.
    """
    url = f"https://api.spotify.com/v1/shows/{podcast_id}/episodes"
    response = get(url, headers=get_auth_headers(token), params={"market": "ID"})
    # Report HTTP failures directly rather than as a KeyError on "items".
    response.raise_for_status()
    episodes = response.json()["items"]

    episode_ids = [episode["id"] for episode in episodes]
    episodes_names = [episode["name"] for episode in episodes]
    episodes_urls = [episode["href"] for episode in episodes]
    return episode_ids, episodes_names, episodes_urls

In [None]:
def get_audio(token, episode_id):
    """Return the ``audio_preview_url`` field of a Spotify episode.

    Args:
        token: Access token passed to ``get_auth_headers``.
        episode_id: Episode ID inserted into the ``/v1/episodes/{id}`` path.

    Returns:
        The value of ``audio_preview_url`` from the episode response.
        NOTE(review): this may be ``None`` for episodes without a preview;
        verify against the Spotify API reference before relying on it.

    Raises:
        requests.HTTPError: If the endpoint returns a 4xx/5xx status.
    """
    url = f"https://api.spotify.com/v1/episodes/{episode_id}"
    response = get(url, headers=get_auth_headers(token), params={"market": "ID"})
    # Report HTTP failures directly rather than as a KeyError on the field below.
    response.raise_for_status()
    return response.json()["audio_preview_url"]

## Fetch Spotify data

In [None]:
# Look up the show, then collect one preview-audio URL per episode.
token = get_token()
result = search_for_podcast(token, "McKinsey on Start-ups")
if result is None:
  # search_for_podcast returns None when nothing matches; stop with a clear
  # message instead of a TypeError on result["id"].
  raise ValueError("Podcast 'McKinsey on Start-ups' was not found on Spotify")
podcast_id = result["id"]
podcasts_name, podcasts_img = get_podcast_by_id(token, podcast_id)
episode_ids, episode_names, episode_urls = get_episodes_by_id(token, podcast_id)
total = len(episode_ids)
# `url` keeps the same order as episode_ids / episode_names.
url = [get_audio(token, episode_id) for episode_id in episode_ids]

In [None]:
episode_names

['Creating a clean water supply from the air and sun',
 'Fueling Mexico’s startup ecosystem with an equity-free helping hand',
 'Base10’s Ade Ajao: A data-driven approach to funding more diverse founders',
 'Investing in transformative tech: EQT Ventures’ long view',
 'What it takes for successful startups to keep growing',
 'The keys to framing a winning investor pitch',
 'Indonesian unicorn Ajaib: Building a nation of new retail investors',
 'Versatile’s Meirav Oren: Building the data-driven future of construction',
 'Plotting an alternative foods revolution',
 'Operator’s manual: QED’s approach to investing in fintech',
 'Bringing Latin American micro-business into the fintech age',
 'How harnessing human data could reinvent drug discovery',
 'The next test for edtech',
 'Getting remote work(ers) off to a good start',
 'How Veho aims to deliver on last-mile logistics',
 'Activate’s Ilan Gur: Bridging the science-to-market gap',
 'The emerging use cases of the metaverse',
 'The emerg

In [None]:
url

['https://podz-content.spotifycdn.com/audio/clips/6kSbVnMBDjaFsIMzRSfs8W/clip_203600_267000.mp3',
 'https://podz-content.spotifycdn.com/audio/clips/25VREkmrMyRyCtEqVaUJ5S/clip_1040000_1104400.mp3',
 'https://podz-content.spotifycdn.com/audio/clips/27w3K9vRaf4aCbMLmKIhbp/clip_1413000_1478800.mp3',
 'https://podz-content.spotifycdn.com/audio/clips/5DYVzinAqCJcZ4bmaKfhJ2/clip_121300_169200.mp3',
 'https://podz-content.spotifycdn.com/audio/clips/6ZKiA2YRXD89Zj88rLL0Ql/clip_1400800_1472100.mp3',
 'https://podz-content.spotifycdn.com/audio/clips/4rL2LuYGR38fnkUk5A3AWX/clip_1382950_1430700.mp3',
 'https://podz-content.spotifycdn.com/audio/clips/1tLjSEdm2MokPHmLA97WJ8/clip_615900_689000.mp3',
 'https://podz-content.spotifycdn.com/audio/clips/60l9YVDxIsNJH9AY6E3zj9/clip_258100_304600.mp3',
 'https://podz-content.spotifycdn.com/audio/clips/4SHjL5Cpi01ZaKwORweGSE/clip_301900_360600.mp3',
 'https://podz-content.spotifycdn.com/audio/clips/4oe1B2Ci5wFUO2faQ3Ctex/clip_633300_690800.mp3',
 'https://po

## Write podcast audio to Colab files

In [None]:
# Download each preview clip to _assets/<index>.mp3, skipping files that already exist.
os.makedirs("_assets", exist_ok=True)
total = len(url)
for i in range(total):
  SPEECH_FILE = "_assets/" + str(i) + ".mp3"
  if os.path.exists(SPEECH_FILE):
    continue
  # Download before opening the file: a failed request must not leave an
  # empty .mp3 behind, because the existence check above would then skip it
  # on every later run.
  response = get(url[i])
  response.raise_for_status()
  with open(SPEECH_FILE, "wb") as file:
    file.write(response.content)

# Audio Transcript

## Install the Transformer-based encoder-decoder model faster_whisper

In [None]:
!pip install faster_whisper

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faster_whisper
  Downloading faster_whisper-0.6.0-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting av==10.* (from faster_whisper)
  Downloading av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.0/31.0 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctranslate2<4,>=3.10 (from faster_whisper)
  Downloading ctranslate2-3.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.7/33.7 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.13 (from faster_whisper)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━

## Transcribe audio to text

In [None]:
from faster_whisper import WhisperModel

# Transcribe every downloaded clip; transcript_text[i] belongs to _assets/<i>.mp3.
model = WhisperModel("large-v2")
transcript_text = []
for i in range(total):
  SPEECH_FILE = "_assets/" + str(i) + ".mp3"
  segments, info = model.transcribe(SPEECH_FILE)
  # Join the segment texts in one pass and drop surrounding whitespace.
  text = "".join(segment.text for segment in segments).strip()
  # Upper-case only the first character. str.capitalize() would also
  # lower-case the rest of the transcript, destroying proper nouns and
  # sentence capitalisation.
  transcript_text.append(text[:1].upper() + text[1:])

Downloading model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Downloading (…)08837e8b/config.json:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

Downloading (…)37e8b/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading (…)37e8b/vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

In [None]:
transcript_text

[" that turns into vapor. so the liquid becomes a gas and then distributes into the kitchen. in the troposphere, the lower part of the atmosphere that we live in, there are one and then 16 zeros, big number, kilograms of water vapor in the air. it's about six times all of earth's rivers at any given time. and the average water molecule stays in the atmosphere for about a week. so you're talking about a massive renewable resource, an atmospheric ocean of water vapor that we live within. the question then becomes, is there a way to very efficiently take those water molecules that are in the vapor form and turn them into liquid anywhere on the planet? we all have familiarity with a glass of iced tea where you get condensation on the outside and liquid water is dripping down in humid places. we also have a familiarity with when you leave the lid off of a sugar bowl and that sugar starts to get clumpy. the water vapor is getting absorbed into the sugar. but we developed a set of engineered 

# Text Summarization

## Install Transformers

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: safetensors, transformers
Successfully installed safetensors-0.3.1 transformers-4.30.2


## Create the T5 and BART summarization models

In [None]:
# Two independent, initially empty lists that will hold the T5 and BART summaries.
t5, bart = [], []

In [None]:
from transformers import pipeline

# One summarization pipeline per checkpoint so both can be run on the same transcripts.
pipeT5 = pipeline(task="summarization", model="t5-large")
pipeBART = pipeline(task="summarization", model="facebook/bart-large-cnn")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## Summarize text

In [None]:
# Start from empty lists in this cell so that re-running it does not append
# duplicate summaries, which would give t5/bart a different length from
# transcript_text.
t5 = []
bart = []
for transcript in transcript_text:
  pipeT5_out = pipeT5(transcript)
  pipeBART_out = pipeBART(transcript)
  # Each pipeline call returns a list; the summary is in its first element.
  t5.append(pipeT5_out[0]["summary_text"])
  bart.append(pipeBART_out[0]["summary_text"])

Your max_length is set to 200, but your input_length is only 193. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 200, but your input_length is only 167. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=83)
Your max_length is set to 200, but your input_length is only 153. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=76)
Your max_length is set to 200, but your input_length is only 176. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=88)


In [None]:
t5

["in the troposphere, there are one and then 16 zeros, big number, kilograms of water vapor in the air . we developed a set of engineered materials that do that same process, just many hundreds of times faster . they're able to take water molecules in the vapor form and turn them into liquid anywhere on the planet .",
 "55 active funds in mexico at this moment, they're around in seed stage, they invest around 1.5 million . the number of new companies that are being built has affected, says lucas . when there's not capital available, they are like, ok, maybe i will start it later, she says .",
 "a lot of these founders look more diverse than your typical san francisco-based firm . doing good and doing well truly are becoming more and more one and the same . we're very excited about that. for us in particular.",
 "tech's long-term prospects remain strong, even if the time frame and cost of money have dramatically changed, writes john sutter . transformative tech is the key to addressing 

In [None]:
bart

["In the troposphere, the lower part of the atmosphere that we live in, there are one and then 16 zeros, kilograms of water vapor in the air. It's about six times all of earth's rivers at any given time. The average water molecule stays in the atmosphere for about a week.",
 "There are 55 active funds in mexico at this moment. They're around in seed stage, they invest around 1.5 million. There's some capital there, but it's going to be hard. These startups have to be more cautious of how they spend their money, that their unit economics work.",
 'The company has a diverse portfolio of founders. The founders look more diverse than your typical san francisco-based firm, silicon valley portfolio. The companies that are doing better are those that are able to show that they are a force for good for the community in the long term.',
 "i truly believe that good companies will excel in this environment, even if there are several bumps along the way, and even if it's painful. remember during t

# Export results to .csv format

In [None]:
import pandas as pd

# Strip double-quote characters from both sets of summaries before storing them.
bart = [summary.replace('"', '') for summary in bart]
t5 = [summary.replace('"', '') for summary in t5]

# Assemble the results table one column at a time, in the final column order.
df = pd.DataFrame()
for column_name, column_values in (
    ('Episode Name', episode_names),
    ('URL', url),
    ('Transcript', transcript_text),
    ('BART', bart),
    ('T5', t5),
):
  df[column_name] = column_values

In [None]:
df

Unnamed: 0,Episode Name,URL,Transcript,BART,T5
0,Creating a clean water supply from the air and...,https://podz-content.spotifycdn.com/audio/clip...,that turns into vapor. so the liquid becomes ...,"In the troposphere, the lower part of the atmo...","in the troposphere, there are one and then 16 ..."
1,Fueling Mexico’s startup ecosystem with an equ...,https://podz-content.spotifycdn.com/audio/clip...,these startups have to be more cautious of ho...,There are 55 active funds in mexico at this mo...,"55 active funds in mexico at this moment, they..."
2,Base10’s Ade Ajao: A data-driven approach to f...,https://podz-content.spotifycdn.com/audio/clip...,and yet we have ended up with a pretty divers...,The company has a diverse portfolio of founder...,a lot of these founders look more diverse than...
3,Investing in transformative tech: EQT Ventures...,https://podz-content.spotifycdn.com/audio/clip...,"knew this was coming, it was really just a ma...",i truly believe that good companies will excel...,"tech's long-term prospects remain strong, even..."
4,What it takes for successful startups to keep ...,https://podz-content.spotifycdn.com/audio/clip...,when each of these micro-questions comes into...,There needs to be a recognition on the part of...,founder and c-suite leaders need to recognize ...
5,The keys to framing a winning investor pitch,https://podz-content.spotifycdn.com/audio/clip...,"so, so that is basically the move that you wa...","Carmel: The fourth stage, carmel, our startup ...",how are you changing the world? this is perhap...
6,Indonesian unicorn Ajaib: Building a nation of...,https://podz-content.spotifycdn.com/audio/clip...,something social and something about their fr...,i would think about how do you empower the sel...,"in thailand, the reason why we adopted mutual ..."
7,Versatile’s Meirav Oren: Building the data-dri...,https://podz-content.spotifycdn.com/audio/clip...,yes. it's not about how you get the data. we ...,We empower the people who build to have contro...,we empower superintendents to be the most info...
8,Plotting an alternative foods revolution,https://podz-content.spotifycdn.com/audio/clip...,"and as you went along the journey, was this s...",When we started over nine and a half years ago...,when we started over nine and a half years ago...
9,Operator’s manual: QED’s approach to investing...,https://podz-content.spotifycdn.com/audio/clip...,"you've got product challenges, operational ch...",A far majority of our investments are regulate...,a far majority of our investments are regulate...


In [None]:
# Write the results table to a CSV file at a relative path (current working directory).
# `index` is not passed, so pandas' default index handling applies to the output.
df.to_csv('McKinsey on Start-ups Summarization.csv')