In [2]:
# a dataset of lyrics from billboard songs, 
# and find the most common words used over time.

import sys

from operator import add
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
import pyspark.sql.functions as f

spark = SparkSession\
  .builder \
  .appName("PythonWordCount") \
  .getOrCreate()


# Read CSV from OVHcloud Object Storage https://github.com/walkerkq/musiclyrics
data = spark.read.format('csv').options(header='true', inferSchema='true') \
  .load('billboard_lyrics_1964-2015.csv') \

print('############ CSV extract:')
data.show()

                                                                                

############ CSV extract:
+----+--------------------+--------------------+----+--------------------+------+
|Rank|                Song|              Artist|Year|              Lyrics|Source|
+----+--------------------+--------------------+----+--------------------+------+
|   1|         wooly bully|sam the sham and ...|1965|sam the sham misc...|     3|
|   2|i cant help mysel...|           four tops|1965| sugar pie honey ...|     1|
|   3|i cant get no sat...|  the rolling stones|1965|                    |     1|
|   4| you were on my mind|             we five|1965| when i woke up t...|     1|
|   5|youve lost that l...|the righteous bro...|1965| you never close ...|     1|
|   6|            downtown|        petula clark|1965| when youre alone...|     1|
|   7|                help|         the beatles|1965|help i need someb...|     3|
|   8|cant you hear my ...|     hermans hermits|1965|carterlewis every...|     5|
|   9|crying in the chapel|       elvis presley|1965| you saw me cryin..

In [4]:
# Count and group word frequencies on the column Lyrics, when splitted by space comma
# The explode() function will take the data inside the Lyrics column, 
# and separate all the data found based on the separator “whitespace character”.

data.withColumn('word', f.explode(f.split(f.col('Lyrics'), ' '))) \
  .groupBy('word') \
  .count() \
  .sort('count', ascending=False) \
  .show()

[Stage 6:>                                                          (0 + 2) / 2]

+----+-----+
|word|count|
+----+-----+
| you|64606|
|   i|56466|
| the|53451|
|  to|35752|
| and|32555|
|  me|31170|
|   a|29282|
|  it|25688|
|  my|22821|
|  in|18553|
|that|16151|
|  on|15814|
|your|15459|
|love|15283|
|  im|14278|
|  be|13004|
|  of|12825|
|    |12266|
| all|11895|
|dont|11587|
+----+-----+
only showing top 20 rows



                                                                                

In [5]:
# To remove stop words (like "I", "The", ...), we need to provide arrays of words, not strings. Here we use APache Spark Tokenizer to do so.
# We create a new column to push our arrays of words
tokenizer = Tokenizer(inputCol="Lyrics", outputCol="words_token")
tokenized = tokenizer.transform(data).select('Rank','words_token')

print('############ Tokenized data extract:')
tokenized.show()

############ Tokenized data extract:
+----+--------------------+
|Rank|         words_token|
+----+--------------------+
|   1|[sam, the, sham, ...|
|   2|[, sugar, pie, ho...|
|   3|                  []|
|   4|[, when, i, woke,...|
|   5|[, you, never, cl...|
|   6|[, when, youre, a...|
|   7|[help, i, need, s...|
|   8|[carterlewis, eve...|
|   9|[, you, saw, me, ...|
|  10|[ive, got, sunshi...|
|  11|[well, since, she...|
|  12|[, trailer, for, ...|
|  13|[let, me, tell, y...|
|  14|[, hold, me, hold...|
|  15|[i, said, ̢shotgu...|
|  16|[they, say, were,...|
|  17|[who, wants, to, ...|
|  18|      [instrumental]|
|  19|[, mrs, brown, yo...|
|  20|[, stop, in, the,...|
+----+--------------------+
only showing top 20 rows



In [6]:
# Once in arrays, we can use the Apache Spark function StopWordsRemover
# A new column "words_clean" is here as an output
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
data_clean = remover.transform(tokenized).select('Rank', 'words_clean')

print('############ Data Cleaning extract:')
data_clean.show()

############ Data Cleaning extract:
+----+--------------------+
|Rank|         words_clean|
+----+--------------------+
|   1|[sam, sham, misce...|
|   2|[, sugar, pie, ho...|
|   3|                  []|
|   4|[, woke, morning,...|
|   5|[, never, close, ...|
|   6|[, youre, alone, ...|
|   7|[help, need, some...|
|   8|[carterlewis, eve...|
|   9|[, saw, crying, c...|
|  10|[ive, got, sunshi...|
|  11|[well, since, put...|
|  12|[, trailer, sale,...|
|  13|[let, tell, ya, b...|
|  14|[, hold, hold, ne...|
|  15|[said, ̢shotgun, ...|
|  16|[say, young, dont...|
|  17|[wants, buy, diam...|
|  18|      [instrumental]|
|  19|[, mrs, brown, yo...|
|  20|[, stop, name, lo...|
+----+--------------------+
only showing top 20 rows



In [7]:
# Final step : like in the beginning, we can group again words and sort them by the most used
result = data_clean.withColumn('word', f.explode(f.col('words_clean'))) \
  .groupBy('word') \
  .count().sort('count', ascending=False) \

print('############ TOP20 Most used words in Billboard songs are:')
result.show()

# Stop Spark Process
spark.stop()

############ TOP20 Most used words in Billboard songs are:


                                                                                

+-----+-----+
| word|count|
+-----+-----+
| love|15283|
|   im|14278|
| dont|11587|
| know|11166|
| like|10949|
|   oh| 9736|
| baby| 9098|
|  got| 8289|
|  get| 8265|
|     | 7982|
|youre| 6592|
| yeah| 6259|
| want| 6214|
|   go| 6105|
| make| 5520|
|  one| 5412|
| cant| 5338|
|  see| 5264|
| time| 5176|
|  let| 4927|
+-----+-----+
only showing top 20 rows



In [None]:
#  if we analyze the lyrics of most famous songs during the last 50 years, the word “love” comes in first position with 15283 occurences. 
# Followed by “i’m”, “don’t”, “know”, and so on.

In [11]:
# Scrapped lyrics from 6 genres, 
# and find the most common words used over time.


import sys

from operator import add
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
import pyspark.sql.functions as f

spark = SparkSession\
  .builder \
  .appName("PythonWordCount") \
  .getOrCreate()


# Read CSV 
data = spark.read.format('csv').options(header='true', inferSchema='true') \
  .load('lyrics-data.csv') \

print('############ CSV extract:')
data.show()

                                                                                

############ CSV extract:
+---------------+--------------------+--------------------+--------------------+--------------------+
|          ALink|               SName|               SLink|               Lyric|               Idiom|
+---------------+--------------------+--------------------+--------------------+--------------------+
|/10000-maniacs/|      More Than This|/10000-maniacs/mo...|I could feel at t...|             ENGLISH|
|/10000-maniacs/|   Because The Night|/10000-maniacs/be...|Take me now, baby...|             ENGLISH|
|/10000-maniacs/|      These Are Days|/10000-maniacs/th...|These are. These ...|             ENGLISH|
|/10000-maniacs/|     A Campfire Song|/10000-maniacs/a-...|"A lie to say, ""...| ""O my. river wh...|
|/10000-maniacs/|Everyday Is Like ...|/10000-maniacs/ev...|Trudging slowly o...|             ENGLISH|
|/10000-maniacs/|          Don't Talk|/10000-maniacs/do...|Don't talk, I wil...|             ENGLISH|
|/10000-maniacs/|   Across The Fields|/10000-maniacs/ac.

In [13]:
data.withColumn('word', f.explode(f.split(f.col('Lyric'), ' '))) \
  .groupBy('word') \
  .count() \
  .sort('count', ascending=False) \
  .show()



+----+-------+
|word|  count|
+----+-------+
|   I|1016667|
| the| 981725|
|   a| 801613|
| you| 732151|
|  to| 590347|
|  me| 491956|
| que| 423262|
|  my| 375483|
| and| 361048|
|  de| 324536|
|  in| 313670|
|   o| 300001|
|  it| 296355|
| I'm| 281620|
|your| 263533|
|  of| 252005|
| And| 249278|
|  eu| 232985|
|that| 226913|
|  on| 225446|
+----+-------+
only showing top 20 rows



                                                                                

In [16]:
tokenizer = Tokenizer(inputCol="Lyric", outputCol="words_token")
tokenized = tokenizer.transform(data).select('words_token')

print('############ Tokenized data extract:')
tokenized.show()

############ Tokenized data extract:
+--------------------+
|         words_token|
+--------------------+
|[i, could, feel, ...|
|[take, me, now,, ...|
|[these, are., the...|
|["a, lie, to, say...|
|[trudging, slowly...|
|[don't, talk,, i,...|
|[well, they, left...|
|[[, music:, denni...|
|[on, bended, knee...|
|[for, whom, do, t...|
|[she, walks, alon...|
|[jenny., jenny, y...|
|[you, were, looki...|
|["the, color, of,...|
|["oh,., baby, bla...|
|["the, legs, of, ...|
|[steep, is, the, ...|
|[[original, versi...|
|[that, summer, fi...|
|[[, music:, denni...|
+--------------------+
only showing top 20 rows



In [18]:
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
data_clean = remover.transform(tokenized).select('words_clean')

print('############ Data Cleaning extract:')
data_clean.show()

############ Data Cleaning extract:
+--------------------+
|         words_clean|
+--------------------+
|[feel, time., way...|
|[take, now,, baby...|
|[are., days, reme...|
|["a, lie, say,, "...|
|[trudging, slowly...|
|[talk,, listen., ...|
|[well, left, morn...|
|[[, music:, denni...|
|[bended, kneei've...|
|[bells, toll., se...|
|[walks, alone, br...|
|[jenny., jenny, k...|
|[looking, away, m...|
|["the, color, sky...|
|["oh,., baby, bla...|
|["the, legs, madd...|
|[steep, water, to...|
|[[original, versi...|
|[summer, fields, ...|
|[[, music:, denni...|
+--------------------+
only showing top 20 rows

