In [2]:
import sys

from operator import add
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
import pyspark.sql.functions as f


spark = SparkSession.builder.appName("PythonWordCount").getOrCreate()


# Read CSV from OVHcloud Object Storage https://github.com/walkerkq/musiclyrics
data = spark.read.format('csv').options(header='true', inferSchema='true').load('billboard_lyrics_1964-2015.csv')

print('############ CSV extract:')
data.show()

############ CSV extract:
+----+--------------------+--------------------+----+--------------------+------+
|Rank|                Song|              Artist|Year|              Lyrics|Source|
+----+--------------------+--------------------+----+--------------------+------+
|   1|         wooly bully|sam the sham and ...|1965|sam the sham misc...|     3|
|   2|i cant help mysel...|           four tops|1965| sugar pie honey ...|     1|
|   3|i cant get no sat...|  the rolling stones|1965|                    |     1|
|   4| you were on my mind|             we five|1965| when i woke up t...|     1|
|   5|youve lost that l...|the righteous bro...|1965| you never close ...|     1|
|   6|            downtown|        petula clark|1965| when youre alone...|     1|
|   7|                help|         the beatles|1965|help i need someb...|     3|
|   8|cant you hear my ...|     hermans hermits|1965|carterlewis every...|     5|
|   9|crying in the chapel|       elvis presley|1965| you saw me cryin..

In [3]:
# Count and group word frequencies on the column Lyrics, when splitted by space comma
data.withColumn('word', f.explode(f.split(f.col('Lyrics'), ' '))) \
  .groupBy('word') \
  .count() \
  .sort('count', ascending=False) \
  .show()

+----+-----+
|word|count|
+----+-----+
| you|64606|
|   i|56466|
| the|53451|
|  to|35752|
| and|32555|
|  me|31170|
|   a|29282|
|  it|25688|
|  my|22821|
|  in|18553|
|that|16151|
|  on|15814|
|your|15459|
|love|15283|
|  im|14278|
|  be|13004|
|  of|12825|
|    |12266|
| all|11895|
|dont|11587|
+----+-----+
only showing top 20 rows



In [4]:
# To remove stop words (like "I", "The", ...), we need to provide arrays of words, not strings. Here we use APache Spark Tokenizer to do so.
# We create a new column to push our arrays of words
tokenizer = Tokenizer(inputCol="Lyrics", outputCol="words_token")
tokenized = tokenizer.transform(data).select('Rank','words_token')

print('############ Tokenized data extract:')
tokenized.show()


# Once in arrays, we can use the Apache Spark function StopWordsRemover
# A new column "words_clean" is here as an output
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
data_clean = remover.transform(tokenized).select('Rank', 'words_clean')

print('############ Data Cleaning extract:')
data_clean.show()


# Final step : like in the beginning, we can group again words and sort them by the most used
result = data_clean.withColumn('word', f.explode(f.col('words_clean'))) \
  .groupBy('word') \
  .count().sort('count', ascending=False) \

print('############ TOP20 Most used words in Billboard songs are:')
result.show()

# Stop Spark Process
spark.stop()

############ Tokenized data extract:
+----+--------------------+
|Rank|         words_token|
+----+--------------------+
|   1|[sam, the, sham, ...|
|   2|[, sugar, pie, ho...|
|   3|                  []|
|   4|[, when, i, woke,...|
|   5|[, you, never, cl...|
|   6|[, when, youre, a...|
|   7|[help, i, need, s...|
|   8|[carterlewis, eve...|
|   9|[, you, saw, me, ...|
|  10|[ive, got, sunshi...|
|  11|[well, since, she...|
|  12|[, trailer, for, ...|
|  13|[let, me, tell, y...|
|  14|[, hold, me, hold...|
|  15|[i, said, ̢shotgu...|
|  16|[they, say, were,...|
|  17|[who, wants, to, ...|
|  18|      [instrumental]|
|  19|[, mrs, brown, yo...|
|  20|[, stop, in, the,...|
+----+--------------------+
only showing top 20 rows

############ Data Cleaning extract:
+----+--------------------+
|Rank|         words_clean|
+----+--------------------+
|   1|[sam, sham, misce...|
|   2|[, sugar, pie, ho...|
|   3|                  []|
|   4|[, woke, morning,...|
|   5|[, never, close, ...|
|   6