# Exercise 2: Advanced Analytics NLP

Install spark-nlp from the command line before running this notebook:

`pip install spark-nlp==1.7.3`

In [1]:
import pandas as pd

# Allow up to 800 characters per cell when pandas renders a DataFrame, so that
# long values are not truncated in the previews below.
pd.options.display.max_colwidth = 800

# Create a spark context that includes a 3rd party jar for NLP

In [2]:
import findspark
import os

# Make the local Spark installation importable from this kernel.
# SPARK_HOME is used when it is set. When it is missing, None is passed so that
# findspark falls back to its own lookup instead of this cell failing with a
# KeyError before findspark gets a chance to search.
findspark.init(os.environ.get('SPARK_HOME'))

In [9]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [4]:
# Build (or reuse) a session whose "spark.jars.packages" setting names the
# Spark NLP package, so the NLP jar is available to the session.
# NOTE(review): the install instruction at the top pins spark-nlp==1.7.3 while
# the jar requested here is 1.8.2 — confirm the version difference is intended.
session_builder = SparkSession.builder.config(
    "spark.jars.packages", "JohnSnowLabs:spark-nlp:1.8.2"
)
spark = session_builder.getOrCreate()
spark

# Read multiple files in a dir as one Dataframe

In [4]:
# The glob makes Spark load every CSV under ./data into one DataFrame.
# Each file's first line is treated as a header, and column types are inferred.
df = (
    spark.read
    .csv(path='./data/*.csv', sep=',', header=True, inferSchema=True)
)

In [6]:
# Show the column names and types of the DataFrame read from ./data/*.csv.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- score: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- removed_by: string (nullable = true)
 |-- total_awards_received: string (nullable = true)
 |-- awarders: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- full_link: string (nullable = true)
 |-- num_comments: string (nullable = true)
 |-- over_18: string (nullable = true)



In [7]:
# Total number of rows across all the CSV files loaded into df.
df.count()

190888

# Deal with Struct type to query subfields 

In [10]:
# Keep just the title and author columns, and drop every post whose title
# contains the "[OC]" marker.
is_oc_title = F.col('title').contains('[OC]')
df_TA = df.select('title', 'author').where(~is_oc_title)
df_TA.limit(5).toPandas()

Unnamed: 0,title,author
0,Wordcloud of trending videos on YouTube in the United States over 2017- 2018,OmarZiada
1,Immunization in India. Source: https://niti.gov.in/content/immunisation,Professional_Napper_
2,How to quickly estimate the impact of players during a basketball game ?,Viziball
3,PhpStorm 2020.3.3 Crack Full Activation Code Latest Version Free,maxpoul
4,3DMark 2.17.7137 Crack + Serial Key 2021 For [Mac/Win] Update,maxpoul


# Try to implement the equivalent of flatMap in dataframes

In [15]:
# Split every title on single spaces, then explode the resulting array so each
# word lands on its own row (the DataFrame counterpart of flatMap).
title_words = df_TA.select(
    F.explode(F.split(F.col('title'), ' ')).alias('word')
)

# Count occurrences per word, most frequent first.
df_TA_wc = title_words.groupBy('word').count().orderBy(F.desc('count'))

df_TA_wc.limit(10).toPandas()

Unnamed: 0,word,count
0,of,39171
1,the,37734
2,in,26497
3,and,17735
4,to,17007
5,a,12895
6,The,11851
7,by,10072
8,for,9568
9,-,9277


In [18]:
# Rename 'title' to 'text'; the annotated schema printed further down lists
# 'text' as the input column.
df_TA = df_TA.withColumnRenamed(existing='title', new='text')
df_TA.columns

['text', 'author']

# Use an NLP library to do Part-of-Speech Tagging

In [16]:
from com.johnsnowlabs.nlp.pretrained.pipeline.en import BasicPipeline as bp

In [11]:
# Run the pretrained basic pipeline over the post titles.
# The cell above renamed 'title' to 'text', so the column passed here must be
# 'text'; 'title' no longer exists in df_TA when the notebook runs top to bottom.
df_annotated = bp.annotate(df_TA, 'text')
df_annotated.printSchema()

root
 |-- text: string (nullable = true)
 |-- author: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |-- normal: array (nullable = true)
 |    |-- element: struct (contains

## Deal with Map type to query subfields

In [12]:
# Preview the raw `pos` column for the first five rows.
pos_preview = df_annotated.select('pos').limit(5)
pos_preview.toPandas()

Unnamed: 0,pos
0,"[(pos, 0, 8, NNP, {'word': 'Wordcloud'}), (pos, 10, 11, IN, {'word': 'of'}), (pos, 13, 20, VBG, {'word': 'trending'}), (pos, 22, 27, NN, {'word': 'videos'}), (pos, 29, 30, IN, {'word': 'on'}), (pos, 32, 38, NNP, {'word': 'YouTube'}), (pos, 40, 41, IN, {'word': 'in'}), (pos, 43, 45, DT, {'word': 'the'}), (pos, 47, 52, NNP, {'word': 'United'}), (pos, 54, 59, NNPS, {'word': 'States'}), (pos, 61, 64, IN, {'word': 'over'})]"
1,"[(pos, 0, 11, NNP, {'word': 'Immunization'}), (pos, 13, 14, IN, {'word': 'in'}), (pos, 16, 20, NNP, {'word': 'India'}), (pos, 23, 28, NNP, {'word': 'Source'}), (pos, 31, 70, NN, {'word': 'httpsnitigovincontentimmunisation'})]"
2,"[(pos, 0, 2, WRB, {'word': 'How'}), (pos, 4, 5, TO, {'word': 'to'}), (pos, 7, 13, RB, {'word': 'quickly'}), (pos, 15, 22, VB, {'word': 'estimate'}), (pos, 24, 26, DT, {'word': 'the'}), (pos, 28, 33, NN, {'word': 'impact'}), (pos, 35, 36, IN, {'word': 'of'}), (pos, 38, 44, NNS, {'word': 'players'}), (pos, 46, 51, IN, {'word': 'during'}), (pos, 53, 53, DT, {'word': 'a'}), (pos, 55, 64, NN, {'word': 'basketball'}), (pos, 66, 69, NN, {'word': 'game'})]"
3,"[(pos, 0, 7, NNP, {'word': 'PhpStorm'}), (pos, 18, 22, NNP, {'word': 'Crack'}), (pos, 24, 27, NNP, {'word': 'Full'}), (pos, 29, 38, NNP, {'word': 'Activation'}), (pos, 40, 43, NNP, {'word': 'Code'}), (pos, 45, 50, NNP, {'word': 'Latest'}), (pos, 52, 58, NNP, {'word': 'Version'}), (pos, 60, 63, NNP, {'word': 'Free'})]"
4,"[(pos, 0, 5, NNP, {'word': 'DMark'}), (pos, 17, 21, NNP, {'word': 'Crack'}), (pos, 23, 23, NN, {'word': '+'}), (pos, 25, 30, NNP, {'word': 'Serial'}), (pos, 32, 34, NNP, {'word': 'Key'}), (pos, 41, 43, IN, {'word': 'For'}), (pos, 46, 52, NNP, {'word': 'MacWin'}), (pos, 55, 60, NNP, {'word': 'Update'})]"


In [13]:
# Dotted paths select the `metadata` and `result` subfields of `pos`, shown
# next to the original text.
pos_fields = ['text', 'pos.metadata', 'pos.result']
df_pos = df_annotated.select(*pos_fields)
df_pos.limit(5).toPandas()

Unnamed: 0,text,metadata,result
0,Wordcloud of trending videos on YouTube in the United States over 2017- 2018,"[{'word': 'Wordcloud'}, {'word': 'of'}, {'word': 'trending'}, {'word': 'videos'}, {'word': 'on'}, {'word': 'YouTube'}, {'word': 'in'}, {'word': 'the'}, {'word': 'United'}, {'word': 'States'}, {'word': 'over'}]","[NNP, IN, VBG, NN, IN, NNP, IN, DT, NNP, NNPS, IN]"
1,Immunization in India. Source: https://niti.gov.in/content/immunisation,"[{'word': 'Immunization'}, {'word': 'in'}, {'word': 'India'}, {'word': 'Source'}, {'word': 'httpsnitigovincontentimmunisation'}]","[NNP, IN, NNP, NNP, NN]"
2,How to quickly estimate the impact of players during a basketball game ?,"[{'word': 'How'}, {'word': 'to'}, {'word': 'quickly'}, {'word': 'estimate'}, {'word': 'the'}, {'word': 'impact'}, {'word': 'of'}, {'word': 'players'}, {'word': 'during'}, {'word': 'a'}, {'word': 'basketball'}, {'word': 'game'}]","[WRB, TO, RB, VB, DT, NN, IN, NNS, IN, DT, NN, NN]"
3,PhpStorm 2020.3.3 Crack Full Activation Code Latest Version Free,"[{'word': 'PhpStorm'}, {'word': 'Crack'}, {'word': 'Full'}, {'word': 'Activation'}, {'word': 'Code'}, {'word': 'Latest'}, {'word': 'Version'}, {'word': 'Free'}]","[NNP, NNP, NNP, NNP, NNP, NNP, NNP, NNP]"
4,3DMark 2.17.7137 Crack + Serial Key 2021 For [Mac/Win] Update,"[{'word': 'DMark'}, {'word': 'Crack'}, {'word': '+'}, {'word': 'Serial'}, {'word': 'Key'}, {'word': 'For'}, {'word': 'MacWin'}, {'word': 'Update'}]","[NNP, NNP, NN, NNP, NNP, IN, NNP, NNP]"


In [14]:
# Explode the `pos` array so that each annotation gets its own row.
df_pos_2 = df_annotated.select(F.explode(F.col('pos')).alias('pos'))
# printSchema() writes the schema itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
df_pos_2.printSchema()
df_pos_2.limit(10).toPandas()

root
 |-- pos: struct (nullable = true)
 |    |-- annotatorType: string (nullable = true)
 |    |-- begin: integer (nullable = false)
 |    |-- end: integer (nullable = false)
 |    |-- result: string (nullable = true)
 |    |-- metadata: map (nullable = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)

None


Unnamed: 0,pos
0,"(pos, 0, 8, NNP, {'word': 'Wordcloud'})"
1,"(pos, 10, 11, IN, {'word': 'of'})"
2,"(pos, 13, 20, VBG, {'word': 'trending'})"
3,"(pos, 22, 27, NN, {'word': 'videos'})"
4,"(pos, 29, 30, IN, {'word': 'on'})"
5,"(pos, 32, 38, NNP, {'word': 'YouTube'})"
6,"(pos, 40, 41, IN, {'word': 'in'})"
7,"(pos, 43, 45, DT, {'word': 'the'})"
8,"(pos, 47, 52, NNP, {'word': 'United'})"
9,"(pos, 54, 59, NNPS, {'word': 'States'})"


## Keep only proper nouns NNP or NNPS

In [15]:
# Keep only the annotations whose tag is NNP or NNPS.
is_proper_noun = F.col('pos.result').isin('NNP', 'NNPS')
df_NNP = df_pos_2.filter(is_proper_noun)
# SQL-string equivalent: df_NNP = df_pos_2.where("pos.result IN ('NNP', 'NNPS')")
df_NNP.limit(10).toPandas()

Unnamed: 0,pos
0,"(pos, 0, 8, NNP, {'word': 'Wordcloud'})"
1,"(pos, 32, 38, NNP, {'word': 'YouTube'})"
2,"(pos, 47, 52, NNP, {'word': 'United'})"
3,"(pos, 54, 59, NNPS, {'word': 'States'})"
4,"(pos, 0, 11, NNP, {'word': 'Immunization'})"
5,"(pos, 16, 20, NNP, {'word': 'India'})"
6,"(pos, 23, 28, NNP, {'word': 'Source'})"
7,"(pos, 0, 7, NNP, {'word': 'PhpStorm'})"
8,"(pos, 18, 22, NNP, {'word': 'Crack'})"
9,"(pos, 24, 27, NNP, {'word': 'Full'})"


## Extract columns from a map in a col

In [16]:
# Look up the 'word' key of the metadata map and pair it with the tag, giving
# two flat columns: word and tag.
df_WordTag = df_NNP.select(
    F.col('pos.metadata').getItem('word').alias('word'),
    F.col('pos.result').alias('tag'),
)
df_WordTag.limit(10).toPandas()

Unnamed: 0,word,tag
0,Wordcloud,NNP
1,YouTube,NNP
2,United,NNP
3,States,NNPS
4,Immunization,NNP
5,India,NNP
6,Source,NNP
7,PhpStorm,NNP
8,Crack,NNP
9,Full,NNP


In [17]:
# Ten most frequent words among the rows kept as NNP/NNPS.
word_counts = df_WordTag.groupBy('word').count()
word_counts.orderBy(F.col('count').desc()).limit(10).toPandas()

Unnamed: 0,word,count
0,US,6128
1,Data,3575
2,World,3140
3,Map,2380
4,COVID,2089
5,Google,1746
6,New,1588
7,States,1558
8,America,1355
9,United,1310
