In [1]:
import sparknlp
# Call Spark NLP's start() helper; the value it returns is not kept here.
sparknlp.start()

In [2]:
from pyspark.sql import SparkSession

# Build (or fetch) the Spark session, configured for Spark NLP.
# NOTE(review): sparknlp.start() already ran in the first cell; if that left
# an active session, getOrCreate() presumably returns it and the settings
# below may not be applied -- confirm which session is actually in use.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .appName("Spark NLP")
    .config("spark.jars.packages",
            "com.johnsnowlabs.nlp:spark-nlp_2.11:2.3.5")
    .getOrCreate()
)

In [3]:
# Spark ML imports.
# NOTE: the `Tokenizer` name imported here is rebound by the later
# `from sparknlp.annotator import (Tokenizer, ...)` cell, so the pipeline
# below uses the Spark NLP tokenizer, not this one.
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
import nltk
# Fetch the NLTK 'stopwords' corpus that the next cell reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nares\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords

# NLTK's English stop words, followed by the extra token 'xxxx'.
eng_stopwords = [*stopwords.words('english'), 'xxxx']

In [5]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline

In [6]:
# Each stage reads the column(s) produced by the previous one:
# text -> document -> token -> normalized -> lemma -> clean_lemma.
# Chains are wrapped in parentheses instead of using backslash continuations,
# so a trailing backslash can no longer splice two statements together.

# Wrap the raw 'text' column into Spark NLP 'document' annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# note normalizer defaults to changing all words to lowercase.
# Use .setLowercase(False) to maintain input case.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# note that lemmatizer needs a dictionary. So I used the pre-trained
# model (note that it defaults to english)
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Drop stop words (case-insensitively) using the list built from NLTK above.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# finisher converts tokens to human-readable output
# setCleanAnnotations(False) is passed so the intermediate annotation columns
# stay in the result next to the finished column.
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [7]:
# Assemble the stages in execution order; each one consumes the output
# column of the stage before it.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
])

In [8]:
# Load the tweets CSV (first line is the header, column types inferred)
# and keep only the 'text' column, which is what the pipeline reads.
df = spark.read.csv('twitter.csv', header='true', inferSchema='true')
data = df.select('text')

# Fit the pipeline on the tweets, then annotate the same DataFrame.
pipeline_model = pipeline.fit(data)
equifax = pipeline_model.transform(data)
equifax.head()

Row(text='Two people killed in fiery Tesla crash with no one driving', document=[Row(annotatorType='document', begin=0, end=57, result='Two people killed in fiery Tesla crash with no one driving', metadata={'sentence': '0'}, embeddings=[])], token=[Row(annotatorType='token', begin=0, end=2, result='Two', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=4, end=9, result='people', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=11, end=16, result='killed', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=18, end=19, result='in', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=21, end=25, result='fiery', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=27, end=31, result='Tesla', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=33, end=37, result='crash', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='toke

In [22]:
# Preview the first annotated rows as a pandas table. `equifax` is
# deliberately NOT rebound to the pandas frame: a later cell still calls
# the Spark method `equifax.withColumn(...)` on it.
equifax.limit(5).toPandas()

Unnamed: 0,text,document,token,normalized,lemma,clean_lemma,finished_clean_lemma
0,Two people killed in fiery Tesla crash with no...,"[(document, 0, 57, Two people killed in fiery ...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ..."
1,,"[(document, 0, -1, , {'sentence': '0'}, [])]",[],[],[],[],[]
2,RT @lorakolodny: Two men dead after Tesla cras...,"[(document, 0, 138, RT @lorakolodny: Two men d...","[(token, 0, 1, RT, {'sentence': '0'}, []), (to...","[(token, 0, 1, rt, {'sentence': '0'}, []), (to...","[(token, 0, 1, rt, {'sentence': '0'}, []), (to...","[(token, 0, 1, rt, {'sentence': '0'}, []), (to...","[rt, lorakolodny, two, man, dead, tesla, crash..."
3,RT @elonmusk: Tesla with Autopilot engaged now...,"[(document, 0, 112, RT @elonmusk: Tesla with A...","[(token, 0, 1, RT, {'sentence': '0'}, []), (to...","[(token, 0, 1, rt, {'sentence': '0'}, []), (to...","[(token, 0, 1, rt, {'sentence': '0'}, []), (to...","[(token, 0, 1, rt, {'sentence': '0'}, []), (to...","[rt, elonmusk, tesla, autopilot, engage, appro..."
4,RT @JadeRhysThomas: Because my Transmasc artis...,"[(document, 0, 139, RT @JadeRhysThomas: Becaus...","[(token, 0, 1, RT, {'sentence': '0'}, []), (to...","[(token, 0, 1, rt, {'sentence': '0'}, []), (to...","[(token, 0, 1, rt, {'sentence': '0'}, []), (to...","[(token, 0, 1, rt, {'sentence': '0'}, []), (to...","[rt, jaderhysthomas, transmasc, artist, friend..."


In [9]:
from pyspark.sql.functions import explode, col

# Add 'exploded_text': one output row per element of the
# 'finished_clean_lemma' array.
lemma_array = col('finished_clean_lemma')
equifax_words = equifax.withColumn('exploded_text', explode(lemma_array))

In [16]:
# List the columns present after adding 'exploded_text'.
equifax_words.columns

['text',
 'document',
 'token',
 'normalized',
 'lemma',
 'clean_lemma',
 'finished_clean_lemma',
 'exploded_text']

In [18]:
# Convert the exploded Spark DataFrame (all columns) to pandas and display it.
equifax_words1 = equifax_words.toPandas()
equifax_words1

Unnamed: 0,text,document,token,normalized,lemma,clean_lemma,finished_clean_lemma,exploded_text
0,Two people killed in fiery Tesla crash with no...,"[(document, 0, 57, Two people killed in fiery ...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ...",two
1,Two people killed in fiery Tesla crash with no...,"[(document, 0, 57, Two people killed in fiery ...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ...",people
2,Two people killed in fiery Tesla crash with no...,"[(document, 0, 57, Two people killed in fiery ...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ...",kill
3,Two people killed in fiery Tesla crash with no...,"[(document, 0, 57, Two people killed in fiery ...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ...",fiery
4,Two people killed in fiery Tesla crash with no...,"[(document, 0, 57, Two people killed in fiery ...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ...",tesla
...,...,...,...,...,...,...,...,...
529,Two people killed in fiery Tesla crash with no...,"[(document, 0, 105, Two people killed in fiery...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ...",crash
530,Two people killed in fiery Tesla crash with no...,"[(document, 0, 105, Two people killed in fiery...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ...",one
531,Two people killed in fiery Tesla crash with no...,"[(document, 0, 105, Two people killed in fiery...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ...",drive
532,Two people killed in fiery Tesla crash with no...,"[(document, 0, 105, Two people killed in fiery...","[(token, 0, 2, Two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[(token, 0, 2, two, {'sentence': '0'}, []), (t...","[two, people, kill, fiery, tesla, crash, one, ...",httpstcojfawp


In [None]:
# 'finished_clean_lemma' is a column of `equifax`, not a Python variable;
# evaluating the bare name raises NameError. Show the column explicitly.
equifax.select('finished_clean_lemma').show(5, truncate=False)

In [11]:
# Number of rows per distinct value of 'exploded_text'.
counts = equifax_words.groupBy('exploded_text').count()

In [12]:
# Bring the per-word counts into pandas and display them.
counts_pd = counts.toPandas()
counts_pd

Unnamed: 0,exploded_text,count
0,art,4
1,hope,5
2,giga,2
3,httpstcotuualcafd,1
4,transaction,1
...,...,...
320,almost,1
321,big,1
322,fiery,7
323,capaci,1


In [13]:
# (rows, columns) of the counts table.
counts_pd.shape

(325, 2)

In [14]:
# Map each value of 'exploded_text' to its count. Pairing the two columns
# directly avoids a per-row `.loc` lookup and does not depend on the frame
# having a default 0..n-1 index.
dict(zip(counts_pd['exploded_text'], counts_pd['count']))

{'art': 4,
 'hope': 5,
 'giga': 2,
 'httpstcotuualcafd': 1,
 'transaction': 1,
 'positive': 1,
 'lmfao': 1,
 'explain': 1,
 'equal': 1,
 'alexroy': 1,
 'amp': 1,
 'film': 1,
 'ready': 1,
 'elonmusk': 5,
 'day': 1,
 'perc': 1,
 'even': 1,
 'happen': 1,
 'joseantonio': 1,
 'two': 9,
 'local': 1,
 'odd': 1,
 'loan': 1,
 'buy': 3,
 'promise': 1,
 'donate': 2,
 'cryptos': 1,
 'rise': 1,
 'localbadde': 1,
 'g': 1,
 'low': 2,
 'safety': 2,
 'teslectrics': 1,
 'venture': 1,
 'heck': 1,
 'hiphopcultnft': 1,
 'capable': 2,
 'month': 1,
 'httpstcoxnrdwyysff': 1,
 'elongateog': 1,
 'hand': 1,
 'neto': 1,
 'commission': 2,
 'new': 4,
 'success': 1,
 'jose': 1,
 'amazon': 1,
 'simultaneously': 1,
 'man': 3,
 'efficiencylast': 1,
 'netflix': 1,
 'vincent': 1,
 'httpstcokeszghrpy': 1,
 'ark': 1,
 'minefox': 1,
 'ppp': 1,
 'rt': 19,
 'stay': 1,
 'auto': 1,
 'chamber': 1,
 'friend': 2,
 'optimistic': 1,
 'httpstcovzhqtgg': 1,
 'rcnne': 1,
 'soon': 1,
 'money': 2,
 'halfbaked': 2,
 'wedding': 1,
 'safe':