In [None]:
# Environment setup commands, kept commented out: a pinned spaCy English model
# (install / validate / download) and the Spark NLP package.
#! pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
#! python -m spacy validate
#! python -m spacy download en_core_web_sm
# pip install spark-nlp

In [None]:
# Stretch the notebook's `.container` element to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated and emits a warning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [1]:
import os
import sys
import sparknlp
from nltk.corpus import stopwords

# Stop-word list handed to the StopWordsCleaner stage further down: NLTK's
# English stop words plus project-specific additions (filler words, common
# verbs and the outlet names). `stopwords` is already imported at the top of
# this cell, so it is not imported a second time here, and each custom word is
# listed once.
stop_words = stopwords.words('english')
stop_words.extend([
    'get', 'from', 'subject', 're', 'edu', 'use', 'could', 'also', 'would',
    'maybe', 'still', 'say', 'go', 'be', 's', 'like', 'dont', 'dent', 'kind',
    'didnt', 'went', 'wanted', 'way', 'says', 'think', 'said', 'thats',
    'thing', 'going', 'things', 'u', 'theres', 'cnn', 'fox', 'breitbart',
])

# import spacy
# nlp = spacy.load("en_core_web_sm")

#import sys, glob, os
#sys.path.extend(glob.glob(os.path.join(os.path.expanduser("~"), ".ivy2/jars/*.jar")))


from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline

In [2]:
# Set-based collection of NLTK's English stop words.
stopWords = set()
stopWords.update(stopwords.words('english'))

In [6]:
# Custom stop words merged into the `stopWords` set.
# NOTE(review): this repeats the literal passed to `stop_words.extend(...)` in
# the imports cell.
adddd = [
    'get', 'from', 'subject', 're', 'edu', 'use', 'could', 'also', 'would',
    'maybe', 'still', 'say', 'go', 'be', 's', 'like', 'dont', 'dent', 'kind',
    'maybe', 'didnt', 'went', 'wanted', 'way', 'says', 'think', 'said',
    'thats', 'thing', 'going', 'things', 'u', 'theres', 'cnn', 'fox',
    'breitbart',
]

# A set ignores repeats, so a bulk update gives the same result as adding
# the words one at a time.
stopWords.update(adddd)
    

In [7]:
# Show the combined stop-word set (NLTK English list plus the custom words).
# NOTE(review): the StopWordsCleaner stage further down is given `stop_words`
# (the list), not this set.
stopWords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'also',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'breitbart',
 'but',
 'by',
 'can',
 'cnn',
 'could',
 'couldn',
 "couldn't",
 'd',
 'dent',
 'did',
 'didn',
 "didn't",
 'didnt',
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'dont',
 'down',
 'during',
 'each',
 'edu',
 'few',
 'for',
 'fox',
 'from',
 'further',
 'get',
 'go',
 'going',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'kind',
 'like',
 'll',
 'm',
 'ma',
 'maybe',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',


In [None]:
# Start the Spark session through Spark NLP's helper; `spark` is the session
# used by the read and SQL cells below.
spark = sparknlp.start()

In [None]:
# Relative path of the input CSV; read by both the Spark and the pandas sections.
data = r'./data/data.csv'

In [None]:
# Load the articles CSV as UTF-8, taking column names from the header row and
# letting Spark infer the column types.
# NOTE(review): no multiLine/escape options are set; if article bodies contain
# embedded newlines or quotes the rows may be split incorrectly - confirm
# against the data.
csv_reader = (
    spark.read.format('csv')
    .option("encoding", "UTF-8")
    .option("inferSchema", 'true')
    .option("header", 'true')
)
df = csv_reader.load(data)

In [None]:
# Register the DataFrame as the temp view 'articles' so it can be queried via spark.sql.
df.createOrReplaceTempView('articles')

In [None]:
# Keep only the nine columns used downstream and drop rows that have no
# article text. The query reads from the 'articles' temp view.
articles_sql = '''SELECT date,
                         year,
                         month,
                         day,
                         author,
                         title,
                         article,
                         url,
                         publication
                   FROM articles 
                   WHERE article IS NOT NULL'''
df = spark.sql(articles_sql)

In [None]:
# Per-column placeholders for missing values. `article` is not listed because
# rows with a null article were already removed by the preceding query.
# NOTE(review): Spark applies a fill value only where its type suits the
# column; confirm the inferred types of `date`, `year`, `month` and `day`.
null_defaults = {
    'date': '1970-01-01 00:00:00',
    'year': 1970,
    'month': 1.0,
    'day': 1.0,
    'author': 'missing',
    'title': 'missing',
    'url': 'missing',
    'publication': 'missing',
}
df = df.fillna(null_defaults)

In [None]:
# Re-register 'articles' so the view now points at the column-selected,
# null-filled DataFrame.
df.createOrReplaceTempView('articles')

In [None]:
# Restrict the articles to years 2000 through 2020 inclusive. Rows whose
# missing year was filled with the 1970 placeholder fall outside this range
# and are dropped as well.
year_range_sql = '''
                    SELECT *
                    FROM articles
                    WHERE year >= 2000
                    AND year <2021

               '''
df = spark.sql(year_range_sql)

In [None]:
# Stage 1 - wrap the raw `article` column into a `document` annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('article')
    .setOutputCol('document')
)

# Stage 2 - split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Stage 3 - normalize the tokens. Lower-casing is switched on explicitly;
# pass False to setLowercase to keep the input case.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# Stage 4 - lemmatize the normalized tokens. A lemmatizer needs a dictionary,
# so the pretrained model is loaded (author's note: it defaults to English).
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Stage 5 - remove stop words from the lemmas, ignoring case, using the
# `stop_words` list built in the imports cell.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(stop_words)
)

# Stage 6 - the finisher converts the cleaned lemmas into human-readable
# output; annotation clean-up is turned off.
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

In [None]:
# Assemble the stages in execution order: each stage consumes the output
# column of the one before it.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
])

In [None]:
# Fit the pipeline on the filtered articles; `pipe` is the fitted model used
# for the transform below.
pipe = pipeline.fit(df)

In [None]:
# Run the fitted pipeline over the same DataFrame to produce the processed output.
clean = pipe.transform(df)

In [None]:
# Expose the transformed DataFrame to SQL as the temp view 'clean'.
clean.createOrReplaceTempView('clean')

In [None]:
# Pull a single row into pandas to inspect the finished lemmas next to the year.
spark.sql('select year, finished_clean_lemma from clean limit 1').toPandas()

# pandas

In [None]:
import pandas as pd

In [None]:
# Reload the same CSV with pandas.
# NOTE: this rebinds `df`, which held the Spark DataFrame in the cells above.
df = pd.read_csv(data, low_memory=False)

In [None]:
# Keep the same nine columns that the Spark query selects, in the same order.
article_columns = [
    'date',
    'year',
    'month',
    'day',
    'author',
    'title',
    'article',
    'url',
    'publication',
]
df = df.loc[:, article_columns]

In [None]:
# Drop rows without article text (pandas counterpart of the
# `WHERE article IS NOT NULL` filter in the Spark section).
df = df.dropna(subset=['article'])

In [None]:
# Placeholder values for missing entries, matching the ones used in the Spark
# section. Each column is filled and assigned back one at a time.
pandas_null_defaults = {
    'date': '1970-01-01 00:00:00',
    'year': 1970,
    'month': 1.0,
    'day': 1.0,
    'author': 'missing',
    'title': 'missing',
    'url': 'missing',
    'publication': 'missing',
}
for fill_column, fill_value in pandas_null_defaults.items():
    df[fill_column] = df[fill_column].fillna(fill_value)

In [None]:
# Write the cleaned frame next to the input file.
# NOTE(review): to_csv also writes the row index as an extra leading column by
# default; pass index=False if that is not wanted - confirm with whatever
# reads this file.
df.to_csv(r'./data/pandas.csv')

In [None]:
!pip install nltk

In [8]:
!pip freeze

argon2-cffi @ file:///tmp/build/80754af9/argon2-cffi_1596828496740/work
async-generator==1.10
attrs @ file:///tmp/build/80754af9/attrs_1600298409949/work
backcall==0.2.0
bleach @ file:///tmp/build/80754af9/bleach_1600439572647/work
blis==0.4.1
botocore==1.13.17
catalogue==1.0.0
certifi==2020.6.20
cffi @ file:///tmp/build/80754af9/cffi_1600699165083/work
chardet==3.0.4
click==7.1.2
colorama==0.4.1
cymem==2.0.4
decorator==4.4.2
defusedxml==0.6.0
docutils==0.15.2
en-core-web-sm==2.2.0
entrypoints==0.3
idna==2.10
importlib-metadata==2.0.0
ipykernel @ file:///tmp/build/80754af9/ipykernel_1596206602906/work/dist/ipykernel-5.3.4-py3-none-any.whl
ipython @ file:///tmp/build/80754af9/ipython_1593447367857/work
ipython-genutils==0.2.0
ipywidgets @ file:///tmp/build/80754af9/ipywidgets_1601490159889/work
jedi @ file:///tmp/build/80754af9/jedi_1598371618777/work
Jinja2==2.11.2
jmespath==0.9.4
joblib==0.17.0
jsonschema @ file:///tmp/build/80754af9/jsonschema_16026071554