In [None]:
#! pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
#! python -m spacy validate
#! python -m spacy download en_core_web_sm
# pip install spark-nlp

In [1]:
# Stretch the notebook's `.container` element to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import sys
import sparknlp
from nltk.corpus import stopwords

from nltk.corpus import stopwords
# Start from NLTK's English stop-word list and append extra filler words and
# outlet names ('cnn', 'fox', 'breitbart') that should be removed as well.
# Several entries are spelled without apostrophes ('dont', 'didnt', 'thats',
# 'theres') -- presumably to match tokens after punctuation has been stripped
# by the normalizer; TODO confirm.
# Each extra word is listed exactly once.
stop_words = stopwords.words('english')
stop_words.extend([
    'get', 'from', 'subject', 're', 'edu', 'use', 'could', 'also', 'would',
    'maybe', 'still', 'say', 'go', 'be', 's', 'like', 'dont', 'dent', 'kind',
    'didnt', 'went', 'wanted', 'way', 'says', 'think', 'said', 'thats',
    'thing', 'going', 'things', 'u', 'theres', 'cnn', 'fox', 'breitbart',
])

# import spacy
# nlp = spacy.load("en_core_web_sm")

#import sys, glob, os
#sys.path.extend(glob.glob(os.path.join(os.path.expanduser("~"), ".ivy2/jars/*.jar")))


from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline

In [3]:
# Create the session object through Spark NLP; `spark` is used below for
# reading the CSV (`spark.read`) and running SQL (`spark.sql`).
spark = sparknlp.start()

In [4]:
# Relative path of the input CSV; read by both the Spark and pandas sections.
data = r'./data/data.csv'

In [5]:
# Load the CSV at `data` into a Spark DataFrame: UTF-8 encoded, first row is
# the header, column types inferred from the data.
df = (
    spark.read
    .format('csv')
    .option("encoding", "UTF-8")
    .option("inferSchema", 'true')
    .option("header", 'true')
    .load(data)
)

In [6]:
# Expose `df` to Spark SQL under the name 'articles' (queried in the next cell).
df.createOrReplaceTempView('articles')

In [7]:
# Project the nine listed columns from the 'articles' view and discard rows
# whose `article` value is NULL.
non_null_articles_query = '''SELECT date,
                         year,
                         month,
                         day,
                         author,
                         title,
                         article,
                         url,
                         publication
                   FROM articles 
                   WHERE article IS NOT NULL'''
df = spark.sql(non_null_articles_query)

In [8]:
# Placeholder values substituted for missing entries, keyed by column name.
# `article` is not listed: rows without it were already filtered out.
missing_value_defaults = {
    'date': '1970-01-01 00:00:00',
    'year': 1970,
    'month': 1.0,
    'day': 1.0,
    'author': 'missing',
    'title': 'missing',
    'url': 'missing',
    'publication': 'missing',
}
df = df.fillna(missing_value_defaults)

In [9]:
# Re-register the view so 'articles' now refers to the filtered, filled `df`.
df.createOrReplaceTempView('articles')

In [10]:
# Keep only rows from the 'articles' view whose year lies in [2000, 2021).
year_range_query = '''
                    SELECT *
                    FROM articles
                    WHERE year >= 2000
                    AND year <2021

               '''
df = spark.sql(year_range_query)

In [11]:
# Stage 1: wrap the raw `article` text in a `document` annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('article')
    .setOutputCol('document')
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Stage 3: normalize tokens. Lowercasing is requested explicitly here;
# pass False to .setLowercase to keep the input case instead.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# Stage 4: lemmatize. The lemmatizer needs a dictionary, so the pre-trained
# model is loaded (per the original author's note it defaults to English).
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Stage 5: drop every lemma found in `stop_words`, ignoring case.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(stop_words)
)

# Stage 6: the finisher turns the annotations into human-readable output;
# the intermediate annotation columns are kept (cleanAnnotations=False).
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [12]:
# Preprocessing stages, in execution order: each stage reads the output
# column written by the one before it.
preprocessing_stages = [
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
]
pipeline = Pipeline().setStages(preprocessing_stages)

In [13]:
# Fit the pipeline on the filtered articles; `pipe` is the fitted pipeline.
pipe = pipeline.fit(df)

In [14]:
# Run every stage over `df`; the result includes the `finished_clean_lemma`
# column that is queried further down.
clean = pipe.transform(df)

In [None]:
from pyspark.sql.functions import explode, col

# Explode the `finished_clean_lemma` column of `clean`, so each of its
# elements becomes a separate row in a new `exploded_text` column.
lemma_column = col('finished_clean_lemma')
words = clean.withColumn('exploded_text', explode(lemma_column))

In [17]:
# Expose the transformed DataFrame to Spark SQL under the name 'clean'.
clean.createOrReplaceTempView('clean')

In [20]:
# Sanity check: pull a single row (year plus cleaned lemmas) from the 'clean'
# view and convert it to pandas so the notebook renders it as a table.
spark.sql('select year, finished_clean_lemma from clean limit 1').toPandas()

Unnamed: 0,year,finished_clean_lemma
0,2016,"[post, part, polyarchy, independent, blog, pro..."


# pandas: select columns, fill missing values, export to CSV

In [None]:
import pandas as pd

In [None]:
# Re-read the same CSV with pandas.
# NOTE(review): this rebinds `df`, which up to this point referred to the
# Spark DataFrame -- re-running earlier Spark cells after this one will
# operate on the pandas frame instead.
df = pd.read_csv(data, low_memory=False)

In [None]:
# Columns to keep -- the same nine selected in the Spark SQL projection.
selected_columns = [
    'date',
    'year',
    'month',
    'day',
    'author',
    'title',
    'article',
    'url',
    'publication',
]
df = df.loc[:, selected_columns]

In [None]:
# Rows are dropped only when one of these columns is missing.
required_columns = ['article']
df = df.dropna(subset=required_columns)

In [None]:
# Fill missing metadata with placeholder values in a single call, using a
# column -> default mapping (the same defaults as the Spark `fillna` cell
# earlier in this notebook). This replaces eight attribute-style column
# assignments (`df.date = ...`); attribute access can clash with DataFrame
# method names, and a single `fillna` yields a new frame rather than mutating
# `df` column by column.
df = df.fillna({
    'date': '1970-01-01 00:00:00',
    'year': 1970,
    'month': 1.0,
    'day': 1.0,
    'author': 'missing',
    'title': 'missing',
    'url': 'missing',
    'publication': 'missing',
})

In [None]:
# Persist the cleaned pandas frame next to the input file.
# NOTE(review): `index` is left at its default, so the row index is written
# as an extra unnamed first column -- confirm that is intended.
df.to_csv(path_or_buf=r'./data/pandas.csv')

In [None]:
!pip install nltk