In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import json

from pyspark import SparkContext
from pyspark import sql
from pyspark.sql import SQLContext

from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Create spark session
spark = SparkSession\
    .builder\
    .master("local")\
    .appName("word_count_jsonl")\
    .config("spark.some.config.option", "some-value")\
    .getOrCreate()

In [3]:
# declare data file path for read in next line
path = 'c:/users/prayt/mini_pyspark_projects/article_samples.json.gz'
# read jsonl file into spark dataframe
json_df = spark.read.json(path)
# show spark dataframe schema
json_df.printSchema()
# observe each section_texts cell is a list of text sections

root
 |-- section_texts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- section_titles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)



In [4]:
# collapse list of strings for each article to one long text string for each article 
section_texts_rdd =  json_df.select('section_texts').rdd.map(lambda x: ' '.join(x[0])) 

section_texts_rdd.collect()[20]

'\n\n\n\n\n\'\'\'Anthropology\'\'\' is the study of humans and human behavior and societies in the past and present. Social anthropology and cultural anthropology study the norms and values of societies. Linguistic anthropology studies how language affects social life. Biological or physical anthropology studies the biological development of humans.\n\nArchaeology, which studies past human cultures through investigation of physical evidence, is thought of as a branch of anthropology in the United States and Canada, while in Europe, it is viewed as a discipline in its own right or grouped under other related disciplines, such as history.\n Bernardino de Sahagún is considered to be the founder of modern anthropology.\n\nThe abstract noun \'\'anthropology\'\' is first attested in reference to history. Its present use first appeared in Renaissance Germany in the works of Magnus Hundt and Otto Casmann. Their New Latin \'\'\'\' derived from the combining forms of the Greek words \'\'ánthrōpo

In [6]:
# declare tokenizer
tokenizer = RegexpTokenizer(r'\w+')
# tokenize each article to create series of individual words for each article
tokenized = section_texts_rdd.map(lambda x: tokenizer.tokenize(x))

print(len(tokenized.collect()))

1000


In [8]:
# importing itemgetter so we can look at multiple word lists in the dataframe
from operator import itemgetter
# itemgetter calls multiple items from a list, tokenized_sections.collect() is the list we are calling from
# we could just use take or another collect statement, but itemgetter is a nice function to know for accessing multiple
# list items simultaneously
itemgetter(1,4)(tokenized.collect())


(['Autism',
  'is',
  'a',
  'developmental',
  'disorder',
  'characterized',
  'by',
  'difficulties',
  'with',
  'social',
  'interaction',
  'and',
  'communication',
  'and',
  'by',
  'restricted',
  'and',
  'repetitive',
  'behavior',
  'Parents',
  'usually',
  'notice',
  'signs',
  'during',
  'the',
  'first',
  'three',
  'years',
  'of',
  'their',
  'child',
  's',
  'life',
  'These',
  'signs',
  'often',
  'develop',
  'gradually',
  'though',
  'some',
  'children',
  'with',
  'autism',
  'reach',
  'their',
  'developmental',
  'milestones',
  'at',
  'a',
  'normal',
  'pace',
  'before',
  'worsening',
  'Autism',
  'is',
  'associated',
  'with',
  'a',
  'combination',
  'of',
  'genetic',
  'and',
  'environmental',
  'factors',
  'Risk',
  'factors',
  'during',
  'pregnancy',
  'include',
  'certain',
  'infections',
  'such',
  'as',
  'rubella',
  'and',
  'toxins',
  'including',
  'valproic',
  'acid',
  'alcohol',
  'cocaine',
  'pesticides',
  'and',


In [9]:
# reduce words to 'root' word and make each word lower case for word counting later
lemmatizer = WordNetLemmatizer()
lemmatized = tokenized.map(lambda x: [lemmatizer.lemmatize(word).lower() for word in x])

In [10]:
lemmatized.collect()[1]

['autism',
 'is',
 'a',
 'developmental',
 'disorder',
 'characterized',
 'by',
 'difficulty',
 'with',
 'social',
 'interaction',
 'and',
 'communication',
 'and',
 'by',
 'restricted',
 'and',
 'repetitive',
 'behavior',
 'parents',
 'usually',
 'notice',
 'sign',
 'during',
 'the',
 'first',
 'three',
 'year',
 'of',
 'their',
 'child',
 's',
 'life',
 'these',
 'sign',
 'often',
 'develop',
 'gradually',
 'though',
 'some',
 'child',
 'with',
 'autism',
 'reach',
 'their',
 'developmental',
 'milestone',
 'at',
 'a',
 'normal',
 'pace',
 'before',
 'worsening',
 'autism',
 'is',
 'associated',
 'with',
 'a',
 'combination',
 'of',
 'genetic',
 'and',
 'environmental',
 'factor',
 'risk',
 'factor',
 'during',
 'pregnancy',
 'include',
 'certain',
 'infection',
 'such',
 'a',
 'rubella',
 'and',
 'toxin',
 'including',
 'valproic',
 'acid',
 'alcohol',
 'cocaine',
 'pesticide',
 'and',
 'air',
 'pollution',
 'controversies',
 'surround',
 'other',
 'proposed',
 'environmental',
 'ca

In [11]:
tupled = lemmatized.map(lambda x: [(word,1) for word in x])

In [12]:
tupled.collect()[1]

[('autism', 1),
 ('is', 1),
 ('a', 1),
 ('developmental', 1),
 ('disorder', 1),
 ('characterized', 1),
 ('by', 1),
 ('difficulty', 1),
 ('with', 1),
 ('social', 1),
 ('interaction', 1),
 ('and', 1),
 ('communication', 1),
 ('and', 1),
 ('by', 1),
 ('restricted', 1),
 ('and', 1),
 ('repetitive', 1),
 ('behavior', 1),
 ('parents', 1),
 ('usually', 1),
 ('notice', 1),
 ('sign', 1),
 ('during', 1),
 ('the', 1),
 ('first', 1),
 ('three', 1),
 ('year', 1),
 ('of', 1),
 ('their', 1),
 ('child', 1),
 ('s', 1),
 ('life', 1),
 ('these', 1),
 ('sign', 1),
 ('often', 1),
 ('develop', 1),
 ('gradually', 1),
 ('though', 1),
 ('some', 1),
 ('child', 1),
 ('with', 1),
 ('autism', 1),
 ('reach', 1),
 ('their', 1),
 ('developmental', 1),
 ('milestone', 1),
 ('at', 1),
 ('a', 1),
 ('normal', 1),
 ('pace', 1),
 ('before', 1),
 ('worsening', 1),
 ('autism', 1),
 ('is', 1),
 ('associated', 1),
 ('with', 1),
 ('a', 1),
 ('combination', 1),
 ('of', 1),
 ('genetic', 1),
 ('and', 1),
 ('environmental', 1),
 ('f

In [None]:
# Viz what records in file look like in json format

In [None]:
import gzip

with gzip.GzipFile(path, 'r') as zipped_file:
    # read one dictionary at a time and store in data
    # declare empty counter object
    data = []
    for a_dict in zipped_file:
        data.append(json.loads(a_dict.decode('utf-8')))

In [None]:
data[575]