In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import json

from pyspark import SparkContext
from pyspark import sql
from pyspark.sql import SQLContext

from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
# Create spark session
spark = SparkSession\
    .builder\
    .master("local")\
    .appName("word_count_jsonl")\
    .config("spark.some.config.option", "some-value")\
    .getOrCreate()

In [9]:
# declare data file path for read in next line
path = 'c:/users/prayt/mini_pyspark_projects/article_samples.json.gz'
# read jsonl file into spark dataframe
json_df = spark.read.json(path)
# show spark dataframe schema
json_df.printSchema()
# observe each section_texts cell is a list of text sections

root
 |-- section_texts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- section_titles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)



In [40]:
# collapse list of strings for each article to one long text string for each article 
section_texts_rdd =  json_df.select('section_texts').rdd.map(lambda x: ' '.join(x[0])) 

section_texts_rdd.collect()[20]

'\n\n\n\n\n\'\'\'Anthropology\'\'\' is the study of humans and human behavior and societies in the past and present. Social anthropology and cultural anthropology study the norms and values of societies. Linguistic anthropology studies how language affects social life. Biological or physical anthropology studies the biological development of humans.\n\nArchaeology, which studies past human cultures through investigation of physical evidence, is thought of as a branch of anthropology in the United States and Canada, while in Europe, it is viewed as a discipline in its own right or grouped under other related disciplines, such as history.\n Bernardino de Sahagún is considered to be the founder of modern anthropology.\n\nThe abstract noun \'\'anthropology\'\' is first attested in reference to history. Its present use first appeared in Renaissance Germany in the works of Magnus Hundt and Otto Casmann. Their New Latin \'\'\'\' derived from the combining forms of the Greek words \'\'ánthrōpo

In [41]:
# declare tokenizer
tokenizer = RegexpTokenizer(r'\w+')
# tokenize each article to create series of individual words for each article
tokenized_sections = section_texts_rdd.map(lambda x: tokenizer.tokenize(x))

print(len(tokenized_sections.collect()))
tokenized_sections.collect()[0]


1000


['Anarchism',
 'is',
 'an',
 'anti',
 'authoritarian',
 'political',
 'philosophy',
 'that',
 'advocates',
 'self',
 'governed',
 'societies',
 'based',
 'on',
 'voluntary',
 'cooperative',
 'institutions',
 'and',
 'the',
 'rejection',
 'of',
 'coercive',
 'hierarchies',
 'those',
 'societies',
 'view',
 'as',
 'unjust',
 'These',
 'institutions',
 'are',
 'often',
 'described',
 'as',
 'stateless',
 'societies',
 'although',
 'several',
 'authors',
 'have',
 'defined',
 'them',
 'more',
 'specifically',
 'as',
 'distinct',
 'institutions',
 'based',
 'on',
 'non',
 'hierarchical',
 'or',
 'free',
 'associations',
 'Anarchism',
 'holds',
 'the',
 'state',
 'to',
 'be',
 'undesirable',
 'unnecessary',
 'and',
 'harmful',
 'Any',
 'philosophy',
 'consistent',
 'with',
 'statelessness',
 'that',
 'is',
 'principled',
 'opposition',
 'to',
 'the',
 'State',
 'is',
 'anarchist',
 'thus',
 'anarchist',
 'schools',
 'of',
 'thought',
 'range',
 'from',
 'anarcho',
 'communism',
 'to',
 'anar

In [46]:
# reduce words to 'root' word and make each word lower case for word counting later
lemmatizer = WordNetLemmatizer()
lemmed_sections = tokenized_sections.map(lambda x: [lemmatizer.lemmatize(word).lower() for word in x])

In [47]:
lemmed_sections.collect()[1]

['autism',
 'is',
 'a',
 'developmental',
 'disorder',
 'characterized',
 'by',
 'difficulty',
 'with',
 'social',
 'interaction',
 'and',
 'communication',
 'and',
 'by',
 'restricted',
 'and',
 'repetitive',
 'behavior',
 'parents',
 'usually',
 'notice',
 'sign',
 'during',
 'the',
 'first',
 'three',
 'year',
 'of',
 'their',
 'child',
 's',
 'life',
 'these',
 'sign',
 'often',
 'develop',
 'gradually',
 'though',
 'some',
 'child',
 'with',
 'autism',
 'reach',
 'their',
 'developmental',
 'milestone',
 'at',
 'a',
 'normal',
 'pace',
 'before',
 'worsening',
 'autism',
 'is',
 'associated',
 'with',
 'a',
 'combination',
 'of',
 'genetic',
 'and',
 'environmental',
 'factor',
 'risk',
 'factor',
 'during',
 'pregnancy',
 'include',
 'certain',
 'infection',
 'such',
 'a',
 'rubella',
 'and',
 'toxin',
 'including',
 'valproic',
 'acid',
 'alcohol',
 'cocaine',
 'pesticide',
 'and',
 'air',
 'pollution',
 'controversies',
 'surround',
 'other',
 'proposed',
 'environmental',
 'ca

In [13]:
# Viz what records in file look like

In [14]:
import gzip

with gzip.GzipFile(path, 'r') as zipped_file:
    # read one dictionary at a time and store in data
    # declare empty counter object
    data = []
    for a_dict in zipped_file:
        data.append(json.loads(a_dict.decode('utf-8')))

In [26]:
data[575]

{'title': 'Amathus',
 'section_titles': ['Introduction',
  'History',
  'The Site and Archaeology',
  'Gallery',
  'Notes',
  'References',
  ' External links '],
 'section_texts': ["\n\n\n\n\n'''Amathus''' or '''Amathous''' () was an ancient city and one of the ancient royal cities of Cyprus until about 300 BC. Some of its impressive remains can be seen today on the southern coast in front of Agios Tychonas, about 24 miles west of Larnaca and 6 miles east of Limassol. Its ancient cult sanctuary of Aphrodite was the second most important in Cyprus, her homeland, after Paphos.\n\nArchaeological work has recently been continued at the site and many finds are exhibited in the Limassol Museum.\n",
  '===Pre-history and ancient era===\nAncient kingdoms of Cyprus\nThe pre-history of Amathus mixes myth and archaeology. Archaeology has detected human activity from the earliest Iron Age,  BC. \nThe city\'s legendary founder was Cinyras, linked with the birth of Adonis, who called the city after