# Imports and setup

In [1]:
# From example code in example assignment
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark
!pip install newspaper3k
!pip install gdelt
!pip install git+https://github.com/linwoodc3/gdeltPyR

import pyspark, os
from pyspark import SparkConf, SparkContext
os.environ["PYSPARK_PYTHON"]="python3"
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64/"

import numpy as np

[K     |████████████████████████████████| 217.8MB 59kB/s 
[K     |████████████████████████████████| 204kB 49.5MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Collecting newspaper3k
[?25l  Downloading https://files.pythonhosted.org/packages/d7/b9/51afecb35bb61b188a4b44868001de348a0e8134b4dfa00ffc191567c4b9/newspaper3k-0.2.8-py3-none-any.whl (211kB)
[K     |████████████████████████████████| 215kB 2.8MB/s 
[?25hCollecting tinysegmenter==0.3
  Downloading https://files.pythonhosted.org/packages/17/82/86982e4b6d16e4febc79c2a1d68ee3b707e8a020c5d2bc4af8052d0f136a/tinysegmenter-0.3.tar.gz
Collecting cssselect>=0.9.2
  Downloading https://files.pythonhosted.org/packages/3b/d4/3b5c17f00cce85b9a1e6f91096e1cc8e8ede2e1be8e96b87ce1ed09e92c5/cssselect-1.1.0-py2.py3-none-any.whl
Collecting tldextract>=2.0.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/0e/9ab599d6e78f0340bb1d1e28ddeacb38c8bb7f91a1b0eae9a24e9603782f/tldextract-2.2.2-py2.py3-none-any.whl (4

In [0]:
#start spark local server
import sys, os
from operator import add
import time

os.environ["PYSPARK_PYTHON"]="python3"

import pyspark
from pyspark import SparkConf, SparkContext

#connects our python driver to a local Spark JVM running on the Google Colab server virtual machine
try:
  conf = SparkConf().setMaster("local[*]").set("spark.executor.memory", "1g")
  sc = SparkContext(conf = conf)
except ValueError:
  #it's ok if the server is already started
  pass

def dbg(x):
  """ A helper function to print debugging information on RDDs """
  if isinstance(x, pyspark.RDD):
    print([(t[0], list(t[1]) if 
            isinstance(t[1], pyspark.resultiterable.ResultIterable) else t[1])
           if isinstance(t, tuple) else t
           for t in x.take(100)])
  else:
    print(x)

# Reset

In [0]:
#!rm -rf articles
#!rm *.csv

# Pull Data from Gdelt

In [4]:
from concurrent.futures import ProcessPoolExecutor
from datetime import date, timedelta
import pandas as pd
import gdelt
import os

# set up gdeltpyr for version 2
gd = gdelt.gdelt(version=2)

# multiprocess the query
e = ProcessPoolExecutor()


# generic functions to pull and write data to disk based on date
def get_filename(x):
  date = x.strftime('%Y%m%d')
  return "{}_gdeltdata.csv".format(date)

def intofile(filename):
    try:
        if not os.path.exists(filename):
          date = filename.split("_")[0]
          d = gd.Search(date, table='events',coverage=False) #not updata at 15mins
          d.to_csv(filename,encoding='utf-8',index=False)
    except:
        print("Error occurred")

# pull the data from gdelt into multi files; this may take a long time
dates_before = [get_filename(x) for x in pd.date_range('2020 Feb 1','2020 Feb 2')]
#dates_after = [get_filename(x) for x in pd.date_range('2020 Jan 15','2020 Jan 20')]

results = list(e.map(intofile,dates_before+dates_after))

NameError: ignored

# Changes data to RDD

In [0]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

data_before = sqlContext.read.option("header", "true").csv(dates_before)
data_after = sqlContext.read.option("header", "true").csv(dates_after)


In [0]:
# Event codes categorised into positive, neutral, and negative events.
neutral_events = ['011', '019', '020', '024', '0241', '0242', '025', '0253', 
                  '0341', '0342', '0343', '0344', '035', '040', '041', '042',
                  '043', '044', '045', '046', '080', '083', '0831', '0832', '0833',
                  '0834', '084', '0841', '0842', '090', '091', '092', '093',
                  '094', '100', '104', '1041', '1042', '1043', '1044', '105', 
                  '106', '107', '108', '110', '123', '1231', '1232', '1233', 
                  '1234', '124', '125', '126', '127', '128', '129', '140', 
                  '141', '1411', '1412', '1413', '1414', '150', '160', '166', '170']


negative_events = ['012', '016', '111', '112', '1121', '1122', '1123', '1124',
                   '1125', '113', '115', '116', '120', '121', '1211', '1212', 
                   '122', '1221', '1222', '1223', '1224', '1241', '1242',
                   '1243', '1244', '1245', '1246', '130', '131', '1311', 
                   '1312', '1313', '132', '1321', '1322', '1323', '1324', 
                   '133', '134', '135', '136', '137', '1381', '138114', 
                   '1382', '1383', '1384', '1385', '139', '142', '1421', 
                   '1422', '1423', '1424', '143', '1431', '1432', '1433', 
                   '1434', '144', '1441', '1442', '1443', '1444', '145', 
                   '1451', '1452', '1453', '1454', '161', '162', '1621', 
                   '1622', '1623', '163', '164', '165', '1661', '1662', 
                   '1663', '171', '1711', '1712', '172', '1721', '1722', 
                   '1723', '1724', '173', '174', '175', '176', '180', '181', 
                   '182', '1821', '1822', '1823', '183', '1831', '1832', 
                   '1833', '1834', '184', '185', '186', '190195', '191', 
                   '192', '193', '194', '1951', '1952', '196', '200', '201',
                   '202', '203', '204', '2041', '2042']


positive_events = ['013', '014', '015', '017', '018', '021', '0211', '0212',
                   '0213', '0214', '022', '022', '023', '0231', '0232', '0233',
                   '0234', '0243', '0244', '0251', '0252', '0254', '0255', 
                   '0256', '026', '027', '028', '030', '031', '0311', '0312',
                   '0313', '0314', '032', '032', '033', '0331', '0332', '0333', 
                   '0334', '034', '0351', '0352', '0353', '0354', '0355', '0356', 
                   '036', '037', '038', '039', '050', '051', '052', '053', 
                   '054', '055', '056', '057', '060', '061', '062', '063',
                   '064', '070', '071', '072', '073', '074', '075', '081', 
                   '0811', '0812', '0813', '0814', '082', '085', '086', '0861',
                   '0862', '0863', '087', '0871', '0872', '0873', '0874',
                   '101', '1011', '1012', '1013', '1014', '102', '103', '1031',
                   '1032', '1033', '1034', '1051', '1052', '1053', '1054', 
                   '1055', '1056', '150', '151', '152', '153', '154', '155']

def event_sign(event_code):
  return ("Positive" if event_code in positive_events else
          "Neutral" if event_code in neutral_events else
          "Negative" if event_code in negative_events else "Unknown")
  




# K-means


In [0]:
class kmeans:
  def __init__(self, clusters, iterations, state):
    self.clusters = clusters
    self.iterations = iterations
    self.state = state

  