In [69]:
# imports
import wget
from zipfile import ZipFile
import os
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark import SparkFiles
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, MapType
#from pyspark.sql.functions import udf
from pyspark.sql.functions import *
import pandas as pd

In [83]:
#!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=0eb774c262e23e9ab35aeefb118a66a4407436a9052ee1c92e0f55025c57c470
  Stored in directory: /Users/kovila/Library/Caches/pip/wheels/bd/a8/c3/3cf2c14a1837a4e04bd98631724e81f33f462d86a1d895fae0
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

TODO:
- delete csv option
- delete zips option
- create history files if they do not exist
- add columns to data
- single file option for tests

In [2]:
# global variables
MASTER_FILELIST_FILEPATH = 'masterfilelist-translation.txt'
DOWNLOAD_CSV_PATH = './gdelt_data/'
START_URL = 'http://data.gdeltproject.org/gdeltv2/'
HISTORY_EXTRACTED_FILEPATH = 'history_extracted'
HISTORY_LOADED_FILEPATH = 'history_loaded'
EVENTS_COLUMN_HEADERS = './gdelt_columns/events_column_headers'
YEAR = '2022'
MONTH = '06'

## get urls from master filelist

In [3]:
# WARNING WHY ARE SOME URLS MISSING???

def get_zip_urls_from_master_filelist(year_list, month_list, day_list, zip_type, master_filelist_path='masterfilelist-translation.txt', start_url='http://data.gdeltproject.org/gdeltv2/'):
    
    # zip_type: events | mentions | gkg
    zip_type_token = ''
    if zip_type == 'events':
        zip_type_token = '.export.'
    elif zip_type == 'mentions':
        zip_type_token = '.mentions.'
    elif zip_type == 'gkg':
        zip_type_token = '.gkg.'
    else:
        raise Exception('zip_type should be one of: events | mentions | gkg')
        
    # get masterfile list path
    with open(master_filelist_path) as f:
        raw_file_list = f.readlines()
    
    raw_file_list = [line.split() for line in raw_file_list]
    
    # extract zip urls from masterfile list path
    zip_urls = []
    for i in range(len(raw_file_list)):
        try:
            zip_urls.append(raw_file_list[i][2])
        except Exception:  
            pass
        
        
    # filter specified year, month and day
    filtered_zip_urls = []
    
    for year in year_list:
        for month in month_list:
            if day_list is None:
                filtered_zip_urls = filtered_zip_urls + [file for file in zip_urls if (start_url + year + month in file) and (zip_type_token in file)]
            else:
                for day in day_list:
                    filtered_zip_urls = filtered_zip_urls + [file for file in zip_urls if (start_url + year + month + day in file) and (zip_type_token in file)]

                
    return filtered_zip_urls


## download and extract urls

In [4]:
def download_and_extract(zip_urls, download_zip_path, start_url='http://data.gdeltproject.org/gdeltv2/'):
    
    extracted_filenames = []
    
    # create download path of it does not exist
    if not os.path.exists(download_zip_path):
        os.makedirs(download_zip_path)

    for zip_url in zip_urls:
        
        downloaded_filename = zip_url.replace(start_url,'')
        
        # do not downloaded if already exists
        already_downloaded = os.path.exists(download_zip_path+downloaded_filename)
    
        # download zip file
        if not already_downloaded:
            wget.download(zip_url, out=download_zip_path+downloaded_filename)
        extracted_filename = downloaded_filename.replace('.zip','')
    
        # do not unzip if already exists
        already_extracted = os.path.exists(download_zip_path+extracted_filename)
    
        # unzip file
        if not already_extracted:
            with ZipFile(download_zip_path+downloaded_filename, 'r') as zip_ref:
                zip_ref.extractall(download_zip_path)    
        
        # delete zip file
        os.remove(download_zip_path+downloaded_filename)
        
        extracted_filenames = extracted_filenames + [extracted_filename]
    
    return extracted_filenames

## data schema

In [44]:
# EVENTS SCHEMA
# StructField(field_name, field_type, nullable)
events_schema = StructType([
    
    StructField("GLOBALEVENTID", StringType(), True), #
    StructField("SQLDATE", StringType(), True), #
    StructField("MonthYear", StringType(), True),
    StructField("Year", StringType(), True),
    
    StructField("FractionDate", FloatType(), True),
    StructField("Actor1Code", StringType(), True),
    StructField("Actor1Name", StringType(), True),
    StructField("Actor1CountryCode", StringType(), True),
    StructField("Actor1KnownGroupCode", StringType(), True),
    
    StructField("Actor1EthnicCode", StringType(), True),
    StructField("Actor1Religion1Code", StringType(), True),
    StructField("Actor1Religion2Code", StringType(), True),
    StructField("Actor1Type1Code", StringType(), True),
    StructField("Actor1Type2Code", StringType(), True),
    
    StructField("Actor1Type3Code", StringType(), True),
    StructField("Actor2Code", StringType(), True),
    StructField("Actor2Name", StringType(), True),
    StructField("Actor2CountryCode", StringType(), True),
    StructField("Actor2KnownGroupCode", StringType(), True),
    
    StructField("Actor2EthnicCode", StringType(), True),
    StructField("Actor2Religion1Code", StringType(), True),
    StructField("Actor2Religion2Code", StringType(), True),
    StructField("Actor2Type1Code", StringType(), True),
    StructField("Actor2Type2Code", StringType(), True),
    
    StructField("Actor2Type3Code", StringType(), True),
    StructField("IsRootEvent", IntegerType(), True),
    StructField("EventCode", StringType(), True),
    StructField("EventBaseCode", StringType(), True),
    StructField("EventRootCode", StringType(), True),
    
    StructField("QuadClass", IntegerType(), True),
    StructField("GoldsteinScale", FloatType(), True),
    StructField("NumMentions", IntegerType(), True),
    StructField("NumSources", IntegerType(), True),
    StructField("NumArticles", IntegerType(), True),
    
    StructField("AvgTone", FloatType(), True),
    StructField("Actor1Geo_Type", IntegerType(), True),
    StructField("Actor1Geo_FullName", StringType(), True),
    StructField("Actor1Geo_CountryCode", StringType(), True), #
    StructField("Actor1Geo_ADM1Code", StringType(), True),
    
    StructField("Actor1Geo_ADM2Code", StringType(), True),
    StructField("Actor1Geo_Lat", FloatType(), True),
    StructField("Actor1Geo_Long", FloatType(), True),
    StructField("Actor1Geo_FeatureID", StringType(), True),
    StructField("Actor2Geo_Type", IntegerType(), True),
    
    StructField("Actor2Geo_FullName", StringType(), True),
    StructField("Actor2Geo_CountryCode", StringType(), True), #
    StructField("Actor2Geo_ADM1Code", StringType(), True),
    StructField("Actor2Geo_ADM2Code", StringType(), True),
    StructField("Actor2Geo_Lat", FloatType(), True),
    
    StructField("Actor2Geo_Long", FloatType(), True),
    StructField("Actor2Geo_FeatureID", StringType(), True),
    StructField("ActionGeo_Type", IntegerType(), True),
    StructField("ActionGeo_FullName", StringType(), True),
    StructField("ActionGeo_CountryCode", StringType(), True), #
    
    StructField("ActionGeo_ADM1Code", StringType(), True),
    StructField("ActionGeo_ADM2Code", StringType(), True),
    StructField("ActionGeo_Lat", FloatType(), True),
    StructField("ActionGeo_Long", FloatType(), True),
    StructField("ActionGeo_FeatureID", StringType(), True),
    
    StructField("DATEADDED", IntegerType(), True),
    StructField("SOURCEURL", StringType(), True)
])

In [45]:
# MENTIONS SCHEMA
# StructField(field_name, field_type, nullable)
mentions_schema = StructType([
    
    StructField("GLOBALEVENTID", StringType(), True), #
    StructField("EventTimeDate", StringType(), True), #
    StructField("MentionTimeDate", StringType(), True), #
    StructField("MentionType", IntegerType(), True),
    StructField("MentionSourceName", StringType(), True),
    
    StructField("MentionIdentifier", StringType(), True), #
    StructField("SentenceID", IntegerType(), True),
    StructField("Actor1CharOffset", IntegerType(), True),
    StructField("Actor2CharOffset", IntegerType(), True),
    StructField("ActionCharOffset", IntegerType(), True),
    
    StructField("InRawText", IntegerType(), True),
    StructField("Confidence", IntegerType(), True),
    StructField("MentionDocLen", IntegerType(), True),
    StructField("MentionDocTone", FloatType(), True),
    StructField("MentionDocTranslationInfo", StringType(), True), #
    
    StructField("Extras", StringType(), True)    
])

In [46]:
# GKG SCHEMA
# StructField(field_name, field_type, nullable)
gkg_schema = StructType([
    
    StructField("GKGRECORDID", StringType(), True), #
    StructField("DATE", StringType(), True), #
    StructField("SourceCollectionIdentifier", IntegerType(), True), 
    StructField("SourceCommonName", StringType(), True), #
    StructField("DocumentIdentifier", StringType(), True), #
    
    StructField("Counts", StringType(), True), 
    StructField("V2Counts", StringType(), True),
    StructField("Themes", StringType(), True), #
    StructField("V2Themes", StringType(), True), #
    StructField("Locations", StringType(), True),
    
    StructField("V2Locations", StringType(), True),
    StructField("Persons", StringType(), True), #
    StructField("V2Persons", StringType(), True), #
    StructField("Organizations", StringType(), True),
    StructField("V2Organizations", StringType(), True),
    
    StructField("V2Tone", StringType(), True), # first array element only
    StructField("Dates", StringType(), True),
    StructField("GCAM", StringType(), True),
    StructField("SharingImage", StringType(), True),
    StructField("RelatedImages", StringType(), True),    
    
    StructField("SocialImageEmbeds", StringType(), True),
    StructField("SocialVideoEmbeds", StringType(), True),
    StructField("Quotations", StringType(), True),
    StructField("AllNames", StringType(), True),
    StructField("Amounts", StringType(), True), 
    
    StructField("TranslationInfo", StringType(), True),
    StructField("Extras", StringType(), True)
    
])

In [47]:
SCHEMA_DICTIONARY = {
    'events':events_schema,
    'mentions':mentions_schema,
    'gkg':gkg_schema
}

## spark session

In [9]:
# create spark session
SPARK = SparkSession.builder.master('local') \
    .appName('SparkSession') \
    .config("spark.mongodb.read.connection.uri", "mongodb://tp-hadoop-50/") \
    .config("spark.mongodb.write.connection.uri", "mongodb://tp-hadoop-50/") \
    .getOrCreate()

## spark_read_csv

In [10]:
def spark_read_csv(spark_session, csv_filepath, csv_file_list, schema_dictionary, csv_type):

    df_read = None
    
    for file in csv_file_list:
        
        # file_type: events | mentions | gkg
        csv_type_token = ''
        schema = None
        
        if csv_type == 'events':
            csv_type_token = '.export.'
            schema = schema_dictionary['events']
        elif csv_type == 'mentions':
            csv_type_token = '.mentions.'
            schema = schema_dictionary['mentions']
        elif csv_type == 'gkg':
            csv_type_token = '.gkg.'
            schema = schema_dictionary['gkg']
        else:
            raise Exception('csv_type should be one of: events | mentions | gkg')
        
        # read csv
        df = spark_session.read.options(delimiter='\t').csv(csv_filepath+file, schema=schema)
        
        if df_read is None:
            df_read = df.select('*')
        else:
            df_read = df_read.unionByName(df)
        
    return df_read

## transform events data

In [11]:
def select_columns_df(df, df_type):
    
    events_columns = ['GLOBALEVENTID', 'SQLDATE', 'Actor1Geo_CountryCode', 'Actor2Geo_CountryCode', 'Actor1Geo_CountryCode', ]
    mentions_columns = ['GLOBALEVENTID', 'EventTimeDate', 'MentionTimeDate', 'MentionIdentifier',  'MentionDocTranslationInfo']
    gkg_columns = ['GKGRECORDID', 'DATE', 'SourceCommonName', 'DocumentIdentifier', 'Themes', 'V2Themes', 'Persons', 'V2Persons', 'V2Tone']

    selection_columns = []
    if df_type == 'events':
        selection_columns = events_columns
    elif df_type == 'mentions':
        selection_columns = mentions_columns
    elif df_type == 'gkg':
        selection_columns = gkg_columns
    else:
        raise Exception('df_type must be: events| mentions |gkg')
        
    transformed_df = df.select(selection_columns)
    
    return transformed_df

In [11]:
def transform_events_data(events_df):
    
    events_columns = ['GLOBALEVENTID', 'SQLDATE', 'ActionGeo_CountryCode']
    transformed_df = events_df.select(events_columns)
    
    return transformed_df

## write spark dataframe to mongodb collection

In [12]:
def load_mongodb(spark_dataframe, mongodb_database, mongodb_collection):
    spark_dataframe.write.format('mongodb').option("database",mongodb_database).option("collection", mongodb_collection).mode("append").save()
    

## ETL EVENTS DATA

In [101]:
def load_data(year_list, month_list, day_list, schema_dictionary, spark_session, mongodb_database, mongodb_collection, download_csv_path, start_url):
    
    # GET MENTIONS
    # download and extract mentions csv files
    mentions_zip_urls = get_zip_urls_from_master_filelist(year_list=year_list, month_list=month_list, day_list=day_list, zip_type='mentions')
    mentions_extracted_csvs = download_and_extract(mentions_zip_urls, download_zip_path=download_csv_path, start_url=start_url)
    # read mentions csv to spark dataframe
    mentions_df = spark_read_csv(spark_session=spark_session, csv_filepath=download_csv_path, csv_file_list=[mentions_extracted_csvs], schema_dictionary=schema_dictionary, csv_type='mentions')
    mentions_df = select_columns_df(mentions_df, df_type='mentions')
    
    # GET EVENTS
    # get event date filter
    event_dates = mentions_df.select('EventTimeDate').distinct()
    event_dates = event_dates.withColumn('year', substring('EventTimeDate', 1,4))
    event_dates = event_dates.withColumn('month', substring('EventTimeDate', 5,2))
    event_dates = event_dates.withColumn('day', substring('EventTimeDate', 7,2))
        
    # warning: section of code not parallelizable: pandas dataframe!!!
    # not a transformation on event_dates dataframe
    # download event files
    event_dates_pandas = event_dates.toPandas()
    event_zip_urls = []
    for index, row in event_dates_pandas.iterrows():    
        event_zip_urls = event_zip_urls + get_zip_urls_from_master_filelist(year_list=[row['year']], month_list=[row['month']], day_list=[row['day']], zip_type='events')
        
    events_extracted_csvs = download_and_extract(event_zip_urls, download_zip_path=download_csv_path, start_url=start_url)
        
    events_df = spark_read_csv(spark_session=spark_session, csv_filepath=download_csv_path, csv_file_list=events_extracted_csvs, schema_dictionary=schema_dictionary, csv_type='events')
    events_df = select_columns_df(events_df, df_type='events')

    # JOIN MENTIONS AND EVENTS
    mentions_events_df = mentions_df.join(events_df, mentions_df.GLOBALEVENTID == events_df.GLOBALEVENTID, 'left')


    # GET GKG
    # get event date filter
    gkg_dates = mentions_df.select('MentionTimeDate').distinct()
    gkg_dates = gkg_dates.withColumn('year', substring('MentionTimeDate', 1,4))
    gkg_dates = gkg_dates.withColumn('month', substring('MentionTimeDate', 5,2))
    gkg_dates = gkg_dates.withColumn('day', substring('MentionTimeDate', 7,2))
        
    # warning: section of code not parallelizable: pandas dataframe!!!
    # not a transformation on event_dates dataframe
    # download event files
    gkg_dates_pandas = gkg_dates.toPandas()
    gkg_zip_urls = []
    for index, row in gkg_dates_pandas.iterrows():    
        gkg_zip_urls = gkg_zip_urls + get_zip_urls_from_master_filelist(year_list=[row['year']], month_list=[row['month']], day_list=[row['day']], zip_type='gkg')
        
    gkg_extracted_csvs = download_and_extract(gkg_zip_urls, download_zip_path=download_csv_path, start_url=start_url)
        
    gkg_df = spark_read_csv(spark_session=spark_session, csv_filepath=download_csv_path, csv_file_list=gkg_extracted_csvs, schema_dictionary=schema_dictionary, csv_type='gkg')
    gkg_df = select_columns_df(gkg_df, df_type='gkg')
    
    # JOIN MENTIONS AND GKG
    mentions_events_gkg_df = mentions_df.join(events_df, mentions_df.GLOBALEVENTID == events_df.GLOBALEVENTID, 'left')


    
    return mentions_df, events_df, gkg_df, mentions_events_gkg_df


In [None]:
mentions_df = load_data(year_list=['2022'], 
                        month_list=['01'], 
                        day_list=['01'], 
                        schema_dictionary=SCHEMA_DICTIONARY, 
                        spark_session = SPARK, 
                        mongodb_database = None, 
                        mongodb_collection = None, 
                        download_csv_path = './gdelt_data/', 
                        start_url = 'http://data.gdeltproject.org/gdeltv2/')



In [80]:
mentions_df['EventTimeDate'][0]

Column<'EventTimeDate[0]'>

In [84]:
test = ''
mentions_df.foreach(lambda row: 
    test = row['EventTimeDate'])

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (<ipython-input-84-294a6c78d829>, line 2)

In [82]:
mentions_df.show()

+-------------+--------------+---------------+--------------------+-------------------------+
|GLOBALEVENTID| EventTimeDate|MentionTimeDate|   MentionIdentifier|MentionDocTranslationInfo|
+-------------+--------------+---------------+--------------------+-------------------------+
|   1021421880|20220101000000| 20220101000000|http://www.pasien...|     srclc:lit;eng:GT-...|
|   1021421881|20220101000000| 20220101000000|https://aminata.c...|     srclc:fra;eng:Mos...|
|   1021421152|20220101001500| 20220101000000|https://www.caden...|     srclc:spa;eng:Mos...|
|   1021421882|20220101000000| 20220101000000|https://www.caden...|     srclc:spa;eng:Mos...|
|   1021421883|20220101000000| 20220101000000|https://www.em.co...|     srclc:por;eng:GT-...|
|   1021421884|20220101000000| 20220101000000|https://www.em.co...|     srclc:por;eng:GT-...|
|   1021421885|20220101000000| 20220101000000|http://news.abidj...|     srclc:fra;eng:Mos...|
|   1021421885|20220101000000| 20220101000000|https://www.fr

In [96]:
test = mentions_df.select('EventTimeDate').distinct()
test = test.withColumn('year', substring('EventTimeDate', 1,4))
test = test.withColumn('month', substring('EventTimeDate', 5,2))
test = test.withColumn('day', substring('EventTimeDate', 7,2))

In [98]:
test_pandas = test.toPandas()

In [100]:
for index, row in test_pandas.iterrows():
    print(row['year'], row['month'])

2022 01
2022 01
2021 12
2021 01
2021 12
2022 01
2021 12
2022 01
2022 01
2021 12
2021 01
2021 12
2022 01
2022 01
2021 01
2021 01
2022 01
2022 01
2022 01
2022 01
2021 01
2021 12
2022 01
2022 01
2022 01
2022 01
2021 12
2022 01
2022 01
2022 01
2022 01
2021 12
2021 12
2021 01
2022 01
2022 01
2021 12
2021 12
2022 01
2021 01
2022 01
2021 01
2022 01
2022 01
2021 01
2021 12
2022 01
2021 12
2021 01
2022 01
2022 01
2022 01
2022 01
2021 01
2022 01
2022 01
2021 12
2021 12
2021 12
2022 01
2022 01
2022 01
2022 01
2021 12
2022 01
2021 12
2021 01
2021 12
2022 01
2022 01
2021 12
2021 12
2022 01
2021 01
2021 01
2022 01
2022 01
2022 01
2021 12
2022 01
2021 12
2022 01
2022 01
2022 01
2021 12
2021 12
2022 01
2022 01
2021 12
2021 01
2021 01
2021 01
2022 01
2022 01
2021 12
2021 01
2022 01
2021 12
2022 01
2022 01
2021 12
2022 01
2021 12
2021 12
2022 01
2022 01
2021 01
2022 01
2022 01
2022 01
2022 01
2022 01
2022 01
2022 01
2022 01
2021 12
2021 12
2021 12
2021 01
2021 01
2021 01
2022 01
2021 12
2021 01
2022 01


In [92]:
test.show()

+-------------+--------------+---------------+--------------------+-------------------------+----+-----+
|GLOBALEVENTID| EventTimeDate|MentionTimeDate|   MentionIdentifier|MentionDocTranslationInfo|year|month|
+-------------+--------------+---------------+--------------------+-------------------------+----+-----+
|   1021421880|20220101000000| 20220101000000|http://www.pasien...|     srclc:lit;eng:GT-...|2022|   01|
|   1021421881|20220101000000| 20220101000000|https://aminata.c...|     srclc:fra;eng:Mos...|2022|   01|
|   1021421152|20220101001500| 20220101000000|https://www.caden...|     srclc:spa;eng:Mos...|2022|   01|
|   1021421882|20220101000000| 20220101000000|https://www.caden...|     srclc:spa;eng:Mos...|2022|   01|
|   1021421883|20220101000000| 20220101000000|https://www.em.co...|     srclc:por;eng:GT-...|2022|   01|
|   1021421884|20220101000000| 20220101000000|https://www.em.co...|     srclc:por;eng:GT-...|2022|   01|
|   1021421885|20220101000000| 20220101000000|http://ne

In [None]:
test.withColumn("year", col("EventTimeDate").str.)

In [76]:
event_dates.withColumn("EventTimeDate",to_timestamp(col("EventTimeDate"))).withColumn("day", date_format(col("EventTimeDate"), "D")).show()


+-------------+----+
|EventTimeDate| day|
+-------------+----+
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
|         null|null|
+-------------+----+
only showing top 20 rows



In [63]:
event_dates = mentions_df.select('EventTimeDate').distinct()

In [66]:
df.select("Seqno","Name", upper(df.Name)) \
  .show()

Column<'EventTimeDate'>

In [71]:
date_format(mentions_df.EventTimeDate, 'YYYY').show()

TypeError: 'Column' object is not callable

In [65]:
event_dates.show()

+--------------+
| EventTimeDate|
+--------------+
|20220101001500|
|20220101000000|
|20211231050000|
|20210101031500|
|20211202230000|
|20220101003000|
|20211202124500|
|20220101010000|
|20220101004500|
|20211202113000|
|20210101094500|
|20211231230000|
|20220101011500|
|20220101013000|
|20210101040000|
|20210101001500|
|20220101014500|
|20220101020000|
|20220101021500|
|20220101023000|
+--------------+
only showing top 20 rows



In [61]:
mentions_df.select(mentions_df("EventTimeDate")).distinct

TypeError: 'DataFrame' object is not callable

In [49]:
mentions_df_pandas = mentions_df.toPandas()

In [43]:
mentions_df_pandas['EventTimeDate'].isnull().all()

True

In [20]:
mentions_event_time_date = mentions_df.toPandas()['EventTimeDate']
mentions_event_time_date

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
49023   NaN
49024   NaN
49025   NaN
49026   NaN
49027   NaN
Name: EventTimeDate, Length: 49028, dtype: float64

In [12]:
def etl_events(year_list, month_list, day_list, schema_dictionary, spark_session, mongodb_database, mongodb_collection, download_csv_path, start_url):
    
    # get events urls from master filelist
    zip_urls = get_zip_urls_from_master_filelist(year_list=year_list, month_list=month_list, day_list=day_list, zip_type='events')

    # download and extract csv
    extracted_csvs = download_and_extract(zip_urls, download_zip_path=download_csv_path, start_url=start_url)

    # read csv to spark dataframe
    events_df = spark_read_csv(spark_session=spark_session, csv_filepath=download_csv_path, csv_file_list=extracted_csvs, schema_dictionary=schema_dictionary, csv_type='events')

    # transform events data
    events_df = transform_events_data(events_df)
    
    # load events data to mongodb
    load_mongodb(events_df, mongodb_database=mongodb_database, mongodb_collection=mongodb_collection)
    
    # delete zip file
    os.remove(download_zip_path+extracted_csvs)
    

In [29]:
zip_urls = get_zip_urls_from_master_filelist(year_list=['2022'], month_list=['01'], day_list=['01','02','03'], zip_type='events')
extracted_csvs = download_and_extract(zip_urls, download_zip_path='./gdelt_data/', start_url='http://data.gdeltproject.org/gdeltv2/')
events_df = spark_read_csv(spark_session=SPARK, csv_filepath='./gdelt_data/', csv_file_list=extracted_csvs, schema_dictionary=SCHEMA_DICTIONARY, csv_type='events')

#events_df = transform_events_data(events_df)

In [28]:
zip_urls = get_zip_urls_from_master_filelist(year_list=['2022'], month_list=['01'], day_list=['01','02','03'], zip_type='gkg')
extracted_csvs = download_and_extract(zip_urls, download_zip_path='./gdelt_data/', start_url='http://data.gdeltproject.org/gdeltv2/')
gkg_df = spark_read_csv(spark_session=SPARK, csv_filepath='./gdelt_data/', csv_file_list=extracted_csvs, schema_dictionary=SCHEMA_DICTIONARY, csv_type='gkg')

#events_df = transform_events_data(events_df)

In [30]:
gkg_df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string, _c23: string, _c24: string, _c25: string, _c26: string]

In [22]:
df1 = events_df.toPandas()['events_GLOBALEVENTID']

AttributeError: 'Series' object has no attribute 'values_count'

In [23]:
df1 = events_df.toPandas()

In [27]:
df1['events_GLOBALEVENTID'].values_count()

AttributeError: 'Series' object has no attribute 'values_count'

In [13]:
etl_events(year_list=['2022'], month_list=['01'], day_list=['01','02','03'], schema_dictionary=SCHEMA_DICTIONARY, spark_session=SPARK, mongodb_database='test', mongodb_collection='events', download_csv_path='./gdelt_data/', start_url='http://data.gdeltproject.org/gdeltv2/')



Py4JJavaError: An error occurred while calling o3787.save.
: java.lang.ClassNotFoundException: 
Failed to find data source: mongodb. Please find packages at
https://spark.apache.org/third-party-projects.html
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedToFindDataSourceError(QueryExecutionErrors.scala:587)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:675)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:725)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:864)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:256)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: mongodb.DefaultSource
	at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:661)
	at org.apache.spark.sql.execution.datasources.DataSource$$$Lambda$1549/1171936102.apply(Unknown Source)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:661)
	at org.apache.spark.sql.execution.datasources.DataSource$$$Lambda$1548/284708524.apply(Unknown Source)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:661)
	... 16 more


In [30]:
def transform_mentions_data(spark_session, mentions_df, mongodb_events_collection, mongodb_database):
    
    # filter mentions columns
    mentions_columns = ['mentions_GLOBALEVENTID', 'MentionIdentifier', 'MentionDocTranslationInfo']
    mentions_columns = ['mentions_GLOBALEVENTID', 'MentionIdentifier']
    
    mentions_df = mentions_df.select(mentions_columns)
    
    # find event GLOBALEVENTID from mongodb events collection
    events_df = spark_session.read.format("mongodb").option("database",mongodb_database).option("collection", mongodb_events_collection).load()
    
    # join events and mentions
    mentions_events_df = mentions_df.join(events_df, events_df.events_GLOBALEVENTID == mentions_df.mentions_GLOBALEVENTID, 'left')
    
    # create nested mentions df
    build_nested_event_udf = udf(lambda SQLDATE, ActionGeo_CountryCode: {
        'SQLDATE': SQLDATE,
        'ActionGeo_CountryCode': ActionGeo_CountryCode
    }, MapType(StringType(), StringType()))

    mentions_events_df = (
        mentions_events_df
        .withColumn('event_fields', build_nested_event_udf(mentions_events_df['SQLDATE'], mentions_events_df['ActionGeo_CountryCode']))
        .drop('SQLDATE')
        .drop('ActionGeo_CountryCode')
        .drop('events_GLOBALEVENTID')
    )
    
    return mentions_events_df

In [None]:
def etl_mentions(year_list, month_list, day_list, schema_dictionary, spark_session, mongodb_database, mongodb_mentions_collection, mongodb_events_collection, download_csv_path, start_url):
    
    # get events urls from master filelist
    zip_urls = get_zip_urls_from_master_filelist(year_list=year_list, month_list=month_list, day_list=day_list, zip_type='mentions')

    # download and extract csv
    extracted_csvs = download_and_extract(zip_urls, download_zip_path=download_csv_path, start_url=start_url)

    # read csv to spark dataframe
    mentions_df = spark_read_csv(spark_session, csv_filepath=download_csv_path, csv_file_list=extracted_csvs, schema_dictionary=schema_dictionary, csv_type='mentions')

    # transform events data
    mentions_df = transform_mentions_data(spark_session=spark_session, mentions_df=mentions_df, mongodb_events_collection=mongodb_events_collection, mongodb_database=mongodb_database)
    mentions_df.show()
    # load events data to mongodb
    #load_mongodb(mentions_df, mongodb_database=mongodb_database, mongodb_collection=mongodb_mentions_collection)
    mentions_df.write.format('mongodb').option("database",mongodb_database).option("collection", mongodb_collection).mode("append").save()



In [None]:
#etl_mentions(year_list=['2022'], month_list=['01'], day_list=['01','02','03'], schema_dictionary=SCHEMA_DICTIONARY, spark_session=SPARK, mongodb_database='test', mongodb_mentions_collection='mentions', mongodb_events_collection='events', download_csv_path='./gdelt_data/', start_url='http://data.gdeltproject.org/gdeltv2/')



In [None]:
#zip_urls = get_zip_urls_from_master_filelist(year_list=['2022'], month_list=['01'], day_list=['01','02','03'], zip_type='mentions')

