## Import / Setup

If you are using this repository without any processed input data, you will need to create fresh data using Google's Maps API (v3).
This requires a Google API key, which you can create on the Google Cloud Platform and provide as `API_KEY` in the configuration cell below.

In [2]:
import os

# --- Google Maps geocoding settings ---
# Prefer the GOOGLE_API_KEY environment variable so the key never has to be
# written into (and committed with) the notebook. The original 'SETME'
# placeholder remains the fallback, so editing this line by hand still works.
API_KEY = os.environ.get('GOOGLE_API_KEY', 'SETME')
# Intended per-request timeout for the geocoder, in seconds.
API_TIMEOUT_S = 10
# User agent string sent with geocoding requests.
API_USER_AGENT = 'tcat-pridemonth.muk.uni-passau.de'
# Cache file for geocoding results; if it exists, the API is not queried.
API_CSV_OUTPUT = 'geolocated.csv'

# --- Input files ---
# Tweet export; read with an 'id' index column and a 'location' column.
CSV_INPUT = 'tcat_input.csv'
EDGES_INPUT = 'sgraph_edges.csv'
NODES_INPUT = 'sgraph_nodes.csv'

# --- Output files ---
# Geocoded rows with the derived 'country' column, sorted by country.
CSV_OUTPUT = 'country_coded.csv'
# Nodes joined with the country of the matching user.
NODES_W_COUNTRY_OUTPUT = 'tcat-nodes.csv'
EDGES_OUTPUT = 'tcat-edges.csv'

In [4]:
import os
import pandas as pd
import geopy as gp
import numpy as np
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
tqdm.pandas()

### Functions

We require three helper functions to fetch data from Google's Maps API.

In [5]:
class _RetryingCall:
    """
    Callable wrapper that re-invokes a function when it raises.

    This is a module-level class rather than a nested closure so that the
    wrapper itself can be pickled (provided the wrapped function can be),
    which multiprocessing.Pool requires of the callables it sends to workers.
    """

    def __init__(self, fn):
        self.fn = fn

    def __call__(self, *args, retries=5, **kwargs):
        """Call the wrapped function, retrying up to `retries` more times."""
        remaining = retries
        while True:
            try:
                return self.fn(*args, **kwargs)
            except Exception:
                # Deliberately best-effort: a failing call must not abort the
                # whole batch, so exhaustion is reported via a sentinel value.
                if remaining <= 0:
                    return "ERROR"
                remaining -= 1


def retry_fn(fn):
    """
    Wrap a function so that it is retried when it raises an exception.

    The wrapped function is called once and, on failure, up to `retries`
    additional times (5 by default).

    Args:
        fn: The function to wrap.

    Returns:
        A callable accepting the same arguments as fn plus an optional
        keyword argument `retries` (the number of retries left, default 5),
        which is consumed by the wrapper and not passed on to fn.
        It returns whatever fn returns, or the string "ERROR" if every
        attempt raised.
    """
    return _RetryingCall(fn)

In [6]:
def process_row(fn, src_col, tgt_col, row):
    """
    Apply fn to every value of one column and store the results in another.

    Despite its name, `row` must be a DataFrame (for example one of the
    partitions handed out by parallel_apply), because the source column is
    processed with Series.apply. Bind the first three arguments with
    functools.partial to obtain a one-argument function.

    Example:
        parallel_apply(df, functools.partial(process_row, fn, 'a', 'b'))

    Args:
        fn: The function to apply to each value of the source column.
        src_col: The source column name.
        tgt_col: The target column name (created or overwritten).
        row: The input DataFrame (partition).

    Returns:
        A copy of the input with tgt_col filled in; the input is not modified.
    """
    # Work on a copy so a caller's frame (often a slice of a larger frame)
    # is never written to as a side effect.
    result = row.copy()
    result[tgt_col] = result[src_col].apply(fn)
    return result

In [7]:
def parallel_apply(df, func, cores=8, partitions=24):
    """
    Apply a function to partitions of a data frame, in parallel.

    The frame is cut row-wise into `partitions` consecutive chunks of nearly
    equal size; func is called once per chunk in a worker process and the
    results are concatenated in the original order.

    Args:
        df: The dataframe we operate on.
        func: The function to apply; it receives one DataFrame partition and
            must return something pandas.concat accepts. It has to be
            picklable, because it is sent to the worker processes.
        cores: The number of processes we run in parallel.
        partitions: The number of data partitions we create.

    Returns:
        The concatenation of the per-partition results.
    """
    # Split by row position and slice with iloc, instead of handing the
    # DataFrame itself to np.array_split, so every partition is a DataFrame.
    row_chunks = np.array_split(np.arange(len(df)), partitions)
    df_split = [df.iloc[chunk] for chunk in row_chunks]

    # The context manager shuts the pool down even when func raises in a
    # worker, so no processes are left behind.
    with Pool(cores) as pool:
        results = pool.map(func, df_split)

    return pd.concat(results)

## Data

We do not query the Google API if the processed input data is already available (at `API_CSV_OUTPUT`).
If you wish to regenerate all data from scratch, delete the file at `API_CSV_OUTPUT` and provide an
`API_KEY`.

In [78]:
if not os.path.exists(API_CSV_OUTPUT):
    # No cached results yet: geocode every row that has a location and
    # store the outcome in the cache file.
    df = pd.read_csv(CSV_INPUT, index_col='id')
    dff = df[~ pd.isnull(df.location)]

    # Use the constants from the configuration cell (the previous
    # `USER_AGENT` name was never defined and raised a NameError).
    geolocator = gp.GoogleV3(
        api_key=API_KEY,
        user_agent=API_USER_AGENT,
        timeout=API_TIMEOUT_S,
    )
    geocode = retry_fn(geolocator.geocode)
    parallel_apply(
        dff, partial(process_row, geocode, 'location', 'geolocation')
    ).to_csv(API_CSV_OUTPUT)

# Always load from the cache file, so a fresh run and a cached run hand the
# same kind of frame (plain CSV values, default index) to the cells below.
geolocated = pd.read_csv(API_CSV_OUTPUT)

### Post-Processing

We drop all rows for which the Google API returned no result and remove everything but the country from the result column.
To do so, we split the address returned by Google Maps at its rightmost ',' (at most once) and keep the last part. This yields
the country in most cases.

This heuristic is not reliable for every address, so inspect the result and apply further post-processing if you need to be sure that your data contains no invalid entries.

In [80]:
def cell_fun(cell):
    """
    Return the last comma-separated component of a geocoded address.

    The value is split once at its rightmost ',' and the trailing part is
    returned without surrounding whitespace (e.g. 'Toronto, ON, Canada'
    yields 'Canada'). A value without a comma is returned as a whole.

    Args:
        cell: The address. Non-string values are converted with str()
            first, so missing values should be filtered out beforehand.

    Returns:
        The trailing component as a stripped string.
    """
    tail = str(cell).rsplit(',', maxsplit=1)[-1]
    # Addresses separate components with ', '; drop the leftover space so
    # the same country does not appear under differently padded spellings.
    return tail.strip()

# Filter entries without a geolocation value (NA/Empty).
# NOTE: rows holding the "ERROR" sentinel returned by retry_fn are not NA and
# are therefore kept.
# .copy() yields an independent frame, so adding the column below does not
# write into a slice of the unfiltered data (avoids SettingWithCopyWarning).
geolocated = geolocated[~ pd.isnull(geolocated.geolocation)].copy()

geolocated['country'] = geolocated['geolocation'].apply(cell_fun)

Save all data, sorted by the new `country` column, to `CSV_OUTPUT`.

In [81]:
# Write the geocoded rows, ordered by country, to the output file.
geolocated.sort_values(by='country').to_csv(path_or_buf=CSV_OUTPUT)

In [82]:
# Load the graph tables that are combined with the country data.
edges = pd.read_csv(EDGES_INPUT)
nodes = pd.read_csv(NODES_INPUT)

# Keep only user name and country; the user name is renamed to 'label' so it
# lines up with the key column of the node table. Index labels become strings.
from_users: pd.DataFrame = (
    geolocated
    .loc[:, ['from_user_name', 'country']]
    .rename(index=str, columns={'from_user_name': 'label'})
)
from_users.head()

Unnamed: 0,label,country
0,Hits93Toronto,Canada
1,lumenprize,UK
3,WholeLottaJulie,USA
4,museumsaregreat,UK
5,Hits93Toronto,Canada


In [83]:
# Preview the node table to check its columns before joining on 'label'.
nodes.head()

Unnamed: 0,id,label,n_tweets,n_mentions
0,0,dmond1989,1,0
1,1,tswiftnz,0,24
2,2,wholelottajulie,1,0
3,3,bitchenboutique,3,2
4,4,museumsaregreat,5,0


In [84]:
# The previews above show lower-cased node labels ('wholelottajulie') while
# from_user_name keeps its original capitalisation ('WholeLottaJulie'). A
# plain join on 'label' silently drops such users, so join on a lower-cased
# helper key instead and keep the node table's own 'label' column.
join_key = '_label_lower'
nodes_keyed = nodes.assign(**{join_key: nodes['label'].str.lower()})
users_keyed = (
    from_users
    .assign(**{join_key: from_users['label'].str.lower()})
    .drop(columns='label')
)
nodes_with_country = (
    nodes_keyed
    .merge(users_keyed, on=join_key, how='inner')
    .drop(columns=join_key)
    # A user appears once per tweet in from_users; collapse the repeats.
    .drop_duplicates()
)
nodes_with_country.to_csv(NODES_W_COUNTRY_OUTPUT)
edges.to_csv(EDGES_OUTPUT)