In [3]:
from pyspark.sql import SparkSession
#from pyspark.sql.functions import *
from pyspark.sql import functions as F
#from googletrans import Translator
from pyspark.sql.window import Window
from pyspark.sql.types import TimestampType
from pyspark.sql.types import DoubleType
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
#from mpl_toolkits.basemap import Basemap
import time
import re

# Create (or reuse) the Spark session for this notebook; the SQL broadcast
# timeout is set to 36000.
spark = (
    SparkSession.builder
    .appName("Large scale human mobility data analysis through social media")
    .config("spark.sql.broadcastTimeout", "36000")
    .getOrCreate()
)

# Load each parquet dataset and expose it to Spark SQL under a short view name.
instagram_df = spark.read.load("/data/instagramdata/instagram_df.parquet", format="parquet")
instagram_df.createOrReplaceTempView("instagram")

twitter_df = spark.read.load("/data/twitterdata/UserTweetInEurope_df.parquet", format="parquet")
twitter_df.createOrReplaceTempView("twitter")

# Keep only the tweets whose text contains an "@" character.
twitter_ego = twitter_df.filter(F.col("text").like("%@%"))
## NOTE: excluding by speed should come before restricting by country.

In [4]:
# Extract all @mentions from the tweet text and construct the reciprocated mention network.
def filter_all(x):
    """Extract every @mention from a text and return them comma-joined.

    A mention is an "@", optionally followed by whitespace, followed by a run
    of word characters; only the word part is kept.

    Args:
        x: Text to scan. ``None`` is treated as text without any mention.

    Returns:
        str: The mentioned names in order of appearance, joined by ",".
        The empty string when nothing matches.
    """
    if x is None:
        return ''
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequences "\@" and "\s". "\s*" accepts the same optional whitespace as
    # the former "(\s|)+" group, and a single capture group lets findall
    # return the names directly.
    return ','.join(re.findall(r'@\s*(\w+)\b', x))
# Wrap the mention extractor as a Spark UDF (no explicit return type is given).
filter_all_udf = F.udf(filter_all)

# Add a 'mention' column holding the comma-joined @mentions of each tweet.
twitter_ego = twitter_ego.withColumn(
    'mention',
    filter_all_udf(F.col('text')).alias('mention'),
)
twitter_ego.createOrReplaceTempView("twitter_ego")

# Split the joined mentions and explode them: one row per
# (mention, user_name, media_id).
twitter_ego1 = twitter_ego.select(
    F.explode(F.split(F.col("mention"), ",")).alias("mention"),
    F.col("user_name"),
    F.col("media_id"),
)
twitter_ego1.createOrReplaceTempView("twitter_ego1")

# Directed edge list: author -> mentioned name, weighted by count(media_id).
twitter_ego2 = spark.sql("select user_name as source,mention as target,count(media_id) as weight from twitter_ego1 group by user_name,mention")
twitter_ego2.createOrReplaceTempView("twitter_ego2")

# Self-join the edge list to keep only reciprocated pairs (a -> b and b -> a),
# excluding self-mentions.
twitter_ego3 = spark.sql("select a.source,a.target as target1,b.target as target2,a.weight as weight1,b.weight as weight2 from twitter_ego2 a join twitter_ego2 b on a.target = b.source where a.source = b.target and a.source != a.target")
# The reciprocated weight is the smaller of the two directed weights; only
# (source, target1, weight) is kept.
twitter_ego3 = (
    twitter_ego3
    .withColumn('weight', F.least(F.col('weight1'), F.col('weight2')).alias("weight"))
    .select('source', 'target1', 'weight')
)
twitter_ego3.createOrReplaceTempView("twitter_ego3")

# All tweets written by users that appear as a source in the reciprocated network.
twitter_ego4 = spark.sql("select * from twitter where user_name in (select distinct source from twitter_ego3)")
twitter_ego4.createOrReplaceTempView("twitter_ego4")

In [8]:
# Extract a "lat,lon" coordinate pair from the free-text profile location
# (empty string when the text contains none). The decimal points are escaped:
# the previous pattern used a bare "." which matches ANY character, so text
# such as "12x345,67y890" was accepted as a coordinate pair.
user_location = twitter_ego4.withColumn(
    'user_location_with_geo',
    F.regexp_extract('user_location', r'(-?\d+\.\d{3,}),\s*(-?\d+\.\d{3,})', 0),
)
# Disabled filter: keep only rows whose location contains coordinates.
#user_location = user_location.filter(user_location['user_location_with_geo'] != '')
user_location.createOrReplaceTempView("user_location")

# One row per distinct (user, free-text location). This select does not carry
# the user_location_with_geo column forward; it remains available in the view.
user_location = spark.sql("select distinct user_name as source,user_location from user_location")
# Disabled filter: drop users without a profile location.
#user_location = user_location.filter(user_location['user_location'].isNotNull())

# Collect to the driver as mutable [source, user_location] lists so that the
# entries can later be updated in place.
user_location_rdd = user_location.rdd.map(lambda row: [row[0], row[1]])
user_location_list = user_location_rdd.collect()

In [9]:
import geocoder 
def geocode(list):
    """Forward-geocode the location of every entry in place, then print the list.

    Each entry is a mutable two-item sequence ``[key, location]``. For every
    entry whose location is not ``None``, ``geocoder.google`` is called with
    it and ``entry[1]`` is replaced by:

    * ``[latlng, "city,country"]`` when the lookup yields a city, or
    * ``None`` when it does not.

    Entries whose location is already ``None`` are left untouched and cause
    no lookup.

    Args:
        list: Sequence of ``[key, location]`` entries, modified in place.
            (The parameter name shadows the builtin; it is kept unchanged for
            compatibility with existing callers.)

    Returns:
        None. The updated sequence is printed.
    """
    for entry in list:
        location = entry[1]
        if location is None:
            # Nothing to look up.
            continue
        g = geocoder.google(location)
        if g.city is None:
            # Lookup produced no city: mark the entry as unresolved.
            entry[1] = None
        else:
            # Build the label locally rather than attaching a new attribute
            # to the geocoder result object.
            entry[1] = [g.latlng, '{},{}'.format(g.city, g.country)]
    print(list)
def reverse_geocode(list):
    """Reverse-geocode the coordinates of every entry in place, then print the list.

    Each entry is a mutable two-item sequence ``[key, coords]``. For every
    entry whose coords are not ``None``, ``geocoder.google`` is called with
    ``method='reverse'`` and ``entry[1]`` is replaced by ``[coords, label]``,
    where ``label`` is ``"city,country"`` when the lookup yields a city and
    ``None`` otherwise.

    Entries whose coords are ``None`` are left untouched and cause no lookup,
    mirroring how ``geocode`` treats missing locations.

    Args:
        list: Sequence of ``[key, coords]`` entries, modified in place.
            (The parameter name shadows the builtin; it is kept unchanged for
            compatibility with existing callers.)

    Returns:
        None. The updated sequence is printed.
    """
    for entry in list:
        coords = entry[1]
        if coords is None:
            # No coordinates to resolve.
            continue
        g = geocoder.google(coords, method='reverse')
        # Previously an unresolved lookup was stored as the literal string
        # 'None,None'; store None so unresolved entries stay distinguishable.
        if g.city is None:
            label = None
        else:
            label = '{},{}'.format(g.city, g.country)
        entry[1] = [coords, label]
    print(list)

In [None]:
# Forward-geocode the collected [source, user_location] entries in place;
# geocode() prints the updated list when it finishes.
geocode(user_location_list)