# Putting all of the pieces together, this Notebook handles GNIP GeoJSONL files

In [133]:
import json, pprint, os, io, sys, PIL,urllib.request
import pandas as pd; import numpy as np
from IPython.display import display, Image
from urllib.request import urlopen

Where is the configuration file?

In [13]:
config_file = "hajj.config"

In [46]:
config = json.load(open(config_file,'r'))
start_date = pd.Timestamp(config['start_date'])
pprint.pprint(config)

{'start_date': '2016-09-07',
 'tweets': '/data/hajj/gnip_tweets_full.jsonl',
 'web_root': '/data/www/tweetsonamap/hajj'}


## Step 1: Read in jsonl file.

In [80]:
tweets = []
with open(config['tweets'],'r') as inFile:
    for line in inFile:
        tweets.append(json.loads(line.strip()))
df = pd.DataFrame(tweets)
df = df[df['twitter_extended_entities'].notnull()]
df['id'] = df.id.apply(lambda x: x.split(":")[2])

Two types of tweets: geo-tagged and geolocated. Identify these

In [81]:
geotagged_df  = df[df.geo.notnull()].copy()
geolocated_df = df[df.geo.isnull()].copy()

"Identified {0} geotagged tweets and {1} geolocated tweets ({2} total)".format(len(geotagged_df),len(geolocated_df), len(geolocated_df)+len(geotagged_df))

'Identified 86 geotagged tweets and 1962 geolocated tweets (2048 total)'

## Step 2: Process all of the images

First, get the raw img_url for all of the tweets

In [138]:
df['img_url'] = df.twitter_extended_entities.apply(lambda r: r['media'][0]['media_url'])

In [139]:
if not os.path.exists(config['web_root']+"/map_images"):
    os.makedirs(config['web_root']+"/map_images")

map_image_dir = config['web_root']+"/map_images/"

In [140]:
sizes = {"thumb":60,
         "small":150,
         "medium":300,
         "large":400,
         "original":-1 #basewidth value won't be used
        }

def download_and_resize(url, tweet_id):
    """
        Input: URL to image
        Returns: None
    """
    
    img_name = tweet_id + ".jpg"
    
    
    try:
        with urllib.request.urlopen(url) as img:
            f = io.BytesIO(img.read())
            orig_img = PIL.Image.open(f)
    except:
        e = sys.exc_info()[0]
        print(e,"Error on:",url)
        print(sys.exc_info())

    # Iterate over each size/basewidth
    for size,basewidth in sizes.items():

        # Make directory within map_image_dir for each size
        if not os.path.exists(map_image_dir+size):
            os.makedirs(map_image_dir+size)
            print("making directory for",size)

        # Set filename to save resized image and break to the next image if it already exists
        new_path = map_image_dir+size+"/"+img_name
        if os.path.isfile(new_path):
            break

        # Don't resize images in original folder
        if size == "original":
            orig_img.save(new_path)
        else:
            try:
                # Set height proportional to fixed basewidth from 
                # https://opensource.com/life/15/2/resize-images-python
                wpercent = (basewidth / float(orig_img.size[0]))
                hsize = int((float(orig_img.size[1]) * float(wpercent))) 
                resized_img = orig_img.resize((basewidth, hsize), PIL.Image.ANTIALIAS)
                resized_img.save(new_path)
#                     display(Image(filename=orig_path, width=basewidth, height=hsize)) # display resized image
            except:
                e = sys.exc_info()[0]
                print(e,"Error on:",img_name)
                break

In [None]:
for idx, tweet in df.iterrows():
    download_and_resize(tweet.img_url, tweet.id)
    sys.stderr.write("\r {0}: {1}, {2}".format(idx, tweet.id, tweet.img_url))

 37: 773490012331577344, http://pbs.twimg.com/media/CrXkDEuXYAQZg8Q.jpg

<class 'urllib.error.HTTPError'> Error on: http://pbs.twimg.com/media/CrXkDEuXYAQZg8Q.jpg
(<class 'urllib.error.HTTPError'>, HTTPError(), <traceback object at 0x7f86d3d0de48>)
<class 'UnboundLocalError'> Error on: 773490012331577344.jpg


 669: 774633256087990272, http://pbs.twimg.com/media/CsAM1T-UsAA62z6.jpg

## Step 2: Write these geojson files

### First, the geotagged tweets

In [None]:
feats = []
for idx, tweet in geotagged.iterrows():
    if os.path.exists(map_image_dir+"/thumb/"+tweet.id+".jpg"):
        jsonObj = {  "type"       : "Feature",
                     "geometry"   : {  "type" : "Point", 
                                       "coordinates" : list(reversed(t['geo']['coordinates']))},
                     "properties" : {  "id" : tweet_id,
                                       "user" : t['actor']['preferredUsername'],
                                       "text" : t['body'],
                                       "day" : int( ( pd.Timestamp(t['postedTime']) - start_date ).total_seconds() /(3600*24) )
           }
         }
        feats.append(jsonObj)

In [None]:
with open(config['web_root']+"/geotagged-tweets.geojson','w') as outFile:
    json.dump({'type':"FeatureCollection","features":feats},outFile)

### Second, the geolocated tweets

In [None]:
geolocated_df['location_string'] = geolocated_df.location.apply(lambda x: str(x['geo']['coordinates']))

In [None]:
gb_loc = geolocated_df.groupby('location_string').aggregate({
        'id' : {
            'Tweets' : 'count',
            'IDs'    : lambda x: [y.split(":")[2] for y in x.values]
         },
        'body': {
            'text'   : lambda x: [y for y in x.values]
         },
        'postedTime' : {
            'day'   : lambda x: [int( ( pd.Timestamp(y) - start_date ).total_seconds() /(3600*24) ) for y in x.values]
         },
        'actor'      : {
            'user'   : lambda x: [y['preferredUsername'] for y in x.values]
         },
        'location' : {
            'geojson'     : lambda x: x.values[0]['geo'],
            'displayName' : lambda x: x.values[0]['displayName']
         }
    })
gb_loc.columns = gb_loc.columns.droplevel()

In [None]:
gb_loc.sort_values(by='Tweets', ascending=False).head(2)

In [None]:
regions = []
for idx, row in gb_loc.copy().iterrows():
    
    json_obj = {'type':'Feature',
                'properties' : {
                        'count'       : row['Tweets'],
                        'tweets'      : [],
                        'displayName' : row['displayName']
                },
                'geometry'  : row['geojson']
               }
    
    #Stupid Twitter --> They only have 4 coordinates, need to make it a valid polygon
    json_obj['geometry']['coordinates'][0].append(json_obj['geometry']['coordinates'][0][0])
    
    for idx, tweet_id in enumerate(row['IDs']):
        
        json_obj['properties']['tweets'].append({
                 'id': tweet_id,
                 'text': row['text'][idx],
                 'user': row['user'][idx],
                 'day':  row['day'][idx]
            })
        
    regions.append(json_obj)
print(len(regions))

In [None]:
geojson_feature_collection = {'type':"FeatureCollection", "features" : regions}

In [None]:
with open(config['web_root']+'/image-tweets-by-polygon.geojson','w') as out:
    json.dump(geojson_feature_collection,out)