# Converting geotagged tweets to GeoJSON

1. Connect to the database and identify all tweets that are geolocated to a Polygon rather than a point.
1. Group these together and generate a GeoJSON Feature Collection of Polygons

In [1]:
import pymongo, json, pprint, urllib.request, os.path, sys, json
import pandas as pd

In [2]:
# Load the run configuration. The context manager closes the file handle as
# soon as the JSON has been parsed instead of leaving it open for the life of
# the kernel.
with open('tweets-on-a-map.config', 'r') as config_file:
    config = json.load(config_file)
pprint.pprint(config)

{'client': 'epic-analytics.cs.colorado.edu',
 'collection': 'tweets',
 'database': 'matthew',
 'img_root': 'http://epic-analytics.cs.colorado.edu:9000/jennings/infovis/map_images',
 'start_date': '2016-9-25',
 'web_root': '/data/www/jennings/infovis'}


Connect to the database, per the config information

In [3]:
# Connect to the MongoDB host, database and collection named in the config.
client = pymongo.MongoClient('mongodb://'+config['client'])
db = client[config['database']]
tweets = db[config['collection']]
# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0.
# estimated_document_count() is the replacement for an unfiltered count; it
# answers from collection metadata rather than scanning every document.
print("Found {0} tweets ".format(tweets.estimated_document_count()))

Found 13830253 tweets 


# `Location` tag, not point locations

1. Query for just tweets with images (photos) and a `location.geo` field then put them into a dataframe

In [4]:
# Mongo filter: activities whose verb is "post", that carry media including a
# photo, and that have coordinates under `location.geo`.
geo_image_tweets = {
    "verb":"post",
    "twitter_extended_entities.media.0":{"$exists":True}, # at least 1 media entity
    "twitter_extended_entities.media": {"$all":[{"$elemMatch": { "type": "photo" }}]}, # at least one media entity is a photo ($all + $elemMatch does not require every entity to match)
    "location.geo.coordinates":{"$exists":True} # has geolocation (not necessarily a point)
}
# The whole cursor is materialised into a list, giving one DataFrame row per matching document.
df = pd.DataFrame(list(tweets.find(geo_image_tweets)))

Check that this worked:

In [5]:
# Row count first, then a two-row preview of the frame.
print(df.shape[0])
df.iloc[:2]

23789


Unnamed: 0,_id,actor,body,datetime,display_text_range,favoritesCount,generator,geo,gnip,id,...,object,objectType,postedTime,provider,retweetCount,twitter_entities,twitter_extended_entities,twitter_filter_level,twitter_lang,verb
0,58c1e007d8991e9a470000a9,"{'utcOffset': '-18000', 'statusesCount': 25329...",Mood: 3 months from now when we touchdown in J...,{'$date': 1474761687000},"[0, 70]",0,"{'link': 'http://twitter.com/download/iphone',...",,{'matching_rules': [{'id': 9056142625283655206...,"tag:search.twitter.com,2005:779833156773871616",...,"{'id': 'object:search.twitter.com,2005:7798331...",activity,2016-09-25 00:01:27,"{'link': 'http://www.twitter.com', 'displayNam...",0,"{'urls': [], 'symbols': [], 'media': [{'media_...",{'media': [{'media_url_https': 'https://pbs.tw...,low,en,post
1,58c1e008d8991e9a470001cc,"{'utcOffset': None, 'statusesCount': 22931, 'l...",ACTUALIZACIÓN \nBoletín 8pm \nEl CNH aumenta a...,{'$date': 1474761856000},"[0, 116]",0,{'link': 'http://twitter.com/download/android'...,,{'matching_rules': [{'id': 2199701286811376328...,"tag:search.twitter.com,2005:779833865082843136",...,"{'id': 'object:search.twitter.com,2005:7798338...",activity,2016-09-25 00:04:16,"{'link': 'http://www.twitter.com', 'displayNam...",0,"{'urls': [], 'symbols': [], 'media': [{'media_...",{'media': [{'media_url_https': 'https://pbs.tw...,low,es,post


Create a hashable key to group on (we can't group by the `location` column directly, as the dicts it holds are not hashable :) )

In [6]:
# Stringify each place's coordinate list so it can be used as a hashable group key.
df['location_string'] = df['location'].map(
    lambda place: str(place['geo']['coordinates'])
)

Now group by geometry. This is safe because `groupby` preserves the original row order within each group, so the aggregated per-tweet lists stay aligned with one another:
http://stackoverflow.com/questions/26456125/python-pandas-is-order-preserved-when-using-groupby-and-agg


In [7]:
# Reference date used to turn each tweet's timestamp into a whole-day offset.
start_date = pd.Timestamp(config['start_date'])


def _tweet_ids(activity_ids):
    """Return the third ':'-separated field of every activity id in the group."""
    return [activity_id.split(":")[2] for activity_id in activity_ids.values]


def _as_list(column):
    """Collect a group's values into a plain list, keeping row order."""
    return list(column.values)


def _days_since_start(posted_times):
    """Return the whole days elapsed between `start_date` and each posted time."""
    seconds_per_day = 3600 * 24
    return [
        int((pd.Timestamp(posted) - start_date).total_seconds() / seconds_per_day)
        for posted in posted_times.values
    ]


def _usernames(actors):
    """Return the `preferredUsername` of every actor dict in the group."""
    return [actor['preferredUsername'] for actor in actors.values]


def _first_geometry(places):
    """Return the `geo` dict of the group's first location."""
    # Rows are grouped on the stringified coordinates, so every row in a
    # group carries the same geometry and the first one is representative.
    return places.values[0]['geo']


def _first_display_name(places):
    """Return the `displayName` of the group's first location."""
    return places.values[0]['displayName']


# One row per distinct polygon. Named aggregation (output_name=(column, func))
# replaces the nested-dict renaming form of `aggregate`, which pandas
# deprecated in 0.20 and removed in 1.0 (it raises SpecificationError there).
# The result already has flat column names, so no `columns.droplevel()` call
# is needed afterwards. Named functions are used instead of lambdas so that
# two aggregations of the same column never collide on the name `<lambda>`.
gb_loc = df.groupby('location_string').agg(
    Tweets=('id', 'count'),
    IDs=('id', _tweet_ids),
    text=('body', _as_list),
    day=('postedTime', _days_since_start),
    user=('actor', _usernames),
    geojson=('location', _first_geometry),
    displayName=('location', _first_display_name),
)

Check that this worked:

In [10]:
# Preview the two polygons with the highest tweet counts.
gb_loc.sort_values('Tweets', ascending=False).iloc[:2]

Unnamed: 0_level_0,IDs,Tweets,text,user,day,geojson,displayName
location_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"[[[-87.634643, 24.396308], [-87.634643, 31.001056], [-79.974307, 31.001056], [-79.974307, 24.396308]]]","[780024297909653504, 780103153597423616, 78013...",1419,[9/25/2016 8:00 AM EDT Tropical Weather Update...,"[TheWxReporter, karadapena, EricaABryan, hanna...","[0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 4, 4, 4, 4, ...","{'coordinates': [[[-87.634643, 24.396308], [-8...","Florida, USA"
"[[[-84.952008, 19.82646], [-84.952008, 23.594925], [-74.131649, 23.594925], [-74.131649, 19.82646]]]","[779838282410528768, 779845609415118849, 77984...",958,"[Seguimos..! Camagüey, cuba! https://t.co/HxkA...","[benq_09, KarelBecerra, KarelBecerra, JLucasMo...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","{'coordinates': [[[-84.952008, 19.82646], [-84...",Cuba


Now build a GeoJSON Feature object for each region. This is not atomic; it creates a copy to work with.

In [11]:
# Build one GeoJSON Feature per grouped polygon.
regions = []
for _, row in gb_loc.iterrows():

    # Copy the geometry before touching it. DataFrame.copy() does not copy the
    # dicts stored inside cells, so appending to row['geojson'] directly would
    # mutate the object shared with gb_loc and add one more closing point
    # every time this cell is re-run.
    geometry = dict(row['geojson'])
    rings = [list(ring) for ring in geometry['coordinates']]

    # Twitter's bounding boxes list only the four corners; a valid polygon
    # ring must end on its first position. Close it only if it is still open.
    outer_ring = rings[0]
    if outer_ring and outer_ring[0] != outer_ring[-1]:
        outer_ring.append(outer_ring[0])
    geometry['coordinates'] = rings

    # The per-tweet lists are parallel (same length, same row order), so they
    # can be walked together instead of through a shared index.
    tweets_in_region = [
        {'id': tweet_id, 'text': text, 'user': user, 'day': day}
        for tweet_id, text, user, day in zip(
            row['IDs'], row['text'], row['user'], row['day']
        )
    ]

    regions.append({
        'type': 'Feature',
        'properties': {
            # plain int so the value is always JSON-serialisable
            'count': int(row['Tweets']),
            'tweets': tweets_in_region,
            'displayName': row['displayName'],
        },
        'geometry': geometry,
    })

In [12]:
# Number of Feature dicts collected in `regions`.
print(len(regions))

4525


Store this as a geojson feature collection

In [13]:
# Wrap the per-region Features in a single FeatureCollection object.
geojson_feature_collection = dict(type="FeatureCollection", features=regions)

Write this feature collection to disk

In [14]:
# Write under the configured web root rather than repeating its absolute path
# here, so the output location follows the config file.
output_path = os.path.join(config['web_root'], 'image-tweets-by-polygon.geojson')
with open(output_path, 'w') as out:
    json.dump(geojson_feature_collection, out)

<br><br><br><br><br><br><br><hr>
## Troubleshooting

In [13]:
# Inspect the first Feature in `regions`.
# NOTE(review): the saved output under this cell shows 'thumb' and 'time' keys,
# which the build loop above does not produce ('id', 'text', 'user', 'day') —
# it appears to come from an earlier run; re-execute to refresh it.
regions[0]

{'geometry': {'coordinates': [[[-0.019481, 51.525469],
    [-0.019481, 51.564174],
    [0.069473, 51.564174],
    [0.069473, 51.525469],
    [-0.019481, 51.525469]]],
  'type': 'Polygon'},
 'properties': {'count': 4,
  'displayName': 'Stratford, London',
  'tweets': [{'id': '783803471657046020',
    'text': '#prayforhaiti https://t.co/wOh1jIlFBW',
    'thumb': 'http://epic-analytics.cs.colorado.edu:9000/jennings/infovis/map_images/thumb/783803471657046020.jpg',
    'time': '2016-10-05T22:58:04.000000000',
    'user': 'symnicola'},
   {'id': '785327831375089665',
    'text': '#ebook \n#talent \n#jamaica \n#screenwriting\n#Scotland \n#Soldier\n#Police\n#filmmaking \nhttps://t.co/hZXw96olfh https://t.co/mJZRxeQKE0',
    'thumb': 'http://epic-analytics.cs.colorado.edu:9000/jennings/infovis/map_images/thumb/785327831375089665.jpg',
    'time': '2016-10-10T03:55:20.000000000',
    'user': 'rhpanton'},
   {'id': '786588675731628032',
    'text': 'Enter our @Crowdrise sweepstakes &amp; help ch