# Converting geotagged tweets to GeoJSON
(Reference: https://github.com/INFO-4602-5602/final-project-tweetsonamap/issues/4)

In [211]:
import pymongo, json, pprint, urllib.request, os.path, sys
import pandas as pd

Connect to the DB

    # Connect to the remote MongoDB host and select the `matthew` database.
    client = pymongo.MongoClient('mongodb://epic-analytics.cs.colorado.edu')
    db = client.matthew
    # List the collections in that database.
    # NOTE(review): collection_names() and Collection.count() are reported as
    # deprecated in newer pymongo releases — confirm against the installed version.
    print(db.collection_names())
    tweets = db.tweets
    # Print how many documents the `tweets` collection holds.
    print(tweets.count(), "tweets")

Where are the thumbnail images stored?

In [3]:
# Base URL that is prepended to "<tweet id>.jpg" to locate a tweet's thumbnail.
thumbnail_prefix = (
    "http://epic-analytics.cs.colorado.edu:9000"
    "/jennings/infovis/map_images/"
)

In [212]:
# Connect to the MongoDB server on localhost and take a handle on the
# `tweets` collection of the `matthew` database.
client = pymongo.MongoClient(host='localhost')
tweets = client.matthew['tweets']

### Example: Point Locations

# `Location` tag, not point locations

1. Query for just tweets with images (photos) and a `location.geo` field then put them into a dataframe

In [214]:
# MongoDB filter for the tweets that will be mapped. Every condition must hold.
geo_image_tweets = {
    # Only activities whose verb is "post".
    "verb": "post",
    # The media array has an element at index 0, i.e. at least one media entity.
    "twitter_extended_entities.media.0": {"$exists": True},
    # At least one media entity has type "photo". ($all with a single
    # $elemMatch requires one matching element, not that every element match.)
    "twitter_extended_entities.media": {
        "$all": [
            {"$elemMatch": {"type": "photo"}},
        ],
    },
    # The tweet carries location coordinates (not necessarily a point).
    "location.geo.coordinates": {"$exists": True},
}

# Pull every matching document into memory and load them into a DataFrame.
matching_tweets = list(tweets.find(geo_image_tweets))
df = pd.DataFrame(matching_tweets)

Check that this worked:

In [216]:
# Print the number of matching tweets, then preview the first two rows.
print(df.shape[0])
df.head(n=2)

23789


Unnamed: 0,_id,actor,body,datetime,display_text_range,favoritesCount,generator,geo,gnip,id,...,object,objectType,postedTime,provider,retweetCount,twitter_entities,twitter_extended_entities,twitter_filter_level,twitter_lang,verb
0,58c1e007d8991e9a470000a9,"{'links': [{'rel': 'me', 'href': None}], 'pref...",Mood: 3 months from now when we touchdown in J...,{'$date': 1474761687000},"[0, 70]",0,"{'displayName': 'Twitter for iPhone', 'link': ...",,{'matching_rules': [{'id': 9056142625283655206...,"tag:search.twitter.com,2005:779833156773871616",...,{'summary': 'Mood: 3 months from now when we t...,activity,2016-09-25 00:01:27,"{'link': 'http://www.twitter.com', 'displayNam...",0,"{'symbols': [], 'media': [{'type': 'photo', 'i...","{'media': [{'type': 'photo', 'id_str': '779833...",low,en,post
1,58c1e008d8991e9a470001cc,"{'links': [{'rel': 'me', 'href': 'https://m.fa...",ACTUALIZACIÓN \nBoletín 8pm \nEl CNH aumenta a...,{'$date': 1474761856000},"[0, 116]",0,"{'displayName': 'Twitter for Android', 'link':...",,{'matching_rules': [{'id': 2199701286811376328...,"tag:search.twitter.com,2005:779833865082843136",...,{'summary': 'ACTUALIZACIÓN Boletín 8pm El CN...,activity,2016-09-25 00:04:16,"{'link': 'http://www.twitter.com', 'displayNam...",0,"{'symbols': [], 'media': [{'type': 'photo', 'i...","{'media': [{'type': 'photo', 'id_str': '779833...",low,es,post


Create a string key to group on (the coordinate lists themselves are not hashable, so use their string form)

In [217]:
# Use the string form of each tweet's location coordinates as the grouping key.
df['location_string'] = df['location'].apply(
    lambda place: str(place['geo']['coordinates'])
)

Now group by geometry. This is safe because `groupby` preserves the order of rows within each group, so the per-tweet lists built below (IDs, text, time, user) stay aligned with one another:
http://stackoverflow.com/questions/26456125/python-pandas-is-order-preserved-when-using-groupby-and-agg


In [218]:
# Collapse the tweets into one row per distinct location geometry.
#
# The nested-dict renaming form of `aggregate` ({column: {new_name: func}})
# was deprecated in pandas 0.20 and raises SpecificationError from pandas 1.0
# onwards, so the per-group values are collected explicitly instead. This
# works on both old and current pandas and yields the same flat columns.
gb_loc_columns = ['Tweets', 'IDs', 'text', 'time', 'user', 'geojson', 'displayName']

location_keys = []
location_rows = []
for location_key, group in df.groupby('location_string'):
    # Every row in the group shares the same coordinates, so the first
    # location supplies the geometry and display name for the whole group.
    first_location = group['location'].values[0]

    location_keys.append(location_key)
    location_rows.append({
        # Number of non-null tweet ids in the group.
        'Tweets'      : int(group['id'].count()),
        # Ids look like "tag:search.twitter.com,2005:<numeric id>"; keep the numeric part.
        'IDs'         : [tweet_uri.split(":")[2] for tweet_uri in group['id'].values],
        'text'        : list(group['body'].values),
        'time'        : [str(posted) for posted in group['postedTime'].values],
        'user'        : [actor['preferredUsername'] for actor in group['actor'].values],
        'geojson'     : first_location['geo'],
        'displayName' : first_location['displayName'],
    })

# Passing `columns` keeps the expected columns present even when no tweets matched.
gb_loc = pd.DataFrame(
    location_rows,
    index=pd.Index(location_keys, name='location_string'),
    columns=gb_loc_columns,
)

Check that this worked:

In [219]:
gb_loc.sort_values(by='Tweets', ascending=False).head(2)

Unnamed: 0_level_0,user,IDs,Tweets,text,displayName,geojson,time
location_string,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"[[[-87.634643, 24.396308], [-87.634643, 31.001056], [-79.974307, 31.001056], [-79.974307, 24.396308]]]","[TheWxReporter, karadapena, EricaABryan, hanna...","[780024297909653504, 780103153597423616, 78013...",1419,[9/25/2016 8:00 AM EDT Tropical Weather Update...,"Florida, USA","{'type': 'Polygon', 'coordinates': [[[-87.6346...","[2016-09-25T12:40:59.000000000, 2016-09-25T17:..."
"[[[-84.952008, 19.82646], [-84.952008, 23.594925], [-74.131649, 23.594925], [-74.131649, 19.82646]]]","[benq_09, KarelBecerra, KarelBecerra, JLucasMo...","[779838282410528768, 779845609415118849, 77984...",958,"[Seguimos..! Camagüey, cuba! https://t.co/HxkA...",Cuba,"{'type': 'Polygon', 'coordinates': [[[-84.9520...","[2016-09-25T00:21:49.000000000, 2016-09-25T00:..."


Now build GeoJSON Objects for each region

In [220]:
def _closed_geometry(geometry):
    """Return a copy of a GeoJSON geometry with its first polygon ring closed.

    The stored polygons list their corners without repeating the first
    position at the end, but GeoJSON linear rings must end on the position
    they start with. The input dict is left untouched: the original code
    appended to it in place, and because that dict is shared with `gb_loc`
    (and the source DataFrame), re-running the cell added another closing
    point each time.

    Non-polygon geometries and empty coordinate lists are returned as a
    shallow copy without modification.
    """
    closed = dict(geometry)
    if closed.get('type') == 'Polygon' and closed.get('coordinates'):
        rings = [list(ring) for ring in closed['coordinates']]
        if rings[0]:
            rings[0].append(rings[0][0])
        closed['coordinates'] = rings
    return closed


# Build one GeoJSON Feature per distinct location, carrying its tweets.
regions = []
for location_key, row in gb_loc.iterrows():

    # The per-tweet lists are index-aligned within a group, so walk them together.
    region_tweets = [
        {
            'id': tweet_id,
            'thumb': thumbnail_prefix + tweet_id + '.jpg',
            'text': text,
            'user': user,
            'time': posted,
        }
        for tweet_id, text, user, posted in zip(
            row['IDs'], row['text'], row['user'], row['time']
        )
    ]

    regions.append({
        'type': 'Feature',
        'properties': {
            # Plain int so the value is always JSON-serializable.
            'count': int(row['Tweets']),
            'tweets': region_tweets,
            'displayName': row['displayName'],
        },
        'geometry': _closed_geometry(row['geojson']),
    })

Store this as a geojson feature collection

In [222]:
# Wrap the per-region features in a GeoJSON FeatureCollection object.
geojson_feature_collection = dict(
    type="FeatureCollection",
    features=regions,
)

Write this feature collection to disk

In [223]:
# Serialize the feature collection as JSON and write it to disk.
geojson_path = '/data/www/jennings/infovis/image-tweets-by-polygon.geojson'
with open(geojson_path, mode='w') as geojson_file:
    json.dump(geojson_feature_collection, fp=geojson_file)

<br><br><br><br><br><br><br><hr>
## Troubleshooting

In [226]:
# Display the first generated feature to sanity-check its structure
# (closed polygon ring, properties, and per-tweet entries).
regions[0]

{'geometry': {'coordinates': [[[-0.019481, 51.525469],
    [-0.019481, 51.564174],
    [0.069473, 51.564174],
    [0.069473, 51.525469],
    [-0.019481, 51.525469]]],
  'type': 'Polygon'},
 'properties': {'count': 4,
  'displayName': 'Stratford, London',
  'tweets': [{'id': '783803471657046020',
    'text': '#prayforhaiti https://t.co/wOh1jIlFBW',
    'thumb': 'http://epic-analytics.cs.colorado.edu:9000/jennings/infovis/map_images/783803471657046020.jpg',
    'time': '2016-10-05T22:58:04.000000000',
    'user': 'symnicola'},
   {'id': '785327831375089665',
    'text': '#ebook \n#talent \n#jamaica \n#screenwriting\n#Scotland \n#Soldier\n#Police\n#filmmaking \nhttps://t.co/hZXw96olfh https://t.co/mJZRxeQKE0',
    'thumb': 'http://epic-analytics.cs.colorado.edu:9000/jennings/infovis/map_images/785327831375089665.jpg',
    'time': '2016-10-10T03:55:20.000000000',
    'user': 'rhpanton'},
   {'id': '786588675731628032',
    'text': 'Enter our @Crowdrise sweepstakes &amp; help children and f