In [71]:
import sys
from operator import itemgetter
import pandas as pd
import pymongo
import json
from bson.json_util import dumps

# this function returns lists of some twitter entities found in a tweet
# for each tweet, this function returns lists of the entities:  mentions, hashtags, URLs
# Parameter:  a tweet (as a Twitter json object)
# Result:  3 lists of the above entities
def get_entities(tweet):
    """Return the mentions, hashtags and URLs found in one tweet.

    Parameters:
        tweet: a tweet as a Twitter JSON object (a dict-like mapping).

    Returns:
        A tuple of three lists ``(mentions, hashtags, urls)``:
          mentions - the 'screen_name' of each user mention
          hashtags - the 'text' of each hashtag
          urls     - the 'url' of every URL entity, followed by the
                     'expanded_url' of every URL entity
        All three lists are empty when the tweet has no 'entities' key,
        and any entity list missing from 'entities' is treated as empty.
    """
    # make sure this is a tweet by checking that it has the 'entities' key
    if 'entities' not in tweet:
        return [], [], []

    # tolerate a null 'entities' value and absent entity lists instead of
    # failing on records that are only partially populated
    entities = tweet['entities'] or {}
    user_mentions = entities.get('user_mentions') or []
    hashtag_items = entities.get('hashtags') or []
    url_items = entities.get('urls') or []

    # list of mentions comes from the 'screen_name' field of each user_mention
    mentions = [user_mention['screen_name'] for user_mention in user_mentions]
    # list of hashtags comes from the 'text' field of each hashtag
    hashtags = [hashtag['text'] for hashtag in hashtag_items]
    # urls holds every short 'url' first, then every 'expanded_url'
    urls = [urlitem['url'] for urlitem in url_items]
    urls += [urlitem['expanded_url'] for urlitem in url_items]
    # we ignore the symbols and optional media entity fields
    return mentions, hashtags, urls
    
# This function gets data from an existing DB and collection
# Parameters:  
#   DBname and DBcollection - the names of the database and collection, either new or existing
# Result:
#   df - a pandas DataFrame with one row per document in the collection and
#        the columns user, created_at, text and hashtags

def load_from_DB(DBname, DBcollection):
    """Load the tweets of a MongoDB collection into a pandas DataFrame.

    Connects to the MongoDB server at localhost:27017, iterates over every
    document of ``DBname.DBcollection`` and keeps four fields of each one.

    Parameters:
        DBname: name of the database.
        DBcollection: name of the collection inside that database.

    Returns:
        A DataFrame with one row per document and the columns
          user       - doc['user']['name']
          created_at - doc['created_at']
          text       - doc['text']
          hashtags   - list of the 'text' of each entry in
                       doc['entities']['hashtags']
        An empty collection yields an empty DataFrame that still has these
        four columns.

    Raises:
        KeyError: if a document lacks one of the fields read above.
        Errors raised by pymongo (e.g. connection failures) are not caught.
    """
    columns = ['user', 'created_at', 'text', 'hashtags']

    # connect to database server and just let connection errors fail the program
    client = pymongo.MongoClient('localhost', 27017)
    try:
        # use the DBname and collection, which will create if not existing
        collection = client[DBname][DBcollection]
        # walk the cursor over all documents, keeping only the fields we need
        rows = [
            [
                doc['user']['name'],
                doc['created_at'],
                doc['text'],
                [tag['text'] for tag in doc['entities']['hashtags']],
            ]
            for doc in collection.find()
        ]
    finally:
        # release the connection even when a document is malformed
        client.close()

    # passing columns= to the constructor (instead of assigning df.columns
    # afterwards) also works when the collection returned no documents
    return pd.DataFrame(rows, columns=columns)

In [72]:

# load all the tweets stored in the 'bbcoll' collection of the 'bball' database
tweet_results = load_from_DB('bball', 'bbcoll')

In [73]:
# preview the first rows to check that the load worked
print(tweet_results.head())

              user                      created_at  \
0             Will  Mon Mar 27 22:30:30 +0000 2017   
1  Kahlen Donatell  Mon Mar 27 22:30:22 +0000 2017   
2            Jesús  Mon Mar 27 22:30:21 +0000 2017   
3            plug,  Mon Mar 27 22:30:14 +0000 2017   
4   Joni Dickerson  Mon Mar 27 22:30:11 +0000 2017   

                                                text             hashtags  
0  RT @HowardWKYT: The final seconds of the Kentu...       [marchmadness]  
1  RT @WhistleSports: When you perfectly time the...  [UNC, MarchMadness]  
2  RT @HowardWKYT: The final seconds of the Kentu...       [marchmadness]  
3  RT @BleacherReport: And then there were four.....       [MarchMadness]  
4  RT @mycarolinastdnt: RT if you'll be cheering ...       [MarchMadness]  
