# Collect Tweets into MongoDB

## Install Python libraries

### You may need to restart your Jupyter Notebook instance after installed those libraries.

In [1]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-3.12.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (506 kB)
[K     |████████████████████████████████| 506 kB 29.2 MB/s eta 0:00:01
[?25hInstalling collected packages: pymongo
Successfully installed pymongo-3.12.1
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
!pip install pymongo[srv]

Collecting dnspython<3.0.0,>=1.16.0
  Downloading dnspython-2.1.0-py3-none-any.whl (241 kB)
[K     |████████████████████████████████| 241 kB 31.2 MB/s eta 0:00:01
[?25hInstalling collected packages: dnspython
Successfully installed dnspython-2.1.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
!pip install dnspython

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
!pip install tweepy

Collecting tweepy
  Downloading tweepy-4.3.0-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 4.2 MB/s  eta 0:00:01
[?25hCollecting requests-oauthlib<2,>=1.0.0
  Downloading requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
Collecting oauthlib>=3.0.0
  Downloading oauthlib-3.1.1-py2.py3-none-any.whl (146 kB)
[K     |████████████████████████████████| 146 kB 70.0 MB/s eta 0:00:01
[?25hInstalling collected packages: oauthlib, requests-oauthlib, tweepy
Successfully installed oauthlib-3.1.1 requests-oauthlib-1.3.0 tweepy-4.3.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
!pip install twitter

Collecting twitter
  Downloading twitter-1.19.3-py2.py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 1.1 MB/s  eta 0:00:01
[?25hInstalling collected packages: twitter
Successfully installed twitter-1.19.3
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


## Import Python libraries


In [6]:
import pymongo
from pymongo import MongoClient
import json
import tweepy
import twitter
from pprint import pprint
import configparser
import pandas as pd

## Load the Authorization Info


### Save database connection info and API Keys in a config.ini file and use the configparse to load the authorization info.

In [7]:
config = configparser.ConfigParser()
config.read('config.ini')

CONSUMER_KEY      = config['mytwitter']['api_key']
CONSUMER_SECRET   = config['mytwitter']['api_secrete']
OAUTH_TOKEN       = config['mytwitter']['access_token']
OATH_TOKEN_SECRET = config['mytwitter']['access_secrete']

mongod_connect = config['mymongo']['connection']


## Connect to the MongoDB Cluster

In [8]:
client = MongoClient(mongod_connect)
db = client.demo # use or create a database named demo
tweet_collection = db.tweet_collection #use or create a collection named tweet_collection
tweet_collection.create_index([("id", pymongo.ASCENDING)],unique = True) # make sure the collected tweets are unique

ConfigurationError: The DNS query name does not exist: _mongodb._tcp.freeclusterjd-ffp4c.mongodb.net.

## Use the Streaming API to Collect Tweets

### Authorize the Stream API

In [9]:
stream_auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
stream_auth.set_access_token(OAUTH_TOKEN, OATH_TOKEN_SECRET)

strem_api = tweepy.API(stream_auth)

### Define the query for the Stream API

In [10]:
track = ['vaccine'] # define the keywords, tweets contain vaccine

locations = [-78.9326449,38.4150904,-78.8816972,38.4450731] #define the location, in Harrisonburg, VA

### The collected tweets will contain 'election' OR are located in Harrisonburg, VA

In [11]:
class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        print (status.id_str)
        try:
            tweet_collection.insert_one(status._json)
        except:
            pass
  
    def on_error(self, status_code):
        if status_code == 420:
            #returning False in on_data disconnects the stream
            return False
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = strem_api.auth, listener=myStreamListener)
myStream.filter(track=track)#  (locations = locations)   #Use either track or locations

AttributeError: module 'tweepy' has no attribute 'StreamListener'

## Use the REST API to Collect Tweets

### Authorize the REST API

In [12]:
rest_auth = twitter.oauth.OAuth(OAUTH_TOKEN,OATH_TOKEN_SECRET,CONSUMER_KEY,CONSUMER_SECRET)
rest_api = twitter.Twitter(auth=rest_auth)

### Define the query for the REST API

In [13]:
count = 100 #number of returned tweets, default and max is 100
geocode = "38.4392897,-78.9412224,50mi"  # defin the location, in Harrisonburg, VA
q = "vaccine"                               #define the keywords, tweets contain vaccine

### The collected tweets will contain 'vaccine' AND are located in Harrisonburg, VA

In [14]:
search_results = rest_api.search.tweets( count=count,q=q, geocode=geocode) #you can use both q and geocode
statuses = search_results["statuses"]
since_id_new = statuses[-1]['id']
for statuse in statuses:
    try:
        tweet_collection.insert_one(statuse)
        pprint(statuse['created_at'])# print the date of the collected tweets
    except:
        pass

### Continue fetching early tweets with the same query.

#### YOU WILL REACH YOUR RATE LIMIT VERY FAST

In [15]:
since_id_old = 0
while(since_id_new != since_id_old):
    since_id_old = since_id_new
    search_results = rest_api.search.tweets( count=count,q=q,
                        geocode=geocode, max_id= since_id_new)
    statuses = search_results["statuses"]
    since_id_new = statuses[-1]['id']
    for statuse in statuses:
        try:
            tweet_collection.insert_one(statuse)
            pprint(statuse['created_at']) # print the date of the collected tweets
        except:
            pass

## View the Collected Tweets

### Print the number of tweets and unique twitter users

In [16]:
print(tweet_collection.estimated_document_count())# number of tweets collected

user_cursor = tweet_collection.distinct("user.id")
print (len(user_cursor)) # number of unique Twitter users 

NameError: name 'tweet_collection' is not defined

In [17]:
tweet_collection.create_index([("text", pymongo.TEXT)], name='text_index', default_language='english') # create a text index

NameError: name 'tweet_collection' is not defined

In [18]:
tweet_cursor = tweet_collection.find({"$text": {"$search": "vote"}}) # return tweets contain vote

NameError: name 'tweet_collection' is not defined

In [19]:
for document in tweet_cursor[0:10]: # display the first 10 tweets from the query
    try:
        print ('----')
#         pprint (document) # use pprint to print the entire tweet document
   
        print ('name:', document["user"]["name"]) # user name
        print ('text:', document["text"])         # tweets
    except:
        print ("***error in encoding")
        pass

NameError: name 'tweet_cursor' is not defined

In [20]:
tweet_cursor = tweet_collection.find({"$text": {"$search": "vote"}}) # return tweets contain vote

NameError: name 'tweet_collection' is not defined

In [21]:
tweet_df = pd.DataFrame(list(tweet_cursor ))
tweet_df[:10] #display the first 10 tweets

NameError: name 'tweet_cursor' is not defined

In [22]:
tweet_df["favorite_count"].hist() # create a histogram show the favorite count

NameError: name 'tweet_df' is not defined