In [3]:
import json
import requests
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pymongo
import psycopg2
import sqlalchemy
import pandas as pd
from sqlalchemy import create_engine
from time import strptime, strftime

### Connecting to the MongoDB Server

In [28]:
import os

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# The connection string embeds the Atlas username and password, so it is read
# from the environment instead of being stored in the notebook.
# Expected shape:
#   mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority
# NOTE(review): the password that was previously committed in this cell should be rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this notebook."
    )

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Access or create a database
db = client['Twitter_DB']

# Handle to the 'Twitter_Sample' collection
collection = db['Twitter_Sample']

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best-effort check: report the connection problem without stopping the notebook
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [11]:
## Top-level tweet fields that are copied into the MongoDB document
mongodb_columns = [
    'id',
    'created_at',
    'text',
    'source',
    'quote_count',
    'reply_count',
    'retweet_count',
    'favorite_count',
    'entities',
]


In [12]:
def filter_func(item):
    """Tell whether a ``(key, value)`` pair should be kept.

    The pair is kept when its key is one of the names in ``mongodb_columns``;
    the value is not inspected.
    """
    field_name, _ = item
    return field_name in mongodb_columns

In [13]:
## Cleaning of tweet data is done here before uploading to the MongoDB
def cleaning(data):
    """Build the MongoDB document for a single tweet.

    Parameters
    ----------
    data : dict
        One tweet object. It must contain ``id``, ``created_at``, ``user``
        (with an ``id``) and ``entities`` (with ``hashtags`` and
        ``user_mentions`` lists).

    Returns
    -------
    dict
        The fields listed in ``mongodb_columns`` minus ``id`` and ``entities``,
        plus ``user_id``, ``_id`` (the tweet id), ``hashtags`` (list of tag
        texts) and ``user_mentions`` (list of screen names). ``created_at`` is
        rewritten as ``'YYYY-MM-DD HH:MM:SS'`` in UTC.

    Raises
    ------
    KeyError
        If one of the required fields is missing.
    ValueError
        If ``created_at`` does not match the expected timestamp layout.
    """
    # Function-scope import: datetime is not among the notebook's top-level imports.
    from datetime import datetime, timezone

    doc = dict(filter(filter_func, data.items()))
    doc['user_id'] = data['user']['id']
    doc['_id'] = doc['id']

    ### formatting the date field
    # The source string carries a UTC offset (%z). Convert to UTC before the
    # offset is dropped by the output format, so the stored value is always UTC.
    created = datetime.strptime(doc['created_at'], "%a %b %d %H:%M:%S %z %Y")
    doc['created_at'] = created.astimezone(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    entities = doc['entities']

    ### extracting hashtags
    doc['hashtags'] = [tag['text'] for tag in entities['hashtags']]

    ### extracting user mentions
    doc['user_mentions'] = [mention['screen_name'] for mention in entities['user_mentions']]

    # The raw id and entities are replaced by _id / hashtags / user_mentions above
    del doc['id']
    del doc['entities']
    return doc

In [14]:
### Loading the cleaned data
def load_data(file_name):
    """Read line-delimited tweet JSON from ``file_name`` and insert it into ``db.tweets``.

    Each line is parsed on its own. Lines that are not valid JSON, or that are
    not tweet objects, are skipped. For a retweet, the embedded original tweet
    (``retweeted_status``) is cleaned and inserted first, and the retweet
    document records its id in ``retweeted_status_id``. Inserts that collide
    with an existing ``_id`` are ignored.

    Parameters
    ----------
    file_name : str
        Path of the file to load, one JSON value per line.
    """
    # Tweet text is multilingual (accents, emoji), so decode explicitly as
    # UTF-8 rather than relying on the platform's default encoding.
    with open(file_name, "r", encoding="utf-8") as f1:
        for line in f1:
            try:
                data = json.loads(line)
            except ValueError:
                continue

            # cleaning() reads data['id'] and data['user']; anything without
            # them is not a tweet object and would raise, so skip it here.
            if not isinstance(data, dict) or 'id' not in data or 'user' not in data:
                continue

            ## using the above created cleaning the data function
            doc = cleaning(data)

            ## Loading data into MongoDB
            if 'retweeted_status' in data:
                doc['retweeted_status_id'] = data['retweeted_status']['id']
                retweet = cleaning(data['retweeted_status'])

                try:
                    db.tweets.insert_one(retweet)
                except pymongo.errors.DuplicateKeyError:
                    # The original tweet is already stored
                    pass

            try:
                db.tweets.insert_one(doc)
            except pymongo.errors.DuplicateKeyError:
                continue


In [17]:
## Clean the given Twitter dataset file and load it into MongoDB
load_data("corona-out-3")

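In the case of retweets, the original tweet (embedded in the retweet's `retweeted_status` field) was cleaned and inserted first, followed by the retweet itself, which also stores the original's id in `retweeted_status_id`.
This is because retweets are essentially copies of the original tweet with some additional metadata. Inserts whose `_id` already exists in the collection are skipped, so each tweet is stored only once.
Once all the tweets have been inserted into the collection, various MongoDB queries can be used to retrieve and analyze the data as required.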

In [39]:
try:
    # Ask for at most 5 documents from the tweets collection
    cursor = db.tweets.find({}).limit(5)

    # Confirmation that the query call returned a cursor
    print("Query executed successfully.")

    # Walk the cursor, printing each document followed by a confirmation line
    for document in cursor:
        print(document)
        print("Document printed successfully.")

except Exception as err:
    print(f"An error occurred: {err}")


Query executed successfully.
{'_id': 1254022770679320576, 'created_at': 'Sat Apr 25 12:21:41 +0000 2020', 'text': 'É isto, ou vou morrer sem ar ou com o corona https://t.co/O0Y7B3Koj4', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'user_id': 804046791348015107, 'hashtags': [], 'user_mentions': []}
Document printed successfully.
{'_id': 1254022770746372096, 'created_at': 'Sat Apr 25 12:21:41 +0000 2020', 'text': 'Schöne Runde mit dem Rennrad ✌️\n#sport #corona https://t.co/Uglj9YJPI1', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'user_id': 2242948745, 'hashtags': [{'text': 'sport', 'indices': [32, 38]}, {'text': 'corona', 'indices': [39, 46]}], 'user_mentions': []}
Document printed successfully.
{'_id': 1253949413191344128, 'created

In [6]:
# Number of documents in the tweets collection (the empty filter matches every document)
db.tweets.count_documents({})

112023

In [7]:
# Tweets whose retweet_count is greater than 100,000
col1 = db.tweets.find(
  {
    'retweet_count' : {'$gt' : 100000}
  }
)

# Print every matching document
for i in col1:
    print(i)

{'_id': 1243533984371523584, 'created_at': 'Fri Mar 27 13:43:00 +0000 2020', 'text': 'Due to Corona, we officially have three days of the week\n\n1. Yesterday \n2. Today\n3. Tomorrow', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 1424, 'reply_count': 468, 'retweet_count': 107516, 'favorite_count': 369496, 'user_id': 1241112951991517188, 'hashtags': [], 'user_mentions': []}
{'_id': 1237436114887041024, 'created_at': 'Tue Mar 10 17:52:14 +0000 2020', 'text': 'THIS MAN IS A GENIUS he figured out the Corona virus problem 😮 https://t.co/EZP7IqTtxV', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 7940, 'reply_count': 1578, 'retweet_count': 179037, 'favorite_count': 515867, 'user_id': 1131227186, 'hashtags': [], 'user_mentions': []}
{'_id': 1240334979701395458, 'created_at': '2020-03-18 17:51:18', 'text': 'When this Corona shit passes we have to promise each other 

In [27]:
# Tweets whose retweet_count is greater than 150,500
col1 = db.tweets.find(
  {'retweet_count' : {'$gt' : 150500}})

# Print every matching document
for i in col1:
    print(i)

{'_id': 1237436114887041024, 'created_at': 'Tue Mar 10 17:52:14 +0000 2020', 'text': 'THIS MAN IS A GENIUS he figured out the Corona virus problem 😮 https://t.co/EZP7IqTtxV', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 7940, 'reply_count': 1578, 'retweet_count': 179037, 'favorite_count': 515867, 'user_id': 1131227186, 'hashtags': [], 'user_mentions': []}
{'_id': 1240334979701395458, 'created_at': '2020-03-18 17:51:18', 'text': 'When this Corona shit passes we have to promise each other that we’re going to tell our kids that we survived a zombie apocalypse in 2020', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 4442, 'reply_count': 1595, 'retweet_count': 181584, 'favorite_count': 764405, 'user_id': 1112592502727548928, 'hashtags': [], 'user_mentions': []}
{'_id': 1238264431320215553, 'created_at': '2020-03-13 00:43:40', 'text': '*corona virus enters my body

### Hashtags

In [68]:
## Printing the documents whose hashtags include a particular hashtag
col = db.tweets.find({"hashtags": "groepsimmuniteit"})

# `doc` stays {} when nothing matches; otherwise the loop leaves it bound to the
# last matching document, so a later cell that displays `doc` shows that document.
doc = {}
for doc in col:
    print(doc)

{'_id': 1254058979917017090, 'created_at': '2020-04-25 14:45:34', 'text': '@mvanderKist @wouterkeller Mensen kunnen opnieuw ziek worden. Wat dat betekent voor strategie #groepsimmuniteit? En… https://t.co/8TppZrbNRe', 'source': '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>', 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'user_id': 41191256, 'hashtags': ['groepsimmuniteit'], 'user_mentions': ['mvanderKist', 'wouterkeller']}


In [16]:
# Display whatever document is currently bound to `doc`
doc

{'_id': 1254058979917017090,
 'created_at': '2020-04-25 14:45:34',
 'text': '@mvanderKist @wouterkeller Mensen kunnen opnieuw ziek worden. Wat dat betekent voor strategie #groepsimmuniteit? En… https://t.co/8TppZrbNRe',
 'source': '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>',
 'quote_count': 0,
 'reply_count': 0,
 'retweet_count': 0,
 'favorite_count': 0,
 'user_id': 41191256,
 'hashtags': ['groepsimmuniteit'],
 'user_mentions': ['mvanderKist', 'wouterkeller']}

### User mentions

In [22]:
## Print every document whose user_mentions include the given screen name
col = db.tweets.find({"user_mentions": "BJP4India"})

for doc in col:
    print(doc)


{'_id': 1254032706691895297, 'created_at': '2020-04-25 13:01:10', 'text': 'RT @BJP4India: BJP National President Shri @JPNadda had discussion with eminent personalities on political &amp; economic scenario and the meas…', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'user_id': 1153698734464651265, 'hashtags': [], 'user_mentions': ['BJP4India', 'JPNadda'], 'retweeted_status_id': 1254021819469684739}
{'_id': 1254032955124731904, 'created_at': '2020-04-25 13:02:09', 'text': 'RT @BJP4India: India’s war with Corona is ongoing.\n\nPlay your part and make sure no one around you starves.\n\nYour contribution to defeat hu…', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'user_id': 786986096, 'hashtags': [], 'user_mentions': ['BJP4India'], '

In [65]:
## Same user-mention query, limited to at most 3 documents

col = db.tweets.find({"user_mentions": "BJP4India"}).limit(3)

for doc in col:
    print(doc)


{'_id': 1254032706691895297, 'created_at': '2020-04-25 13:01:10', 'text': 'RT @BJP4India: BJP National President Shri @JPNadda had discussion with eminent personalities on political &amp; economic scenario and the meas…', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'user_id': 1153698734464651265, 'hashtags': [], 'user_mentions': ['BJP4India', 'JPNadda'], 'retweeted_status_id': 1254021819469684739}
{'_id': 1254032955124731904, 'created_at': '2020-04-25 13:02:09', 'text': 'RT @BJP4India: India’s war with Corona is ongoing.\n\nPlay your part and make sure no one around you starves.\n\nYour contribution to defeat hu…', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'user_id': 786986096, 'hashtags': [], 'user_mentions': ['BJP4India'], '

In [66]:
## For at most 3 documents mentioning the given user, show only
## user_id, text and user_mentions (one blank line before each document)

col = db.tweets.find({"user_mentions": "BJP4India"}).limit(3)

for doc in col:
    print(f"\nuser_id: {doc['user_id']}")
    print(f"text: {doc['text']}")
    print(f"user_mentions: {doc['user_mentions']}")


user_id: 1153698734464651265
text: RT @BJP4India: BJP National President Shri @JPNadda had discussion with eminent personalities on political &amp; economic scenario and the meas…
user_mentions: ['BJP4India', 'JPNadda']

user_id: 786986096
text: RT @BJP4India: India’s war with Corona is ongoing.

Play your part and make sure no one around you starves.

Your contribution to defeat hu…
user_mentions: ['BJP4India']

user_id: 944198054231515136
text: RT @BJP4India: BJP National President Shri @JPNadda had discussion with eminent personalities on political &amp; economic scenario and the meas…
user_mentions: ['BJP4India', 'JPNadda']


### Retweets

In [9]:
#### Highest retweet count in the collection.
#### The projection returns only retweet_count (and suppresses _id).
highest_retweet_doc = db.tweets.find({}, {"_id": 0, "retweet_count": 1}).sort("retweet_count", pymongo.DESCENDING).limit(1)

# Print the single projected document
for doc in highest_retweet_doc:
    print(doc)

{'retweet_count': 237307}


In [57]:
## Finding the document with the highest retweet count and showing
## only its user_id, text and retweet_count
# The cursor was previously named highest_reply_docs although it sorts by retweet_count.
highest_retweet_docs = db.tweets.find().sort("retweet_count", pymongo.DESCENDING).limit(1)
for doc in highest_retweet_docs:
    print("\nuser_id:", doc["user_id"])
    print("text:", doc["text"])
    print("retweet_count:", doc["retweet_count"])


user_id: 1100261477989126145
text: *corona virus enters my body*

The 4 Flintstone gummies I ate in 2005: https://t.co/3STfdIQtaT
retweet_count: 237307


In [43]:
### Find the single document with the highest retweet count (sort descending, keep 1)
highest_retweet_docs = db.tweets.find().sort("retweet_count", pymongo.DESCENDING).limit(1)

### Print the full document
for doc in highest_retweet_docs:
    print(doc)

{'_id': 1238264431320215553, 'created_at': '2020-03-13 00:43:40', 'text': '*corona virus enters my body*\n\nThe 4 Flintstone gummies I ate in 2005: https://t.co/3STfdIQtaT', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 6828, 'reply_count': 1811, 'retweet_count': 237307, 'favorite_count': 811062, 'user_id': 1100261477989126145, 'hashtags': [], 'user_mentions': []}


In [36]:
# Find the five documents with the highest retweet counts (sort descending, keep 5)
highest_retweet_docs = db.tweets.find().sort("retweet_count", pymongo.DESCENDING).limit(5)

# Print each of those documents in full
for doc in highest_retweet_docs:
    print(doc)

{'_id': 1238264431320215553, 'created_at': '2020-03-13 00:43:40', 'text': '*corona virus enters my body*\n\nThe 4 Flintstone gummies I ate in 2005: https://t.co/3STfdIQtaT', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 6828, 'reply_count': 1811, 'retweet_count': 237307, 'favorite_count': 811062, 'user_id': 1100261477989126145, 'hashtags': [], 'user_mentions': []}
{'_id': 1240334979701395458, 'created_at': '2020-03-18 17:51:18', 'text': 'When this Corona shit passes we have to promise each other that we’re going to tell our kids that we survived a zombie apocalypse in 2020', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 4442, 'reply_count': 1595, 'retweet_count': 181584, 'favorite_count': 764405, 'user_id': 1112592502727548928, 'hashtags': [], 'user_mentions': []}
{'_id': 1237436114887041024, 'created_at': 'Tue Mar 10 17:52:14 +0000 2020', 'text': 'THIS MAN 

In [30]:
# Tweets whose retweet_count is greater than 20,000
col2 = db.tweets.find(
  {'retweet_count' : {'$gt' : 20000}})

# Print every matching document
for i in col2:
    print(i)

{'_id': 1251431001999527936, 'created_at': 'Sat Apr 18 08:42:55 +0000 2020', 'text': 'China‘s embassy in Berlin wrote me an open letter because they weren‘t too happy with our Corona coverage. I respon… https://t.co/hVu3jyMDdS', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 3717, 'reply_count': 3007, 'retweet_count': 24747, 'favorite_count': 48075, 'user_id': 19069018, 'hashtags': [], 'user_mentions': []}
{'_id': 1252579243575427072, 'created_at': 'Tue Apr 21 12:45:38 +0000 2020', 'text': 'eu quando lembro q meu ano tava perfeito ate o corona chegar e estragar tudo', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 1113, 'reply_count': 129, 'retweet_count': 36682, 'favorite_count': 54386, 'user_id': 1168995568686895105, 'hashtags': [], 'user_mentions': []}
{'_id': 1243172541214216195, 'created_at': 'Thu Mar 26 13:46:45 +0000 2020', 'text': 'lavem suas mãos coro

### Reply Count

In [39]:
## Find the three documents with the highest reply counts (sort descending, keep 3)
highest_reply_docs = db.tweets.find().sort("reply_count", pymongo.DESCENDING).limit(3)

### Print each of those documents in full
for doc in highest_reply_docs:
    print(doc)

{'_id': 1251997329999179778, 'created_at': 'Sun Apr 19 22:13:19 +0000 2020', 'text': 'O mundo inteiro está unido contra o coronavírus. No Brasil, temos de lutar contra o corona e o vírus do autoritaris… https://t.co/1TF2CcBL9B', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 6642, 'reply_count': 58440, 'retweet_count': 18093, 'favorite_count': 165110, 'user_id': 35260027, 'hashtags': [], 'user_mentions': []}
{'_id': 1240371160078000128, 'created_at': '2020-03-18 20:15:04', 'text': "“If I get corona, I get corona. At the end of the day, I'm not gonna let it stop me from partying”: Spring breakers… https://t.co/VmAReDLNLp", 'source': '<a href="https://www.sprinklr.com" rel="nofollow">Sprinklr</a>', 'quote_count': 99477, 'reply_count': 32267, 'retweet_count': 29424, 'favorite_count': 96367, 'user_id': 15012486, 'hashtags': [], 'user_mentions': []}
{'_id': 1243376809363640322, 'created_at': '2020-03-27 03:18:26', 'text': 'Coro

### Favourite Count

In [40]:
## Find the four documents with the highest favorite counts (sort descending, keep 4)
highest_favorite_docs = db.tweets.find().sort("favorite_count", pymongo.DESCENDING).limit(4)

# Print each of those documents in full
for doc in highest_favorite_docs:
    print(doc)


{'_id': 1238264431320215553, 'created_at': '2020-03-13 00:43:40', 'text': '*corona virus enters my body*\n\nThe 4 Flintstone gummies I ate in 2005: https://t.co/3STfdIQtaT', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 6828, 'reply_count': 1811, 'retweet_count': 237307, 'favorite_count': 811062, 'user_id': 1100261477989126145, 'hashtags': [], 'user_mentions': []}
{'_id': 1240334979701395458, 'created_at': '2020-03-18 17:51:18', 'text': 'When this Corona shit passes we have to promise each other that we’re going to tell our kids that we survived a zombie apocalypse in 2020', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'quote_count': 4442, 'reply_count': 1595, 'retweet_count': 181584, 'favorite_count': 764405, 'user_id': 1112592502727548928, 'hashtags': [], 'user_mentions': []}
{'_id': 1237436114887041024, 'created_at': 'Tue Mar 10 17:52:14 +0000 2020', 'text': 'THIS MAN 

In [54]:
# Top three documents by favorite count, showing only
# user_id, text and favorite_count (one blank line before each document)
highest_favorite_docs = db.tweets.find().sort("favorite_count", pymongo.DESCENDING).limit(3)
for doc in highest_favorite_docs:
    print(f"\nuser_id: {doc['user_id']}")
    print(f"text: {doc['text']}")
    print(f"favorite_count: {doc['favorite_count']}")



user_id: 1100261477989126145
text: *corona virus enters my body*

The 4 Flintstone gummies I ate in 2005: https://t.co/3STfdIQtaT
favorite_count: 811062

user_id: 1112592502727548928
text: When this Corona shit passes we have to promise each other that we’re going to tell our kids that we survived a zombie apocalypse in 2020
favorite_count: 764405

user_id: 1131227186
text: THIS MAN IS A GENIUS he figured out the Corona virus problem 😮 https://t.co/EZP7IqTtxV
favorite_count: 515867


### Quote Count

In [41]:
# Find the single document with the highest quote count (sort descending, keep 1)
highest_quote_docs = db.tweets.find().sort("quote_count", pymongo.DESCENDING).limit(1)

# Print the full document
for doc in highest_quote_docs:
    print(doc)


{'_id': 1240371160078000128, 'created_at': '2020-03-18 20:15:04', 'text': "“If I get corona, I get corona. At the end of the day, I'm not gonna let it stop me from partying”: Spring breakers… https://t.co/VmAReDLNLp", 'source': '<a href="https://www.sprinklr.com" rel="nofollow">Sprinklr</a>', 'quote_count': 99477, 'reply_count': 32267, 'retweet_count': 29424, 'favorite_count': 96367, 'user_id': 15012486, 'hashtags': [], 'user_mentions': []}


In [50]:
# Document with the maximum quote count, showing only
# user_id, text and quote_count
highest_quote_docs = db.tweets.find().sort("quote_count", pymongo.DESCENDING).limit(1)
for doc in highest_quote_docs:
    print(f"user_id: {doc['user_id']}")
    print(f"text: {doc['text']}")
    print(f"quote_count: {doc['quote_count']}")
    

user_id: 15012486
text: “If I get corona, I get corona. At the end of the day, I'm not gonna let it stop me from partying”: Spring breakers… https://t.co/VmAReDLNLp
quote_count: 99477
