## 3.1 Preparation

### Import

In [1]:
import getpass # prompt for the MySQL password when it is not set in the environment
import json # serialize/deserialize data when saving/loading the cache to/from disk
import os # read connection settings from environment variables
import threading # call the checkpoint() method periodically
import time # add timestamps to cache entries

import pandas as pd
import pymongo
import pymysql
from pymongo import MongoClient

In [2]:
import os
# Show the working directory: relative paths such as "cache_f.json" are resolved against it.
os.getcwd()

'/Users/cyan/Documents/python_workspace/694_DBMngm_23SP'

### Connecting to MongoDB

In [3]:
# Open a client on the local MongoDB server, then select the database and the
# tweet collection that the search functions query.
client = MongoClient(host='localhost', port=27017)
db = client['694db_nsdb']
tweets = db['twt_datetime']

#### Test that the tweets load

In [4]:
# Fetch five documents and render them as a table to check the collection loads.
documents = tweets.find().limit(5)
pd.DataFrame(documents)

Unnamed: 0,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,644b36afa1b284129a82b31b,"{'id': 301470336, 'id_str': '301470336', 'name...","ahap , low cut off merchants","{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254022778371571712,,,,1.2540162971611054e+18,0,0,0,0,en
1,644b36afa1b284129a82b318,"{'id': 375777294, 'id_str': '375777294', 'name...",im making 17 a hr doing nothing 😭😭😭😭 i love th...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254022776094105602,,,,,0,0,0,0,en
2,644b36afa1b284129a82b316,"{'id': 2929344220, 'id_str': '2929344220', 'na...",Was sollen 150 Euro Computerzuschuss bringen? ...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022772575043586,,,,1.2540164346588488e+18,0,0,0,0,de
3,644b36afa1b284129a82b31a,"{'id': 1132273796138905600, 'id_str': '1132273...",Oh brother and he gave the kid his corona type...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022776752615430,,,,,0,0,0,0,en
4,644b36afa1b284129a82b317,"{'id': 1091660129894838272, 'id_str': '1091660...",@VinceMcMahon @TripleH We hereby honor to anno...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022773598572544,VinceMcMahon,1222639789.0,1.253768179517657e+18,,0,0,0,0,en


### Connecting to MySQL

In [5]:
# Connect to MySQL.
# The password is taken from the MYSQL_PASSWORD environment variable, or
# prompted for when that is unset, so no credential is stored in the notebook.
mysql_password = os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: ")
conn = pymysql.connect(host='localhost', port=3306, user='root', password=mysql_password, charset="utf8mb4", database='694RDBMS')
cur = conn.cursor()

In [6]:
# Load every user row into the module-level `users` frame used by the joins below.
cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users;")
# Passing the column names to the constructor (instead of assigning them
# afterwards) also works when the query returns no rows.
users = pd.DataFrame(cur.fetchall(), columns=[desc[0] for desc in cur.description])
users.head(10)

Unnamed: 0,userid_str,screen_name,name,followers_count
0,1000006582896295938,yourgirlsarag,sara,246
1,1000027886915637250,cheche04125843,cheche,33
2,1000034375973646337,clarice8967,clarih ?,10748
3,100004211,karundhiravidan,azakiya tamilmagan,445
4,1000045322859634688,barcelona9581,barcelona958,131
5,1000046756120363013,isisevang,áisis,306
6,1000047984103575552,esesesrarara34,EsRaAa (EN YENI HESAP)?,4310
7,1000055605263065088,zolesa_ke,Tsika Zolesa,3083
8,1000061119921352705,lejayk54,☮️Axel☮️,27
9,1000062434126725120,mahin37418807,Mahin Mohades,206


### Cache
Requirements  
- Design and implement a cache for storing "popular" (frequently accessed) data so that this data does not have to be retrieved from the database each time it is accessed.   
    - Some hashtags/users may be popular and their data may be cached.   
- You can use a Python dictionary for implementing the cache, but you must :
    - limit the size of the dictionary by evicting entries using a strategy (E.g. least accessed). 
    - You must checkpoint your data on disk at periodic intervals. 
    - When your search application starts up, you must reload the state of the cache from the disk. 
- Questions to consider 
    - Can an entry in the cache get stale (is not representing the correct state)? 
    - How will you update or purge stale data? 
    - An advanced feature that you could implement is an expiry mechanism for an entry in the cache by having a Time-To-Live field for each entry that determines the amount of time the entry will be retained in the cache. 
- Timings of your test search queries (make sure you are hitting cached and non cached data)

#### Initializing the contents of cache_f.json

In [7]:
# Seed cache_f.json with the 100 users that have the most followers
# (the MySQL users table holds 80,943 entries in total).
cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users order by followers_count desc limit 100")
cache = pd.DataFrame(cur.fetchall())
cache.columns = [column[0] for column in cur.description]
# Entries are keyed by screen_name; every access counter starts at zero.
cache['name_as_index'] = cache['screen_name']
cache['access_count'] = 0

# Column subsets backing the two sections of the cache file.
subset_data = cache[["userid_str", "screen_name", "name", "followers_count", "name_as_index"]]
subset_access_count = cache[["screen_name", "access_count", "name_as_index"]]

# One record per user, indexed by name_as_index (the screen name).
cache_data_dict = subset_data.set_index("name_as_index").to_dict("index")
cache_access_count_dict = subset_access_count.set_index("name_as_index").to_dict("index")

# Top-level keys of the cache file, then the combined dictionary.
key1 = "data"
key2 = "access_count"
cache_dict = {key1: cache_data_dict, key2: cache_access_count_dict}

In [8]:
# Serialize the seed dictionary and store it as the initial cache file.
with open("cache_f.json", "w") as f:
    f.write(json.dumps(cache_dict))

#### Define the Cache Class

In [10]:
class Cache:
    """Size-bounded, disk-backed in-memory cache.

    State is kept in three dicts that share the same keys:

    * ``data`` -- the cached value;
    * ``access_count`` -- ``{'screen_name': key, 'access_count': n}``, the
      popularity record used to choose an eviction victim;
    * ``cached_at`` -- epoch seconds at which the value was stored, used by
      the time-to-live check.

    The state is loaded from ``filename`` on construction and written back
    every ``checkpoint_interval`` seconds by a daemon thread.
    """

    def __init__(self, max_size = 1000, eviction_strategy = "least_accessed", checkpoint_interval = 8, filename = "cache_f.json"):
        """Load any saved state and start the periodic checkpoint thread.

        Args:
            max_size: maximum number of entries kept after a ``set``.
            eviction_strategy: only ``"least_accessed"`` is implemented.
            checkpoint_interval: seconds between automatic checkpoints.
            filename: JSON file the cache is loaded from and saved to.
        """
        self.max_size = max_size
        self.eviction_strategy = eviction_strategy
        self.checkpoint_interval = checkpoint_interval
        self.filename = filename
        self.checkpoint_file = None  # becomes ``filename`` after the first checkpoint
        self.data = {}
        self.access_count = {}
        self.cached_at = {}
        # The checkpoint thread serializes the dicts while the main thread may
        # be mutating them, so every method takes this re-entrant lock.
        self._lock = threading.RLock()
        # load the cache from the file
        self.load()

        # Start a background thread to periodically checkpoint the cache
        self.checkpoint_thread = threading.Thread(target=self.periodic_checkpoint)
        self.checkpoint_thread.daemon = True
        self.checkpoint_thread.start()

    def get(self, key):
        """Return the cached value for ``key`` and increment its access counter.

        Raises:
            KeyError: if ``key`` is not cached (nothing is modified).
        """
        with self._lock:
            value = self.data[key]
            record = self.access_count.setdefault(key, {'screen_name': key, 'access_count': 0})
            record["access_count"] += 1
            return value

    def set(self, key, value):
        """Store ``value`` under ``key`` and evict until the size limit holds.

        The entry starts with an access count of 1 and a fresh timestamp.
        Eviction runs after the insert, so the new entry itself is a candidate
        when every other entry has been accessed more often.
        """
        with self._lock:
            self.data[key] = value
            self.access_count[key] = {'screen_name': key, 'access_count': 1} # access 1 time
            self.cached_at[key] = time.time()
            # Loop rather than a single evict: a loaded file may already hold
            # more than max_size entries.
            while self.data and len(self.data) > self.max_size:
                self.evict()

    def __contains__(self, key):
        """Return True when ``key`` currently has a cached value."""
        return key in self.data

    def _eviction_rank(self, key):
        """Sort key for eviction: fewest accesses first, then oldest entry."""
        record = self.access_count.get(key, {})
        return (record.get("access_count", 0), self.cached_at.get(key, 0.0))

    def _remove(self, key):
        """Drop ``key`` from all three state dicts; missing parts are ignored."""
        self.data.pop(key, None)
        self.access_count.pop(key, None)
        self.cached_at.pop(key, None)

    def evict(self):
        """Remove the least accessed entry; does nothing on an empty cache.

        Raises:
            NotImplementedError: if ``eviction_strategy`` is not
                ``"least_accessed"``.
        """
        with self._lock:
            if not self.data:
                return
            if self.eviction_strategy != "least_accessed":
                # Other eviction strategies can be implemented here
                raise NotImplementedError(
                    f"unsupported eviction strategy: {self.eviction_strategy!r}"
                )
            # Compare the integer counters, not the record dicts themselves.
            key_to_evict = min(self.data, key=self._eviction_rank)
            self._remove(key_to_evict)

    def load(self):
        """Replace the in-memory state with the contents of ``filename``.

        A missing file leaves the current state untouched, so a first run
        starts with an empty cache. Entries saved without a timestamp are
        stamped with the load time.
        """
        try:
            with open(self.filename, "r") as f:
                cache_data = json.load(f)
        except FileNotFoundError:
            return
        now = time.time()
        with self._lock:
            self.data = cache_data["data"]
            self.access_count = cache_data["access_count"]
            saved_times = cache_data.get("cached_at", {})
            self.cached_at = {key: saved_times.get(key, now) for key in self.data}

    def periodic_checkpoint(self):
        """Thread body: checkpoint every ``checkpoint_interval`` seconds, forever."""
        while True:
            # Wait for the checkpoint interval
            time.sleep(self.checkpoint_interval)
            try:
                self.checkpoint()
            except OSError as exc:
                # Report and keep the thread alive so later checkpoints still run.
                print(f"cache checkpoint to {self.filename} failed: {exc}")

    def checkpoint(self):
        """Save the cache to ``filename`` as JSON."""
        with self._lock:
            # Serialize before opening the file so a serialization error
            # cannot leave a truncated checkpoint behind.
            payload = json.dumps({
                "data": self.data,
                "access_count": self.access_count,
                "cached_at": self.cached_at
            })
            with open(self.filename, "w") as f:
                f.write(payload)
            # update the checkpoint file
            self.checkpoint_file = self.filename

    def is_stale(self, key, ttl = 60*60):
        """Return True when ``key`` is unknown or was stored more than ``ttl`` seconds ago."""
        with self._lock:
            stored_at = self.cached_at.get(key)
        if stored_at is None:
            return True
        return time.time() - stored_at > ttl

    def update_or_purge_stale_data(self, ttl = 60*60):
        """Remove every entry older than ``ttl`` seconds.

        Nothing is refreshed here; a purged key is simply no longer cached.
        """
        with self._lock:
            keys_to_purge = [key for key in self.data if self.is_stale(key, ttl)]
            for key in keys_to_purge:
                self._remove(key)

    def __del__(self):
        """Best-effort final save when the object is collected."""
        try:
            self.checkpoint()
        except Exception:
            # The instance may be half-built or the interpreter may be
            # shutting down; an error here cannot be handled by anyone.
            pass

In [11]:
# initialize the cache: the constructor loads cache_f.json and starts the
# background checkpoint thread
cache = Cache(
    max_size=1000,
    eviction_strategy="least_accessed",
    checkpoint_interval = 8, # checkpoint every 8 seconds
    filename="cache_f.json"
)

In [12]:
# load cache from disk (the constructor already calls load(); this re-reads the file
# and replaces the in-memory state with it)
cache.load()

#### example usage & test

##### get test

In [13]:
user_id = "detikcom"
if user_id in cache:
    # cache hit: get user data from cache directly
    user_data = pd.DataFrame.from_dict(cache.get(user_id), orient = "index").T
    print("in cache")
else:
    print("not in cache")
    # cache miss: get data from database
    cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users WHERE screen_name = %s;", (user_id,))
    user_data = pd.DataFrame(cur.fetchall(),columns=["userid_str", "screen_name", "name", "followers_count"])
    if user_data.empty:
        # Unknown screen name: there is no row 0 to put in the cache.
        print("user not found")
    else:
        # Add user data to cache
        user_data_dict = user_data.to_dict("index")[0]
        cache.set(user_id, user_data_dict)

in cache


In [14]:
user_data

Unnamed: 0,userid_str,screen_name,name,followers_count
0,69183155,detikcom,detikcom,15927642


In [15]:
# memory test
cache.access_count[user_id]

{'screen_name': 'detikcom', 'access_count': 1}

In [17]:
# disk test: read the checkpoint file back. The file is rewritten by the cache's
# periodic checkpoint, so it can lag behind the in-memory counters.
with open("cache_f.json", "r") as f:
    # Load the JSON data into a dictionary
    cache_disk = json.load(f)
    
cache_disk["access_count"].get(user_id)

{'screen_name': 'detikcom', 'access_count': 1}

##### set test

In [18]:
# set test
user_id = "narsingdhone"
if user_id in cache:
    # cache hit: get user data from cache directly
    user_data = pd.DataFrame.from_dict(cache.get(user_id), orient = "index").T
    print("in cache")
else:
    print("not in cache")
    # cache miss: get data from database
    cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users WHERE screen_name = %s;", (user_id,))
    user_data = pd.DataFrame(cur.fetchall(),columns=["userid_str", "screen_name", "name", "followers_count"])
    if user_data.empty:
        # Unknown screen name: there is no row 0 to put in the cache.
        print("user not found")
    else:
        # Add user data to cache
        user_data_dict = user_data.to_dict("index")[0]
        cache.set(user_id, user_data_dict)

not in cache


In [19]:
user_data

Unnamed: 0,userid_str,screen_name,name,followers_count
0,970864196,narsingdhone,narsing dhone,0


In [20]:
# memory test
cache.access_count[user_id]

{'screen_name': 'narsingdhone', 'access_count': 1}

In [21]:
# disk test: read the checkpoint file back. The file is rewritten by the cache's
# periodic checkpoint, so a freshly set entry appears only after the next checkpoint.
with open("cache_f.json", "r") as f:
    # Load the JSON data into a dictionary
    cache_disk = json.load(f)
    
cache_disk["access_count"].get(user_id)

{'screen_name': 'narsingdhone', 'access_count': 1}

## 3.2 Single Queries in NRDB for the search application

### 3.2.1 Search by keyword

In [22]:
# Find tweets with a keyword ($text queries need a text index on the searched field).
tweets.create_index([("text", pymongo.TEXT)])
def search_word():
    """Prompt for a keyword and return the top matching tweets joined with their authors.

    The keyword is run as a MongoDB ``$text`` search on ``tweets``. The first
    ten results ordered by ``favorite_count`` then ``retweet_count``
    (descending) are merged with the module-level ``users`` frame on
    ``userid_str``.

    Returns:
        The merged DataFrame; an empty DataFrame when no tweet matches; or
        ``None`` when the input is blank.
    """
    user_input = input("Please enter a keyword: ")
    if user_input == "":
        # Nothing to search for.
        return None
    # search in NRDB
    myquery = {"$text":{"$search": user_input}}
    twts = pd.DataFrame(tweets.find(myquery).sort([("favorite_count", pymongo.DESCENDING), ("retweet_count", pymongo.DESCENDING)]).limit(10))
    if twts.empty:
        # No match: the frame has no 'user' column to normalize.
        return twts
    #Processing: lift the author's id out of the embedded user document
    twts['userid_str'] = pd.json_normalize(twts['user'])['id_str']
    #JOIN
    testItems = pd.merge(users, twts, on='userid_str')
    return testItems

In [24]:
search_word()

Please enter a keyword:  corona


Unnamed: 0,userid_str,screen_name,name,followers_count,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,1040018437,tuiktok,zaara dxt.,154,644b36bea1b284129a82e234,"{'id': 1040018437, 'id_str': '1040018437', 'na...",@Nrxcba corona corona,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254034220856438789,Nrxcba,1.2484008914151667e+18,1.2540339447538278e+18,,0,0,0,0,it
1,1067309925473370112,ColdHallow,Cold Hallow/ Yung Exodus,113,644b36bda1b284129a82e120,"{'id': 1067309925473370112, 'id_str': '1067309...",corona corona corona corona corona corona coro...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254033974411886592,,,,,0,0,0,0,it
2,1134413874369667073,fleroviumc,dee ✪,939,644b36b5a1b284129a82c5a2,"{'id': 1134413874369667073, 'id_str': '1134413...",@scamandor Corona corona,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254027464692490240,scamandor,1.1437136384541981e+18,1.2540199481851904e+18,,0,0,0,0,it
3,1170661876096618496,kentankjawa,lita,2507,644b36c7a1b284129a82fdf2,"{'id': 1170661876096618496, 'id_str': '1170661...",@culawmetan corona corona,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254040694362013696,culawmetan,1.192382470966014e+18,1.2540405422219878e+18,,0,0,0,0,it
4,1227474923158953984,jiaadje,Ahriana Djenala ?,4,644b36cfa1b284129a83176a,"{'id': 1227474923158953984, 'id_str': '1227474...",@ratihhhws Corona pergi corona pergi corona pergi,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254046553334939649,ratihhhws,1.1464239580124568e+18,1.2540261366752133e+18,,0,0,0,0,it
5,1250113122385330177,doyvibes,↺kala²⁶,1855,644b36afa1b284129a82b3ec,"{'id': 1250113122385330177, 'id_str': '1250113...",@dustyybear CORONA CORONA DADAR CORONA,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254022981459689474,dustyybear,1.16524761029206e+18,1.2540227247389696e+18,,0,0,0,0,es
6,202081087,number3007,Yo,386,644b36c4a1b284129a82f49b,"{'id': 202081087, 'id_str': '202081087', 'name...",@nzaqqiaa Corona corona,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254038539148259329,nzaqqiaa,1.059804419305304e+18,1.25401737249246e+18,,0,0,0,0,it
7,24682822,budi00000,budi kurniawan,12,644b36b7a1b284129a82cbe7,"{'id': 24682822, 'id_str': '24682822', 'name':...",Corona oh corona,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254028973882798080,,,,,0,0,0,0,it
8,2658954247,shafiq_rosli12,??????_,407,644b36e2a1b284129a8345ed,"{'id': 2658954247, 'id_str': '2658954247', 'na...",Me: mak kau corona mak kau corona mak kau coro...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254057001245532160,,,,1.2534704216446976e+18,0,0,0,0,in
9,299619597,agitwldn,Terpaksa aku,1129,644b36d1a1b284129a831d9a,"{'id': 299619597, 'id_str': '299619597', 'name...",@adetuwuable Corona ilang\nCorona ilang\nCoron...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254047903322013698,adetuwuable,630893681.0,1.2540465084977766e+18,,0,0,0,0,tl


### 3.2.2 Search by hashtag

In [25]:
#Find tweets with hashtag
def search_hashtag():
    """Prompt for a hashtag and return the top matching tweets joined with their authors.

    Tweets whose ``entities.hashtags.text`` equals the input are fetched from
    ``tweets``. The first ten results ordered by ``favorite_count`` then
    ``retweet_count`` (descending) are merged with the module-level ``users``
    frame on ``userid_str``.

    Returns:
        The merged DataFrame; an empty DataFrame when no tweet matches; or
        ``None`` when the input is blank.
    """
    user_input = input("Please enter a hashtag: ")
    if user_input == "":
        # Nothing to search for.
        return None
    myquery = {"entities.hashtags.text": user_input}
    twts = pd.DataFrame(tweets.find(myquery).sort([("favorite_count", pymongo.DESCENDING), ("retweet_count", pymongo.DESCENDING)]).limit(10))
    if twts.empty:
        # No match: the frame has no 'user' column to normalize.
        return twts
    #Processing: lift the author's id out of the embedded user document
    twts['userid_str'] = pd.json_normalize(twts['user'])['id_str']
    #JOIN
    testItems = pd.merge(users, twts, on='userid_str')
    return testItems

In [26]:
search_hashtag()

Please enter a hashtag:  sport


Unnamed: 0,userid_str,screen_name,name,followers_count,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,2242948745,tho1965,Thomas Krause,173,644b36afa1b284129a82b315,"{'id': 2242948745, 'id_str': '2242948745', 'na...",Schöne Runde mit dem Rennrad ✌️\n#sport #coron...,"{'hashtags': [{'text': 'sport', 'indices': [32...",2020-04-25,1254022770746372096,,,,,0,0,0,0,de


### 3.2.3 Search by user name

In [27]:
def search_user_name():
    """Prompt for a screen name and return that user's top tweets joined with the user row.

    The screen name is looked up in the MySQL ``users`` table; the resulting
    ids select tweets by ``user.id_str``. The first ten results ordered by
    ``favorite_count`` then ``retweet_count`` (descending) are merged with
    the module-level ``users`` frame on ``userid_str``.

    Returns:
        The merged DataFrame; an empty DataFrame when the user is unknown or
        has no tweets; or ``None`` when the input is blank.
    """
    user_input = input("Please enter a user_name: ")
    if user_input == "":
        return None
    # my sql
    myquery = "SELECT userid_str, screen_name, name, followers_count FROM users WHERE screen_name = %s;"
    cur.execute(myquery,(user_input,))
    matched_users = pd.DataFrame(cur.fetchall(),columns=["userid_str", "screen_name", "name", "followers_count"])
    if matched_users.empty:
        # Unknown screen name: no ids to look up in MongoDB.
        return pd.DataFrame()
    # mongodb
    userid_str = matched_users["userid_str"].tolist()
    tweet_query = {"user.id_str": {"$in":userid_str}}
    twts = pd.DataFrame(tweets.find(tweet_query).sort([("favorite_count", pymongo.DESCENDING), ("retweet_count", pymongo.DESCENDING)]).limit(10))
    if twts.empty:
        # No tweets for this user: the frame has no 'user' column to normalize.
        return twts
    #Processing: lift the author's id out of the embedded user document
    twts['userid_str'] = pd.json_normalize(twts['user'])['id_str']
    #JOIN
    testItems = pd.merge(users, twts, on='userid_str')
    return testItems

In [28]:
search_user_name()

Please enter a user_name:  tho1965


Unnamed: 0,userid_str,screen_name,name,followers_count,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,2242948745,tho1965,Thomas Krause,173,644b36afa1b284129a82b315,"{'id': 2242948745, 'id_str': '2242948745', 'na...",Schöne Runde mit dem Rennrad ✌️\n#sport #coron...,"{'hashtags': [{'text': 'sport', 'indices': [32...",2020-04-25,1254022770746372096,,,,,0,0,0,0,de


### 3.2.4 Search by time range

In [29]:
def search_date():
    """Prompt for a start and/or end date and return the top tweets in that range.

    Each non-blank answer becomes a bound on ``created_at`` (``$gte`` for the
    start, ``$lte`` for the end); the raw input strings are passed to MongoDB
    unchanged. With both answers blank, no filter is applied.

    Returns:
        DataFrame of the first ten tweets ordered by ``favorite_count`` then
        ``retweet_count`` (descending).
    """
    start_at = input("Please enter a start date(format:yyyy-mm-dd hh:mm:ss): ")
    end_at = input("Please enter a end date(format:yyyy-mm-dd hh:mm:ss): ")
    # Collect whichever bounds were supplied.
    created_range = {}
    if start_at:
        created_range['$gte'] = start_at
    if end_at:
        created_range['$lte'] = end_at
    myquery = {'created_at': created_range} if created_range else {}
    ordering = [("favorite_count", pymongo.DESCENDING), ("retweet_count", pymongo.DESCENDING)]
    cursor = tweets.find(myquery).sort(ordering).limit(10)
    return pd.DataFrame(cursor)

In [30]:
search_date()

Please enter a start date(format:yyyy-mm-dd hh:mm:ss):  2020-04-20
Please enter a end date(format:yyyy-mm-dd hh:mm:ss):  2020-04-30


Unnamed: 0,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,644b36afa1b284129a82b314,"{'id': 804046791348015107, 'id_str': '80404679...","É isto, ou vou morrer sem ar ou com o corona h...","{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254022770679320576,,,,,0,0,0,0,pt
1,644b36afa1b284129a82b31e,"{'id': 1120761000561606656, 'id_str': '1120761...",tony montana yoongi live #kpop trump bp lisa d...,"{'hashtags': [{'text': 'kpop', 'indices': [25,...",2020-04-25,1254022781710274566,,,,,0,0,0,0,en
2,644b36afa1b284129a82b316,"{'id': 2929344220, 'id_str': '2929344220', 'na...",Was sollen 150 Euro Computerzuschuss bringen? ...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022772575043586,,,,1.2540164346588488e+18,0,0,0,0,de
3,644b36afa1b284129a82b31d,"{'id': 923169415054680064, 'id_str': '92316941...",Weekly mortality graphs show in some European ...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022780695252993,,,,,0,0,0,0,en
4,644b36afa1b284129a82b319,"{'id': 865733987561381888, 'id_str': '86573398...",@MichaelTouby @ChinaDaily @RVsmtown @YRFairyta...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022776207429633,MichaelTouby,7.196255706157588e+17,1.2540135434705633e+18,,0,0,0,0,en
5,644b36afa1b284129a82b317,"{'id': 1091660129894838272, 'id_str': '1091660...",@VinceMcMahon @TripleH We hereby honor to anno...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022773598572544,VinceMcMahon,1222639789.0,1.253768179517657e+18,,0,0,0,0,en
6,644b36afa1b284129a82b31b,"{'id': 301470336, 'id_str': '301470336', 'name...","ahap , low cut off merchants","{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254022778371571712,,,,1.2540162971611054e+18,0,0,0,0,en
7,644b36afa1b284129a82b318,"{'id': 375777294, 'id_str': '375777294', 'name...",im making 17 a hr doing nothing 😭😭😭😭 i love th...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254022776094105602,,,,,0,0,0,0,en
8,644b36afa1b284129a82b31a,"{'id': 1132273796138905600, 'id_str': '1132273...",Oh brother and he gave the kid his corona type...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022776752615430,,,,,0,0,0,0,en
9,644b36afa1b284129a82b315,"{'id': 2242948745, 'id_str': '2242948745', 'na...",Schöne Runde mit dem Rennrad ✌️\n#sport #coron...,"{'hashtags': [{'text': 'sport', 'indices': [32...",2020-04-25,1254022770746372096,,,,,0,0,0,0,de


## 3.3 Search Function by all types

#### 3.3.1 using no cache

In [95]:
def search_all_RNR():
    """Prompt for every search criterion and query both databases without a cache.

    Blank answers are ignored. A screen name is resolved to user ids through
    MySQL; without one, the whole ``users`` table is fetched for the join.
    Keyword, hashtag and date bounds are combined into one MongoDB query.

    Returns:
        DataFrame of the first ten tweets ordered by ``favorite_count`` then
        ``retweet_count`` (descending), merged with the user rows on
        ``userid_str``; an empty DataFrame when no tweet matches.
    """
    myquery={}
    search_keyword = input("Please enter a keyword: ")
    search_hashtag = input("Please enter a hashtag: ")
    search_user = input("Please enter a user_name: ")
    start_at = input("Please enter a start date(format:yyyy-mm-dd hh:mm:ss): ")
    end_at = input("Please enter a end date(format:yyyy-mm-dd hh:mm:ss): ")
    # screen name
    if search_user != "":
        # my sql
        cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users WHERE screen_name = %s;", (search_user,))
        uer = pd.DataFrame(cur.fetchall(),columns=["userid_str", "screen_name", "name", "followers_count"])
        # mongodb
        userid_str = uer["userid_str"].tolist()
        myquery["user.id_str"] = {"$in":userid_str}
    else:
        cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users;")
        uer = pd.DataFrame(cur.fetchall(),columns=["userid_str", "screen_name", "name", "followers_count"])
    if search_keyword != "":
        myquery["$text"] = {"$search": search_keyword}
    if search_hashtag != "":
        myquery["entities.hashtags.text"] = search_hashtag
    # time range: add whichever bounds were supplied
    created_range = {}
    if start_at != "":
        created_range['$gte'] = start_at
    if end_at != "":
        created_range['$lte'] = end_at
    if created_range:
        myquery['created_at'] = created_range
    twts = pd.DataFrame(tweets.find(myquery).sort([("favorite_count", pymongo.DESCENDING), ("retweet_count", pymongo.DESCENDING)]).limit(10))
    if twts.empty:
        # No match: the frame has no 'user' column to normalize.
        return twts
    #Processing: lift the author's id out of the embedded user document
    twts['userid_str'] = pd.json_normalize(twts['user'])['id_str']
    #JOIN
    testItems = pd.merge(uer, twts, on='userid_str')
    return testItems

###### take a view

In [96]:
search_all_RNR()

Please enter a keyword:  
Please enter a hashtag:  
Please enter a user_name:  
Please enter a start date(format:yyyy-mm-dd hh:mm:ss):  
Please enter a end date(format:yyyy-mm-dd hh:mm:ss):  


Unnamed: 0,userid_str,screen_name,name,favourites_count,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,1091660129894838272,milliteknoloj,Milli teknoloji,352,644b36afa1b284129a82b317,"{'id': 1091660129894838272, 'id_str': '1091660...",@VinceMcMahon @TripleH We hereby honor to anno...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022773598572544,VinceMcMahon,1222639789.0,1.253768179517657e+18,,0,0,0,0,en
1,1120761000561606656,cloudseokjinie,?,1957,644b36afa1b284129a82b31e,"{'id': 1120761000561606656, 'id_str': '1120761...",tony montana yoongi live #kpop trump bp lisa d...,"{'hashtags': [{'text': 'kpop', 'indices': [25,...",2020-04-25,1254022781710274566,,,,,0,0,0,0,en
2,1132273796138905600,RampTheresa,Terri Kamp,29166,644b36afa1b284129a82b31a,"{'id': 1132273796138905600, 'id_str': '1132273...",Oh brother and he gave the kid his corona type...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022776752615430,,,,,0,0,0,0,en
3,2242948745,tho1965,Thomas Krause,2184,644b36afa1b284129a82b315,"{'id': 2242948745, 'id_str': '2242948745', 'na...",Schöne Runde mit dem Rennrad ✌️\n#sport #coron...,"{'hashtags': [{'text': 'sport', 'indices': [32...",2020-04-25,1254022770746372096,,,,,0,0,0,0,de
4,2929344220,RusticusArat,Ralf Schmitz,32024,644b36afa1b284129a82b316,"{'id': 2929344220, 'id_str': '2929344220', 'na...",Was sollen 150 Euro Computerzuschuss bringen? ...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022772575043586,,,,1.2540164346588488e+18,0,0,0,0,de
5,301470336,_FreshAA,BABA ALI,857,644b36afa1b284129a82b31b,"{'id': 301470336, 'id_str': '301470336', 'name...","ahap , low cut off merchants","{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254022778371571712,,,,1.2540162971611054e+18,0,0,0,0,en
6,375777294,TWD40_,TeéLaneeë?,1325,644b36afa1b284129a82b318,"{'id': 375777294, 'id_str': '375777294', 'name...",im making 17 a hr doing nothing 😭😭😭😭 i love th...,"{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254022776094105602,,,,,0,0,0,0,en
7,804046791348015107,B_King69,Bi Sex Uau,5446,644b36afa1b284129a82b314,"{'id': 804046791348015107, 'id_str': '80404679...","É isto, ou vou morrer sem ar ou com o corona h...","{'hashtags': [], 'urls': [], 'user_mentions': ...",2020-04-25,1254022770679320576,,,,,0,0,0,0,pt
8,865733987561381888,BritishKatieKim,ANH,244132,644b36afa1b284129a82b319,"{'id': 865733987561381888, 'id_str': '86573398...",@MichaelTouby @ChinaDaily @RVsmtown @YRFairyta...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022776207429633,MichaelTouby,7.196255706157588e+17,1.2540135434705633e+18,,0,0,0,0,en
9,923169415054680064,QuantumAspect,Quantum,119803,644b36afa1b284129a82b31d,"{'id': 923169415054680064, 'id_str': '92316941...",Weekly mortality graphs show in some European ...,"{'hashtags': [], 'urls': [{'url': 'https://t.c...",2020-04-25,1254022780695252993,,,,,0,0,0,0,en


#### 3.3.2 using cache

In [14]:
def search_all_C():
    """Prompt for every search criterion, reusing the preloaded ``users`` frame.

    Blank answers are ignored. A screen name is resolved to user ids through
    MySQL; without one, the module-level ``users`` frame is used for the join
    instead of re-reading the table. Keyword, hashtag and date bounds are
    combined into one MongoDB query.

    Returns:
        DataFrame of the first ten tweets ordered by ``favorite_count`` then
        ``retweet_count`` (descending), merged with the user rows on
        ``userid_str``; an empty DataFrame when no tweet matches.
    """
    myquery={}
    search_keyword = input("Please enter a keyword: ")
    search_hashtag = input("Please enter a hashtag: ")
    search_user = input("Please enter a user_name: ")
    start_at = input("Please enter a start date(format:yyyy-mm-dd hh:mm:ss): ")
    end_at = input("Please enter a end date(format:yyyy-mm-dd hh:mm:ss): ")
    # screen name
    if search_user != "":
        # my sql
        cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users WHERE screen_name = %s;", (search_user,))
        uer = pd.DataFrame(cur.fetchall(),columns=["userid_str", "screen_name", "name", "followers_count"])
        # mongodb
        userid_str = uer["userid_str"].tolist()
        myquery["user.id_str"] = {"$in":userid_str}
    else:
        uer = users
    if search_keyword != "":
        myquery["$text"] = {"$search": search_keyword}
    if search_hashtag != "":
        myquery["entities.hashtags.text"] = search_hashtag
    # time range: add whichever bounds were supplied
    created_range = {}
    if start_at != "":
        created_range['$gte'] = start_at
    if end_at != "":
        created_range['$lte'] = end_at
    if created_range:
        myquery['created_at'] = created_range
    twts = pd.DataFrame(tweets.find(myquery).sort([("favorite_count", pymongo.DESCENDING), ("retweet_count", pymongo.DESCENDING)]).limit(10))
    if twts.empty:
        # No match: the frame has no 'user' column to normalize.
        return twts
    #Processing: lift the author's id out of the embedded user document
    twts['userid_str'] = pd.json_normalize(twts['user'])['id_str']
    #JOIN
    testItems = pd.merge(uer, twts, on='userid_str')
    return testItems

###### take a view

In [15]:
search_all_C()

Please enter a keyword: 
Please enter a hashtag: 
Please enter a user_name: tho1965
Please enter a start date(format:yyyy-mm-dd hh:mm:ss): 
Please enter a end date(format:yyyy-mm-dd hh:mm:ss): 


Unnamed: 0,userid_str,screen_name,name,favourites_count,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,2242948745,tho1965,Thomas Krause,2184,644aeb6165c7444702bc28fd,"{'id_str': '2242948745', 'created_at': '2013-1...",Schöne Runde mit dem Rennrad ✌️\n#sport #coron...,"{'hashtags': [{'text': 'sport', 'indices': [32...",2020-04-25,1254022770746372096,,,,,0,0,0,0,de


## 3.4 time test

#### for uncached data

In [31]:
def test_uncached_performance():
    """Run the combined search without any cache and print how long it took.

    The prompts are answered before the clock starts, so only the database
    work and the join are timed. Blank answers are ignored; without a screen
    name the whole ``users`` table is fetched from MySQL for the join.

    Returns:
        DataFrame of the first ten tweets ordered by ``favorite_count`` then
        ``retweet_count`` (descending), merged with the user rows on
        ``userid_str``; an empty DataFrame when no tweet matches.
    """
    myquery={}
    search_keyword = input("Please enter a keyword: ")
    search_hashtag = input("Please enter a hashtag: ")
    search_user = input("Please enter a user_name: ")
    start_at = input("Please enter a start date(format:yyyy-mm-dd hh:mm:ss): ")
    end_at = input("Please enter a end date(format:yyyy-mm-dd hh:mm:ss): ")
    # search start (perf_counter is a monotonic clock meant for measuring intervals)
    start_time = time.perf_counter()
    # screen name
    if search_user != "":
        # my sql
        cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users WHERE screen_name = %s;", (search_user,))
        uer = pd.DataFrame(cur.fetchall(),columns=["userid_str", "screen_name", "name", "followers_count"])
        # mongodb
        userid_str = uer["userid_str"].tolist()
        myquery["user.id_str"] = {"$in":userid_str}
    else:
        cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users;")
        uer = pd.DataFrame(cur.fetchall(),columns=["userid_str", "screen_name", "name", "followers_count"])
    if search_keyword != "":
        myquery["$text"] = {"$search": search_keyword}
    if search_hashtag != "":
        myquery["entities.hashtags.text"] = search_hashtag
    # time range: add whichever bounds were supplied
    created_range = {}
    if start_at != "":
        created_range['$gte'] = start_at
    if end_at != "":
        created_range['$lte'] = end_at
    if created_range:
        myquery['created_at'] = created_range
    twts = pd.DataFrame(tweets.find(myquery).sort([("favorite_count", pymongo.DESCENDING), ("retweet_count", pymongo.DESCENDING)]).limit(10))
    if twts.empty:
        # No match: the frame has no 'user' column to normalize, return it as is.
        testItems = twts
    else:
        #Processing: lift the author's id out of the embedded user document
        twts['userid_str'] = pd.json_normalize(twts['user'])['id_str']
        #JOIN
        testItems = pd.merge(uer, twts, on='userid_str')
    # search end
    end_time = time.perf_counter()
    # This function measures the path that does not use the cache.
    print(f''' searching in uncached information took {end_time - start_time:.4f} seconds''')
    return testItems

In [38]:
test_uncached_performance() # first time

Please enter a keyword:  
Please enter a hashtag:  
Please enter a user_name:  DVSadanandGowda
Please enter a start date(format:yyyy-mm-dd hh:mm:ss):  
Please enter a end date(format:yyyy-mm-dd hh:mm:ss):  


 searching in cached information took 0.0686 seconds


Unnamed: 0,userid_str,screen_name,name,followers_count,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,2381667174,DVSadanandGowda,Sadananda Gowda,379817,644b36d0a1b284129a8319aa,"{'id': 2381667174, 'id_str': '2381667174', 'na...",#Relief work in #BengaluruNorth\nಬ್ಯಾಟರಾಯನಪುರ ...,"{'hashtags': [{'text': 'Relief', 'indices': [0...",2020-04-25,1254047066914840576,,,,,0,0,0,0,kn


In [44]:
test_uncached_performance() # fourth time

Please enter a keyword:  
Please enter a hashtag:  
Please enter a user_name:  DVSadanandGowda
Please enter a start date(format:yyyy-mm-dd hh:mm:ss):  
Please enter a end date(format:yyyy-mm-dd hh:mm:ss):  


 searching in cached information took 0.0434 seconds


Unnamed: 0,userid_str,screen_name,name,followers_count,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,2381667174,DVSadanandGowda,Sadananda Gowda,379817,644b36d0a1b284129a8319aa,"{'id': 2381667174, 'id_str': '2381667174', 'na...",#Relief work in #BengaluruNorth\nಬ್ಯಾಟರಾಯನಪುರ ...,"{'hashtags': [{'text': 'Relief', 'indices': [0...",2020-04-25,1254047066914840576,,,,,0,0,0,0,kn


#### for cached data

In [39]:
def test_cached_performance():
    myquery={}
    search_keyword = input("Please enter a keyword: ")
    search_hashtag = input("Please enter a hashtag: ")
    search_user = input("Please enter a user_name: ")
    start_at = input("Please enter a start date(format:yyyy-mm-dd hh:mm:ss): ")
    end_at = input("Please enter a end date(format:yyyy-mm-dd hh:mm:ss): ")
    # search start
    start_time = time.time()
    # search in NRDB
    # screen name
    if search_user != "":
        # MySQL & Cache: user in cache or not in cache
        if search_user in cache:
            # cache hit: get user data from cache directly
            user_data = pd.DataFrame.from_dict(cache.get(search_user), orient = "index").T
            print("in cache")
        else:
            print("not in cache")
            # cache miss: get data from database
            cur.execute("SELECT userid_str, screen_name, name, followers_count FROM users WHERE screen_name = %s;", (search_user,))
            user_data = pd.DataFrame(cur.fetchall(),columns=["userid_str", "screen_name", "name", "followers_count"])
            # Add user data to cache
            user_data_dict = user_data.to_dict("index")[0]
            cache.set(search_user, user_data_dict)
            # del cache.data[user_id]
            # del cache.access_count[user_id]
        # MongoDB
        userid_str = user_data["userid_str"].tolist()
        myquery["user.id_str"] = {"$in":userid_str}
    if search_keyword != "":
        myquery["$text"] = {"$search": search_keyword}
    if search_hashtag != "":
        myquery["entities.hashtags.text"] = search_hashtag
    # time range
    if start_at and end_at != "":
        myquery['created_at'] = {'$gte': start_at, '$lte': end_at}
    elif start_at != "":
        myquery['created_at'] = {'$gte': start_at}
    elif end_at != "":
        myquery['created_at'] = {'$lte': end_at}   
    twts = pd.DataFrame(tweets.find(myquery).sort([("favorite_count", pymongo.DESCENDING), ("retweet_count", pymongo.DESCENDING)]).limit(10))
    #Processing
    twts['userid_str'] = pd.json_normalize(twts['user'])['id_str']
    #JOIN
    testItems = pd.merge(user_data, twts, on='userid_str')
    # search end
    end_time = time.time()      
    print(f''' searching in cached information took {end_time - start_time:.4f} seconds''')
    return testItems

In [40]:
test_cached_performance() # first run: user is not in the cache yet, so it is read from MySQL and then cached

Please enter a keyword:  
Please enter a hashtag:  
Please enter a user_name:  DVSadanandGowda
Please enter a start date(format:yyyy-mm-dd hh:mm:ss):  
Please enter a end date(format:yyyy-mm-dd hh:mm:ss):  


not in cache
 searching in cached information took 0.0561 seconds


Unnamed: 0,userid_str,screen_name,name,followers_count,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,2381667174,DVSadanandGowda,Sadananda Gowda,379817,644b36d0a1b284129a8319aa,"{'id': 2381667174, 'id_str': '2381667174', 'na...",#Relief work in #BengaluruNorth\nಬ್ಯಾಟರಾಯನಪುರ ...,"{'hashtags': [{'text': 'Relief', 'indices': [0...",2020-04-25,1254047066914840576,,,,,0,0,0,0,kn


In [43]:
test_cached_performance() # second run: same user is now served from the cache (prints "in cache")

Please enter a keyword:  
Please enter a hashtag:  
Please enter a user_name:  DVSadanandGowda
Please enter a start date(format:yyyy-mm-dd hh:mm:ss):  
Please enter a end date(format:yyyy-mm-dd hh:mm:ss):  


in cache
 searching in cached information took 0.0366 seconds


Unnamed: 0,userid_str,screen_name,name,followers_count,_id,user,text,entities,created_at,id_str,in_reply_to_screen_name,in_reply_to_user_id_str,in_reply_to_status_id_str,quoted_status_id_str,quote_count,reply_count,retweet_count,favorite_count,lang
0,2381667174,DVSadanandGowda,Sadananda Gowda,379817,644b36d0a1b284129a8319aa,"{'id': 2381667174, 'id_str': '2381667174', 'na...",#Relief work in #BengaluruNorth\nಬ್ಯಾಟರಾಯನಪುರ ...,"{'hashtags': [{'text': 'Relief', 'indices': [0...",2020-04-25,1254047066914840576,,,,,0,0,0,0,kn
