# Process data & Store into Redis

In [340]:
import json
import os
import pandas as pd
import time
import datetime
from datetime import datetime, date

## process data
Select the attributes to be stored.

In [103]:
# selected attributes in tweets
twt_attributes = ['created_at',
                       'id_str',
                       'text',
                       'in_reply_to_status_id_str',
                       'in_reply_to_user_id_str',
                       'in_reply_to_screen_name',
                       'quoted_status_id_str',
                       # 'quoted_status',
                       'quote_count',
                       'reply_count',
                       'retweet_count',
                       'favorite_count',
                       'entities.hashtags',
                       'extended_entities',
                       'user.id_str', # for matching
                       'user.name',
                       'user.screen_name',
                       'user.url',
                       'user.description',
                       'user.protected',
                       'user.verified',
                       'user.followers_count',
                       'user.friends_count',
                       'user.listed_count',
                       'user.favourites_count',
                       'user.statuses_count',
                       'user.created_at'
                      ]
len(twt_attributes) # 26

26

In [104]:
# so the selected retweets attributes has an extra "retweeted_status.id_str"
retwt_attributes = ['created_at',
                    'id_str',
                    'text',
                    'in_reply_to_status_id_str',
                    'in_reply_to_user_id_str',
                    'in_reply_to_screen_name',
                    'quoted_status_id_str',
                    # 'quoted_status',
                    'quote_count',
                    'reply_count',
                    'retweet_count',
                    'favorite_count',
                    'entities.hashtags',
                    'extended_entities',
                    'retweeted_status.id_str',
                    'user.id_str', # for matching
                    'user.name',
                    'user.screen_name',
                    'user.url',
                    'user.description',
                    'user.protected',
                    'user.verified',
                    'user.followers_count',
                    'user.friends_count',
                    'user.listed_count',
                    'user.favourites_count',
                    'user.statuses_count',
                    'user.created_at'
                   ]
len(retwt_attributes) # 27

27

In [105]:
# selected attributes in user
user_attr_tracking = ['id_str',
                      'name',
                      'screen_name',
                      'url',
                      'description',
                      'protected',
                      'verified',
                      'followers_count',
                      'friends_count',
                      'listed_count',
                      'favourites_count',
                      'statuses_count',
                      'created_at']

len(user_attr_tracking) #13

13

### user_store function

In [106]:
def user_store(user, userid_table):
    
    # if user has not been seen before, add id to userid_table
    if user['id_str'] not in userid_table:
        userid_table.append(user['id_str'])
        
        # update datastores with defined tracking attributes
        user_attr_selected = {key: user[key] for key in user_attr_tracking if key in user}
        user_list.append(user_attr_selected)

In [107]:
# test test

twt_id_record = []
userid_table = [] # store userid
user_list = [] # store every attributes of user needed tracking

with open("./data/corona-out-3", "r") as f1:
    for line in f1:
        try:
            data = json.loads(line)
            if data["id_str"] not in twt_id_record:
                twt_id_record.append(data["id_str"])
            # if tweet has been seen before 
            else:
                # ignore this tweet, go to next iteration of loop
                continue
            
            user = data['user']
            user_store(user, userid_table)
        except:
            continue

In [108]:
len(user_list)

80943

### tweets / retweet store

In [125]:
# this is the whole procedure of extract tweets/retweets to store into Cache from original dataset

twt_id_record = []
userid_table = [] # store userid
user_list = [] # store tracking attributes of user
retwt_list = [] # list to store retweets
retwt_lines = 0 # count # of retweets
twt_list = []
twt_lines = 0

with open("./data/corona-out-3", "r") as f1:
    for line in f1:
        try:
            data = json.loads(line)
            if data["id_str"] not in twt_id_record:
                twt_id_record.append(data["id_str"])
            # if tweet has been seen before 
            else:
                # ignore this tweet, go to next iteration of loop
                continue
            
            user = data['user']
            user_store(user, userid_table)
            
            
            if ( data['text'].startswith('RT') ):
                # try:
                    # update retweet information
                    selected_dict = {}
                    retwt_lines = retwt_lines + 1
                    for attribute in retwt_attributes:
                        try:
                            # assigns the reference: any changes made to one will affect the other
                            current_dict = data
                            current_selected_dict = selected_dict
                            # parent-child attributes
                            for sub_attribute in attribute.split('.'):
                                # parent attributes
                                current_dict = current_dict[sub_attribute]
                                # child attributes
                                if sub_attribute == attribute.split('.')[-1]:
                                    current_selected_dict[sub_attribute] = current_dict
                                else:
                                    if sub_attribute not in current_selected_dict:
                                        current_selected_dict[sub_attribute] = {}
                                    current_selected_dict = current_selected_dict[sub_attribute]
                        except:
                            continue
                    retwt_list.append(selected_dict)
                # except:
                    # continue

                # pass  # does nothing right now
                
            else:
                # add the new tweet to datastore
                    selected_dict = {}
                    twt_lines = twt_lines + 1
                    for attribute in twt_attributes:
                        try:
                            # assigns the reference: any changes made to one will affect the other
                            current_dict = data
                            current_selected_dict = selected_dict
                            # parent-child attributes
                            for sub_attribute in attribute.split('.'):
                                # parent attributes
                                current_dict = current_dict[sub_attribute]
                                # child attributes
                                if sub_attribute == attribute.split('.')[-1]:
                                    current_selected_dict[sub_attribute] = current_dict
                                else:
                                    if sub_attribute not in current_selected_dict:
                                        current_selected_dict[sub_attribute] = {}
                                    current_selected_dict = current_selected_dict[sub_attribute]
                        except:
                            continue
                    twt_list.append(selected_dict)
        except:
            continue

In [126]:
def parse_date(date_string):
    # parse the created_at attributes to datetime formart
    
    date_format = "%a %b %d %H:%M:%S %z %Y"
    date_object = time.strptime(date_string, date_format)
    
    year = date_object.tm_year
    month = date_object.tm_mon
    day = date_object.tm_mday
    
    new_date_object = datetime(year=year, month=month, day=day)
    formatted_date = new_date_object.strftime("%Y-%m-%d")
    
    return(formatted_date)

In [127]:
# parse created_at in twt_list into datetime format

for i in range(len(twt_list)):
    twt_list[i]['created_at'] = parse_date(twt_list[i]['created_at'])
    twt_list[i]['user']['created_at'] = parse_date(twt_list[i]['user']['created_at'])

In [128]:
# parse created_at in retwt_list into datetime format

for i in range(len(retwt_list)):
    retwt_list[i]['created_at'] = parse_date(retwt_list[i]['created_at'])
    retwt_list[i]['user']['created_at'] = parse_date(retwt_list[i]['user']['created_at'])

In [129]:
len(userid_table)

80943

In [130]:
len(user_list)

80943

In [131]:
len(twt_list)

40793

In [132]:
user_list[1]

{'id_str': '2242948745',
 'name': 'Thomas Krause',
 'screen_name': 'tho1965',
 'url': None,
 'description': 'Sportredakteur @nordkurier 🏃\u200d♂️🚴\u200d♂️⚽️',
 'protected': False,
 'verified': False,
 'followers_count': 173,
 'friends_count': 685,
 'listed_count': 9,
 'favourites_count': 2184,
 'statuses_count': 1865,
 'created_at': 'Wed Dec 25 09:13:33 +0000 2013'}

In [133]:
twt_lines

40793

In [134]:
len(twt_list)

40793

In [135]:
twt_list[1]

{'created_at': '2020-04-25',
 'id_str': '1254022770746372096',
 'text': 'Schöne Runde mit dem Rennrad ✌️\n#sport #corona https://t.co/Uglj9YJPI1',
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'quote_count': 0,
 'reply_count': 0,
 'retweet_count': 0,
 'favorite_count': 0,
 'entities': {'hashtags': [{'text': 'sport', 'indices': [32, 38]},
   {'text': 'corona', 'indices': [39, 46]}]},
 'extended_entities': {'media': [{'id': 1254022763834155008,
    'id_str': '1254022763834155008',
    'indices': [47, 70],
    'media_url': 'http://pbs.twimg.com/media/EWcvD0KWoAAIMuu.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/EWcvD0KWoAAIMuu.jpg',
    'url': 'https://t.co/Uglj9YJPI1',
    'display_url': 'pic.twitter.com/Uglj9YJPI1',
    'expanded_url': 'https://twitter.com/tho1965/status/1254022770746372096/photo/1',
    'type': 'photo',
    'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
     'small': {'w': 680, 'h': 680,

In [136]:
retwt_lines

61101

In [137]:
len(retwt_list)

61101

In [138]:
retwt_list[1]

{'created_at': '2020-04-25',
 'id_str': '1254022772877131777',
 'text': 'RT @schrodingerk42: @ozkan_yalim @DurmusYillmaz \nAçık kapalı görüşler yasak olduğu için sadece telefon görüşlerinde kendisinden haber alabi…',
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'quote_count': 0,
 'reply_count': 0,
 'retweet_count': 0,
 'favorite_count': 0,
 'entities': {'hashtags': []},
 'retweeted_status': {'id_str': '1252576316135739392'},
 'user': {'id_str': '1206650133976408064',
  'name': 'Büşra Öztaş',
  'screen_name': 'schrodingerk42',
  'url': None,
  'description': None,
  'protected': False,
  'verified': False,
  'followers_count': 318,
  'friends_count': 220,
  'listed_count': 0,
  'favourites_count': 1974,
  'statuses_count': 405,
  'created_at': '2019-12-16'}}

In [139]:
# # # encapsulation test: don't know why it didn't work as a function

# # store tweets/retweets data & count nums
# def retwt_store(data):
#     # list for store tweets/retweets data
#     selected_dict = {}
#     retwt_lines = retwt_lines + 1
#     for attribute in retwt_attributes:
#         try:
#             # assigns the reference: any changes made to one will affect the other
#             current_dict = data
#             current_selected_dict = selected_dict
#             # parent-child attributes
#             for sub_attribute in attribute.split('.'):
#                 # parent attributes
#                 current_dict = current_dict[sub_attribute]
#                 # child attributes
#                 if sub_attribute == attribute.split('.')[-1]:
#                     current_selected_dict[sub_attribute] = current_dict
#                 else:
#                     if sub_attribute not in current_selected_dict:
#                         current_selected_dict[sub_attribute] = {}
#                     current_selected_dict = current_selected_dict[sub_attribute]
#         except:
#             continue
#     retwt_list.append(selected_dict)

## Store in Redis (test: user_list; tweet)

In [341]:
import redis # for cache
import pymongo # for NoSQL data store

from redis.commands.json.path import Path
import redis.commands.search.aggregation as aggregations
import redis.commands.search.reducers as reducers
from redis.commands.search.field import TextField, NumericField, TagField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import NumericFilter, Query

# connect to a Redis cluster
from redis.cluster import RedisCluster
from string import ascii_letters
import random

In [None]:
# for remote connection
# redis_client = redis.Redis(host='172.31.139.108', port=6379, decode_responses=True, password = "7ptbtptp")
# redis_client = redis.Redis(host=172.31.139.108, port=6379, decode_responses=True, password = "7ptbtptp")

In [363]:
# connect to redis
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True, password = "7ptbtptp")
redis_pool = redis.ConnectionPool(host='localhost', port=6379, decode_responses=True, password = "7ptbtptp")

In [364]:
redis_client.config_set('maxmemory', '50mb') # ~ 20000 entries
redis_client.config_set('maxmemory-policy', 'volatile-lfu') # eviction policy determines what happens when Redis reaches its memory limit
# volatile-lfu: Evicts the least frequently used keys from those that have a TTL set.

True

In [448]:
# initiate: remove all the data exists.
for key in redis_client.keys():
    redis_client.delete(key)

In [449]:
# check keys in the Redis database
len(redis_client.keys())

0

In [367]:
import time
import datetime
from datetime import datetime, date
from typing import List, Optional
# Redis OM uses Pydantic to validate data based on the type annotations assigned to fields in a model class.
from pydantic import BaseModel, NonNegativeInt, HttpUrl, AnyUrl, ValidationError # every Redis OM model is also a Pydantic model
from redis_om import (EmbeddedJsonModel, Field, JsonModel, HashModel, Migrator, NotFoundError, get_redis_connection)

[Field Types supported by Pydantic](https://docs.pydantic.dev/usage/types/)

In [368]:
# del twt_user_model # for initiating
# del InnerHashtagModel
# del EntitiesModel
# del twt_model

In [445]:
# build an embedded sub-model twt_user with Redis OM

class twt_user_model(EmbeddedJsonModel):
    id_str: str = Field(index=True)
    name: str = Field(index=True, full_text_search=True)
    screen_name: str = Field(index=True, full_text_search=True)
    url: Optional[AnyUrl]
    description: Optional[str] = Field(index=False, full_text_search=True) # perform full text searches on the values
    protected: Optional[bool]
    verified: Optional[bool]
    # followers_count: Optional[NonNegativeInt] = Field(index=True)
    # friends_count: Optional[NonNegativeInt] = Field(index=True)
    followers_count: int = Field(index=True)
    friends_count: int = Field(index=True)
    listed_count: Optional[NonNegativeInt]
    favourites_count: Optional[NonNegativeInt]
    statuses_count: Optional[NonNegativeInt] = Field(index=True)
    created_at: date = Field(index=True)
    
    class Meta:
        # A redis.asyncio.Redis or redis.Redis client instance that the model will use to communicate with Redis.
        database = get_redis_connection(password = "7ptbtptp")

In [446]:
# ver.2 build embedded sub-models for hashtags in entities

class InnerHashtagModel(EmbeddedJsonModel):
    text: Optional[str] = Field(index=True)
    indices: Optional[List[int]]
    
    class Meta:
        database = get_redis_connection(password = "7ptbtptp")
        index_fields = ['text']

# class EntitiesModel(EmbeddedJsonModel):
#     hashtags: List[InnerHashtagModel]


class EntitiesModel(EmbeddedJsonModel):
    hashtags: List[InnerHashtagModel]
    
    class Meta:
        # A redis.asyncio.Redis or redis.Redis client instance that the model will use to communicate with Redis.
        database = get_redis_connection(password = "7ptbtptp")

In [371]:
twt_list[1]

{'created_at': '2020-04-25',
 'id_str': '1254022770746372096',
 'text': 'Schöne Runde mit dem Rennrad ✌️\n#sport #corona https://t.co/Uglj9YJPI1',
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'quote_count': 0,
 'reply_count': 0,
 'retweet_count': 0,
 'favorite_count': 0,
 'entities': {'hashtags': [{'text': 'sport', 'indices': [32, 38]},
   {'text': 'corona', 'indices': [39, 46]}]},
 'extended_entities': {'media': [{'id': 1254022763834155008,
    'id_str': '1254022763834155008',
    'indices': [47, 70],
    'media_url': 'http://pbs.twimg.com/media/EWcvD0KWoAAIMuu.jpg',
    'media_url_https': 'https://pbs.twimg.com/media/EWcvD0KWoAAIMuu.jpg',
    'url': 'https://t.co/Uglj9YJPI1',
    'display_url': 'pic.twitter.com/Uglj9YJPI1',
    'expanded_url': 'https://twitter.com/tho1965/status/1254022770746372096/photo/1',
    'type': 'photo',
    'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
     'small': {'w': 680, 'h': 680,

### twt_model for original tweets

In [447]:
# build a twt_model (JsonModel) for original tweet data
class twt_model(JsonModel):
    created_at: date = Field(index=True) # so many outdated docs...finally find this...orz
    id_str: str = Field(index=True)
    text: str = Field(index=True, full_text_search=True) # perform full text searches on the values
    in_reply_to_status_id_str: Optional[str] = Field(index=False, full_text_search=True)
    in_reply_to_user_id_str: Optional[str] = Field(index=False, full_text_search=True)
    in_reply_to_screen_name: Optional[str] = Field(index=False, full_text_search=True)
    quoted_status_id_str: Optional[str] = Field(index=False, full_text_search=True)
    # quoted_status: Optional[str]
    # quote_count: Optional[NonNegativeInt] = Field(index=True)
    # reply_count: Optional[NonNegativeInt] = Field(index=True)
    # retweet_count: Optional[NonNegativeInt] = Field(index=True)
    # favorite_count: Optional[NonNegativeInt] = Field(index=True)
    quote_count: int = Field(index=True)
    reply_count: int = Field(index=True)
    retweet_count: int = Field(index=True)
    favorite_count: int = Field(index=True)
    # entities: Optional[dict] = Field(index=True)
    entities: EntitiesModel
    extended_entities: Optional[dict]
    user: twt_user_model
    
    class Meta:
        # A redis.asyncio.Redis or redis.Redis client instance that the model will use to communicate with Redis.
        database = get_redis_connection(password = "7ptbtptp")

In [373]:
# figure out what to store in cache, based on the query

In [450]:
# store twt_list data(1/10) into Redis line-wise

for i in range(int(len(twt_list)/10)):
    new_twt = twt_model(**twt_list[i])
    new_twt.save()
    new_twt.expire(24 * 3600) # time to live: 1 day

In [451]:
# len(twt_list) # 40793

In [452]:
# use Migrator to create the indexes for any models that have indexed fields
Migrator().run()

In [453]:
# check storage status
len(redis_client.keys()) # 40793/10

4084

### retwt_model for retweets

In [None]:
# TBA...

## Querying: Querying for Models With Expressions | by & select

### by pk

In [393]:
# twt_model query by key

# twt_model.get("01GYVBCYF7PV4NR6R7BGMKP0YQ").dict() # Models generate a globally unique primary key automatically without needing to talk to Redis.

### by user

In [463]:
redis_client.get("01GYWT4GA4Z6NP80NH88Y1QYJC")

In [411]:
def rsearch_by_screen_name(name):
    # by screen_name
      try:
            person = twt_model.find(twt_model.user.screen_name == name).all()
            for i in range(len(person)):
                print(test[i])
        
      except NotFoundError:
        return "No such content"

In [413]:
rsearch_by_screen_name("tho1965")

pk='01GYWT4BD8QSC2Q9GWQXBPPN59' created_at=datetime.date(2020, 4, 25) id_str='1254023924314185735' text='Corona Vaccine Ka Tajurba Bandaron (Monkey) Pr Kamiyab.\n*\nChin Ne Corona Vaccine Ki Testing K Liye Pakistan Se Madad Mang Li.' in_reply_to_status_id_str=None in_reply_to_user_id_str=None in_reply_to_screen_name=None quoted_status_id_str=None quote_count=0 reply_count=0 retweet_count=0 favorite_count=0 entities=EntitiesModel(pk='01GYWT4BD8NQXRSB6BDX78222J', hashtags=[]) extended_entities=None user=twt_user_model(pk='01GYWT4BD8BFNRJD4ZDVV9VTH5', id_str='635137913', name='GeoTankTez Official', screen_name='GeoTankTez', url=None, description='Welcome To GeoTankTez Thankyou for activation.For  Breaking News , On your Mobile Also, Type Follow GeoTankTez And send to 40404.Ph# (03469491857)', protected=False, verified=False, followers_count=1932, friends_count=11, listed_count=5, favourites_count=5, statuses_count=15048, created_at=datetime.date(2012, 7, 14))


In [395]:
twt_model.find(twt_model.user.screen_name == "GeoTankTez").all()

[twt_model(pk='01GYWT4BD8QSC2Q9GWQXBPPN59', created_at=datetime.date(2020, 4, 25), id_str='1254023924314185735', text='Corona Vaccine Ka Tajurba Bandaron (Monkey) Pr Kamiyab.\n*\nChin Ne Corona Vaccine Ki Testing K Liye Pakistan Se Madad Mang Li.', in_reply_to_status_id_str=None, in_reply_to_user_id_str=None, in_reply_to_screen_name=None, quoted_status_id_str=None, quote_count=0, reply_count=0, retweet_count=0, favorite_count=0, entities=EntitiesModel(pk='01GYWT4BD8NQXRSB6BDX78222J', hashtags=[]), extended_entities=None, user=twt_user_model(pk='01GYWT4BD8BFNRJD4ZDVV9VTH5', id_str='635137913', name='GeoTankTez Official', screen_name='GeoTankTez', url=None, description='Welcome To GeoTankTez Thankyou for activation.For  Breaking News , On your Mobile Also, Type Follow GeoTankTez And send to 40404.Ph# (03469491857)', protected=False, verified=False, followers_count=1932, friends_count=11, listed_count=5, favourites_count=5, statuses_count=15048, created_at=datetime.date(2012, 7, 14)))]

In [621]:
# test = twt_model.find(twt_model.user.screen_name == "Sumaj_Warmi").all()
# test[0].entities["hashtags"][0]['text'] # extract text from entities: dict -> list -> dict

In [379]:
twt_model.find(twt_model.user.id_str == "635137913").all()

[twt_model(pk='01GYWT4BD8QSC2Q9GWQXBPPN59', created_at=datetime.date(2020, 4, 25), id_str='1254023924314185735', text='Corona Vaccine Ka Tajurba Bandaron (Monkey) Pr Kamiyab.\n*\nChin Ne Corona Vaccine Ki Testing K Liye Pakistan Se Madad Mang Li.', in_reply_to_status_id_str=None, in_reply_to_user_id_str=None, in_reply_to_screen_name=None, quoted_status_id_str=None, quote_count=0, reply_count=0, retweet_count=0, favorite_count=0, entities=EntitiesModel(pk='01GYWT4BD8NQXRSB6BDX78222J', hashtags=[]), extended_entities=None, user=twt_user_model(pk='01GYWT4BD8BFNRJD4ZDVV9VTH5', id_str='635137913', name='GeoTankTez Official', screen_name='GeoTankTez', url=None, description='Welcome To GeoTankTez Thankyou for activation.For  Breaking News , On your Mobile Also, Type Follow GeoTankTez And send to 40404.Ph# (03469491857)', protected=False, verified=False, followers_count=1932, friends_count=11, listed_count=5, favourites_count=5, statuses_count=15048, created_at=datetime.date(2012, 7, 14)))]

In [380]:
# by screen name

twt_model.find(twt_model.user.screen_name == "tho1965").all()

[twt_model(pk='01GYWT47S36QHNQFYP25JMNJFG', created_at=datetime.date(2020, 4, 25), id_str='1254022770746372096', text='Schöne Runde mit dem Rennrad ✌️\n#sport #corona https://t.co/Uglj9YJPI1', in_reply_to_status_id_str=None, in_reply_to_user_id_str=None, in_reply_to_screen_name=None, quoted_status_id_str=None, quote_count=0, reply_count=0, retweet_count=0, favorite_count=0, entities=EntitiesModel(pk='01GYWT47S4FTP6TYQS7TNBN4X4', hashtags=[InnerHashtagModel(pk='01GYWT47SB6DQA3YYV4QW9RECK', text='sport', indices=[32, 38]), InnerHashtagModel(pk='01GYWT47SBSPSZVYK526KD86B5', text='corona', indices=[39, 46])]), extended_entities={'media': [{'id': 1254022763834155008, 'id_str': '1254022763834155008', 'indices': [47, 70], 'media_url': 'http://pbs.twimg.com/media/EWcvD0KWoAAIMuu.jpg', 'media_url_https': 'https://pbs.twimg.com/media/EWcvD0KWoAAIMuu.jpg', 'url': 'https://t.co/Uglj9YJPI1', 'display_url': 'pic.twitter.com/Uglj9YJPI1', 'expanded_url': 'https://twitter.com/tho1965/status/12540227707

### by str

In [381]:
# search by str

len(twt_model.find(twt_model.text % "happy").all())

5

### by hashtag

In [382]:
len(twt_model.find(twt_model.entities.hashtags.text == "Corona").all())

210

In [385]:
twt_model.find(twt_model.entities.hashtags.text == "BlueTwitter").all()[1]

twt_model(pk='01GYWT4FKKN9GJ4P362JXW9KZN', created_at=datetime.date(2020, 4, 25), id_str='1254025437602250754', text='@DesiPoliticks Was he trying to spread corona by urinating? #BlueTwitter Looks like he is on a "mission".', in_reply_to_status_id_str='1254003610909839360', in_reply_to_user_id_str='21211093', in_reply_to_screen_name='DesiPoliticks', quoted_status_id_str=None, quote_count=0, reply_count=0, retweet_count=0, favorite_count=0, entities=EntitiesModel(pk='01GYWT4FKKEJDTK5EKEHVP2783', hashtags=[InnerHashtagModel(pk='01GYWT4FKM8YSETMPM3GMPA920', text='BlueTwitter', indices=[60, 72])]), extended_entities=None, user=twt_user_model(pk='01GYWT4FKMNF1R3VJVK9K8VSPT', id_str='4874472577', name='bilalpha', screen_name='bilalpha', url=None, description='|Indian | |Intrested in Current Affairs | ♥ @dota2', protected=False, verified=False, followers_count=996, friends_count=441, listed_count=4, favourites_count=3521, statuses_count=2290, created_at=datetime.date(2016, 2, 7)))

In [190]:
# len(twt_model.find(twt_model.entities.hashtags.text == "noHashTag").all())

4079

In [339]:
# twt_model.find(twt_model.entities.hashtags.text == "COVID19InTurkeysPrisons").all()

[]

### by some number

In [432]:
twt_list[1]["user"]

{'id_str': '2242948745',
 'name': 'Thomas Krause',
 'screen_name': 'tho1965',
 'url': None,
 'description': 'Sportredakteur @nordkurier 🏃\u200d♂️🚴\u200d♂️⚽️',
 'protected': False,
 'verified': False,
 'followers_count': 173,
 'friends_count': 685,
 'listed_count': 9,
 'favourites_count': 2184,
 'statuses_count': 1865,
 'created_at': '2013-12-25'}

In [419]:
top_tweeter = twt_model.find(
    twt_model.user.followers_count >= 2000000).sort_by("id_str").all()

In [50]:
top_tweeter = twt_model.find(
    twt_model.user.followers_count >= 2000000).all()

In [51]:
schema = {'user_screen_name': [], 'followers_count': []}
top_tweeter_df = pd.DataFrame(schema)

for i in range(len(top_tweeter)):
    top_tweeter_df.at[i, 'user_screen_name'] = top_tweeter[i].user.screen_name
    top_tweeter_df.at[i, 'followers_count'] = top_tweeter[i].user.followers_count
    
print(top_tweeter_df)

  user_screen_name  followers_count
0    skynewsarabia        5652459.0
1    skynewsarabia        5652410.0
2       la_patilla        7121282.0
3         detikcom       15884929.0
4   casspernyovest        2702055.0


In [657]:
# # search by some number: tweeter with the mamixmum number of followers

# top_tweeter = twt_model.find(
#     twt_model.user.followers_count >= 2000000
# ).sort_by("-followers_count").all()

In [268]:
# for i in range(len(top_tweeter)):
#     print(top_tweeter[i].screen_name + "\t" + str(top_tweeter[i].followers_count) + "\n")

detikcom	15884929

la_patilla	7121282

skynewsarabia	5652410

virsanghvi	4329132

NTelevisa_com	3047122

repubblica	2992272

casspernyovest	2702055



### by time range

In [52]:
len(twt_model.find((twt_model.created_at == "2020-04-12")).all()) # all data are from 2020-04-12???

7349

In [53]:
len(twt_model.find((twt_model.user.created_at == "2017-11-29")).all())

2