In [1]:
from civirank import analyzers, parsers, rankers

import json
import numpy as np

import time
import pickle

import warnings
with warnings.catch_warnings():
    warnings.simplefilter(action='ignore', category=FutureWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load sample data
fname = "twitter_test.json"
with open(fname, "r") as fin:
    sample_data_twitter = json.load(fin)
fname = "reddit_test.json"
with open(fname, "r") as fin:
    sample_data_reddit = json.load(fin)
fname = "facebook_test.json"
with open(fname, "r") as fin:
    sample_data_facebook = json.load(fin)

In [5]:
sample_data_facebook

{'session': {'session_id': 'fqeupdgwedzoiywzfbaapkibaxwgnpsf',
  'user_id': 'ef80987c82f99d3ede090805bffd1df40473042f190ca34040bad9abddf6c73c',
  'user_name_hash': 'bceb40f9d8e2a8d706bab16f7466e7a45f8a71053eddc1257f96b15e43b739fc',
  'cohort': 'XX',
  'platform': 'facebook',
  'url': 'https://facebook.com/opwoolwt',
  'current_time': '2024-05-21T15:04:18.768722'},
 'survey': None,
 'items': [{'id': '83c1f44bd0ba196f8475fcbddb5b603fe45a972aff48b942ec953abdea07826e',
   'post_id': None,
   'parent_id': '',
   'title': None,
   'text': "GIANT GEM: One of the world's largest diamonds was up for sale today, but the auction didn't go exactly as planned. Lillian has details.",
   'author_name_hash': '4a51edb69aca8ba40dd0121e6302615c9c266bc87a37846e2e17d32c181e4747',
   'type': 'post',
   'embedded_urls': [],
   'created_at': '2017-05-11T23:11:00',
   'engagements': {'like': 58,
    'love': 2,
    'care': 0,
    'haha': 1,
    'wow': 15,
    'sad': 1,
    'angry': 0,
    'comment': 5,
    'sha

In [3]:
# individual data parsing functions for every platform
parse_funcs = {
    "twitter":parsers.parse_twitter_posts,
    "reddit":parsers.parse_reddit_posts,
    "facebook":parsers.parse_facebook_posts
}

# analyzers to calculate the individual scores
TrustworthinessAnalyzer = analyzers.TrustworthinessAnalyzer()
ToxicityAnalyzer = analyzers.ToxicityAnalyzer()
PolarizationAnalyzer = analyzers.PolarizationAnalyzer()
ProsocialityAnalyzer = analyzers.ProsocialityAnalyzer()
LexicalDensityAnalyzer = analyzers.LexicalDensityAnalyzer()

# relative weights of the scores in the final compound score
weights = {
    "no_toxicity":1,
    "no_polarization":1,
    "mtld":0.5,
    "trustworthiness":2,
    "prosociality":1
}

# scores that are considered in the compound score
scores = weights.keys()

# minimum scores necessary to calculate a compound score
min_scores = 3



In [4]:
datasets = [sample_data_twitter, sample_data_reddit, sample_data_facebook]
rankings = {}
all_posts = {}

lim = 100
print(f"{lim} posts")
for dataset in datasets:
    platform = dataset["session"]["platform"]

    #times it by platform
    tic = time.time()
    posts = parse_funcs[platform](dataset["items"], lim=lim)
    posts["trustworthiness"] = TrustworthinessAnalyzer.get_trustworthiness_scores(posts)
    posts["toxicity"] = ToxicityAnalyzer.get_toxicity_scores(posts)
    posts["polarization"] = PolarizationAnalyzer.get_polarization_similarity(posts)
    posts["prosociality"] = ProsocialityAnalyzer.get_prosociality_similarity(posts)
    posts["mtld"] = LexicalDensityAnalyzer.get_mtld(posts)

    # remove prosociality and polarization values for non-english posts
    posts.loc[posts[posts["lang"] != "en"].index, "polarization"] = np.nan
    posts.loc[posts[posts["lang"] != "en"].index, "prosociality"] = np.nan
    posts = analyzers.normalize(posts)

    # calculate the number of scores a post has
    posts["N_scores"] = posts[scores].apply(lambda x: len(x.dropna()), axis=1)

    # calculate the compound score for posts with > (min_scores - 1) scores
    posts["compound_score"] = posts[scores].apply(analyzers.calculate_compound_score, args=(weights, min_scores,), axis=1)

    # remove all posts that do not have a compound score because they received
    # too few individual scores
    posts = posts.dropna(subset=["compound_score"])
    
    # sort posts in descending order based on compound score
    posts = posts.sort_values(by="compound_score", ascending=False)
    posts = posts.reset_index(drop=True)

    rankings[platform] = list(posts["id"])
    all_posts[platform] = posts

    toc = time.time()
    print(f"{platform} parse rate: {lim/(toc-tic):.2f} posts/s ({toc-tic:.2f}s)")

100 posts


  attn_output = torch.nn.functional.scaled_dot_product_attention(


twitter parse rate: 85.19 posts/s (1.17s)
reddit parse rate: 51.48 posts/s (1.94s)
facebook parse rate: 70.39 posts/s (1.42s)


In [3]:
lim = 100
batch_size = 8
print(f"{lim} posts")
v = rankers.LocalRanker(batch_size=batch_size, lim=lim)
datasets = [sample_data_twitter, sample_data_reddit, sample_data_facebook]

tictoc = {}

for dataset in datasets:
    platform = dataset["session"]["platform"]
    
    tic = time.time()
    v2 = v.rank(dataset)
    toc = time.time()

    tictoc[platform] = toc - tic

for dataset in datasets:
    platform = dataset["session"]["platform"]
    print(f"{platform} rank rate: {lim/tictoc[platform]:.2f} posts/s ({tictoc[platform]:.2f}s)")

100 posts


[0;93m2024-06-15 05:26:21.479081532 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 05:26:21.479105447 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


twitter rank rate: 359.60 posts/s (0.28s)
reddit rank rate: 406.23 posts/s (0.25s)
facebook rank rate: 689.21 posts/s (0.15s)


In [4]:
lim = 100
batch_size = 8
print(f"{lim} posts")
v = rankers.LocalRanker(batch_size=batch_size, lim=lim)
datasets = [sample_data_twitter, sample_data_reddit, sample_data_facebook]

tictoc = {}

for dataset in datasets:
    platform = dataset["session"]["platform"]
    
    tic = time.time()
    v2 = v.rank(dataset)
    toc = time.time()

    tictoc[platform] = toc - tic

for dataset in datasets:
    platform = dataset["session"]["platform"]
    print(f"{platform} rank rate: {lim/tictoc[platform]:.2f} posts/s ({tictoc[platform]:.2f}s)")

100 posts


[0;93m2024-06-17 17:36:07.558849763 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-17 17:36:07.558877416 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


twitter rank rate: 300.89 posts/s (0.33s)
reddit rank rate: 393.14 posts/s (0.25s)
facebook rank rate: 638.11 posts/s (0.16s)


In [3]:
import timeit
import numpy as np

lim = 100
batch_size = 8
num_runs = 10  # Define the number of runs for averaging
print(f"{lim} posts")
v = rankers.LocalRanker(batch_size=batch_size, lim=lim)
datasets = [sample_data_twitter, sample_data_reddit, sample_data_facebook]

tictoc = {}

for dataset in datasets:
    platform = dataset["session"]["platform"]
    
    # Measure the time taken to rank the dataset multiple times using timeit
    times = timeit.repeat(lambda: v.rank(dataset), number=1, repeat=num_runs)
    
    # Calculate the average and standard deviation
    avg_time = np.mean(times)
    std_dev_time = np.std(times)
    
    tictoc[platform] = {'avg_time': avg_time, 'std_dev_time': std_dev_time}

# Print the results
for dataset in datasets:
    platform = dataset["session"]["platform"]
    avg_time = tictoc[platform]['avg_time']
    std_dev_time = tictoc[platform]['std_dev_time']
    print(f"{platform} rank rate: {lim/avg_time:.2f} posts/s (avg: {avg_time:.2f}s, std dev: {std_dev_time:.2f}s)")



100 posts


[0;93m2024-06-15 05:43:57.484932140 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 05:43:57.484956656 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


twitter rank rate: 1051.97 posts/s (avg: 0.10s, std dev: 0.02s)
reddit rank rate: 649.70 posts/s (avg: 0.15s, std dev: 0.01s)
facebook rank rate: 652.98 posts/s (avg: 0.15s, std dev: 0.02s)


In [4]:
# glove, wsl, unbiased, onnx, XLM
lim = 100
v = rankers.LocalRankerEN(lim=lim)
tic = time.time()
v2 = v.rank(sample_data_reddit)
toc = time.time()
print(f"Local ranker rate: {lim/(toc-tic):.2f} posts/s ({toc-tic:.2f}s)")

[0;93m2024-06-19 01:44:30.822015115 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-19 01:44:30.822045953 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Local ranker rate: 18.46 posts/s (5.42s)


In [6]:
# glove, wsl, unbiased, onnx
lim = 100
v = rankers.LocalRanker(lim=lim)
tic = time.time()
v2 = v.rank(sample_data_reddit)
toc = time.time()
print(f"Local ranker rate: {lim/(toc-tic):.2f} posts/s ({toc-tic:.2f}s)")

[0;93m2024-06-15 04:43:49.289723593 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 04:43:49.289753530 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Local ranker rate: 269.88 posts/s (0.37s)


In [3]:
# glove, wsl, unbiased, onnx, XLM
%load_ext line_profiler

v = rankers.LocalRankerEN()

%lprun -u 1 -f v.rank v.rank(sample_data_reddit)

[0;93m2024-06-19 01:42:59.715269477 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-19 01:42:59.715300846 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Timer unit: 1 s

Total time: 6.01264 s
File: /home/joaopn/repo/civirank/civirank/rankers.py
Function: rank at line 125

Line #      Hits         Time  Per Hit   % Time  Line Contents
   126                                           
   127                                                   # Check if ranking_request is a RankingRequest object or a dictionary
   128         1          0.0      0.0      0.0          if isinstance(ranking_request, RankingRequest):
   129                                                       dataset = ranking_request.dict()
   130                                                   else:
   131         1          0.0      0.0      0.0              dataset = ranking_request
   132                                           
   133         1          0.0      0.0      0.0          platform = dataset["session"]["platform"]
   134                                                   
   135                                                   # Detect language of each p

In [5]:
# glove, wsl, unbiased, onnx
%load_ext line_profiler

v = rankers.LocalRanker()

%lprun -u 1 -f v.rank v.rank(sample_data_reddit)

[0;93m2024-06-15 04:42:07.286910769 [W:onnxruntime:, session_state.cc:1166 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2024-06-15 04:42:07.286938521 [W:onnxruntime:, session_state.cc:1168 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


Timer unit: 1 s

Total time: 0.568338 s
File: /home/joaopn/repo/civirank/civirank/rankers.py
Function: rank at line 38

Line #      Hits         Time  Per Hit   % Time  Line Contents
    38                                               def rank(self, dataset):
    39                                           
    40         1          0.0      0.0      0.0          rankings = {}
    41         1          0.0      0.0      0.0          all_posts = {}
    42                                           
    43         1          0.0      0.0      0.0          platform = dataset["session"]["platform"]
    44                                           
    45       201          0.0      0.0      0.0          for i in range(len(dataset["items"])):
    46       200          0.0      0.0      0.5              dataset['items'][i]['lang'] = self.LanguageAnalyzer.detect_language(dataset['items'][i]['text'].replace('\n', ' '))
    47                                           
    48         1        

In [3]:
# glove, wsl linux, unbiased
%load_ext line_profiler

v = rankers.LocalRanker()

%lprun -u 1 -f v.rank v.rank(sample_data_reddit)



Timer unit: 1 s

Total time: 0.842216 s
File: /home/joaopn/repo/civirank/civirank/rankers.py
Function: rank at line 39

Line #      Hits         Time  Per Hit   % Time  Line Contents
    39                                               def rank(self, dataset):
    40                                           
    41         1          0.0      0.0      0.0          rankings = {}
    42         1          0.0      0.0      0.0          all_posts = {}
    43                                           
    44         1          0.0      0.0      0.0          platform = dataset["session"]["platform"]
    45                                           
    46       201          0.0      0.0      0.0          for i in range(len(dataset["items"])):
    47       200          0.0      0.0      0.5              dataset['items'][i]['lang'] = self.LanguageAnalyzer.detect_language(dataset['items'][i]['text'].replace('\n', ' '))
    48                                           
    49         1        

In [7]:
# glove, wsl linux
%load_ext line_profiler

v = rankers.LocalRanker()

%lprun -u 1 -f v.rank v.rank(sample_data_reddit)

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler




Timer unit: 1 s

Total time: 0.594668 s
File: /home/joaopn/repo/civirank/civirank/rankers.py
Function: rank at line 39

Line #      Hits         Time  Per Hit   % Time  Line Contents
    39                                               def rank(self, dataset):
    40                                           
    41         1          0.0      0.0      0.0          rankings = {}
    42         1          0.0      0.0      0.0          all_posts = {}
    43                                           
    44         1          0.0      0.0      0.0          platform = dataset["session"]["platform"]
    45                                           
    46       201          0.0      0.0      0.0          for i in range(len(dataset["items"])):
    47       200          0.0      0.0      0.3              dataset['items'][i]['lang'] = self.LanguageAnalyzer.detect_language(dataset['items'][i]['text'].replace('\n', ' '))
    48                                           
    49         1        

In [5]:
# glove
%load_ext line_profiler

v = rankers.LocalRanker()

%lprun -u 1 -f v.rank v.rank(sample_data_reddit)

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler




Timer unit: 1 s

Total time: 1.19834 s
File: c:\Users\Joao\repo\civirank\civirank\rankers.py
Function: rank at line 39

Line #      Hits         Time  Per Hit   % Time  Line Contents
    39                                               def rank(self, dataset):
    40                                           
    41         1          0.0      0.0      0.0          rankings = {}
    42         1          0.0      0.0      0.0          all_posts = {}
    43                                           
    44         1          0.0      0.0      0.0          platform = dataset["session"]["platform"]
    45                                           
    46       201          0.0      0.0      0.0          for i in range(len(dataset["items"])):
    47       200          0.0      0.0      0.4              dataset['items'][i]['lang'] = self.LanguageAnalyzer.detect_language(dataset['items'][i]['text'].replace('\n', ' '))
    48                                           
    49         1        

In [3]:
#original-small
lim = 150
v = rankers.LocalRanker(lim=lim)
tic = time.time()
v2 = v.rank(sample_data_reddit)
toc = time.time()
print(f"Local ranker rate: {lim/(toc-tic):.2f} posts/s ({toc-tic:.2f}s)")



Local ranker rate: 110.40 posts/s (1.36s)


In [9]:
lim = 150
v = rankers.LocalRanker(lim=lim)
tic = time.time()
v2 = v.rank(sample_data_reddit)
toc = time.time()
print(f"Local ranker rate: {lim/(toc-tic):.2f} posts/s ({toc-tic:.2f}s)")



Local ranker rate: 103.23 posts/s (1.45s)


In [8]:
v = rankers.LocalRanker()
tic = time.time()
v2 = v.rank(sample_data_reddit)
toc = time.time()
print(f"Local ranker rate: {lim/(toc-tic):.2f} posts/s ({toc-tic:.2f}s)")



Local ranker rate: 29.06 posts/s (3.44s)


In [4]:
%load_ext line_profiler

v = rankers.LocalRanker()

%lprun -u 1 -f v.rank v.rank(sample_data_twitter)



Timer unit: 1 s

Total time: 0.843736 s
File: c:\Users\Joao\repo\civirank\civirank\rankers.py
Function: rank at line 36

Line #      Hits         Time  Per Hit   % Time  Line Contents
    36                                               def rank(self, dataset):
    37                                           
    38         1          0.0      0.0      0.0          rankings = {}
    39         1          0.0      0.0      0.0          all_posts = {}
    40                                           
    41         1          0.0      0.0      0.0          platform = dataset["session"]["platform"]
    42                                           
    43       201          0.0      0.0      0.0          for i in range(len(dataset["items"])):
    44       200          0.0      0.0      0.5              dataset['items'][i]['lang'] = self.LanguageAnalyzer.detect_language(dataset['items'][i]['text'].replace('\n', ' '))
    45                                           
    46         1       

In [3]:
%load_ext line_profiler

v = rankers.LocalRanker()

%lprun -u 1 -f v.rank v.rank(sample_data_twitter)


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Timer unit: 1 s

Total time: 1.95094 s
File: c:\Users\Joao\repo\civirank\civirank\rankers.py
Function: rank at line 36

Line #      Hits         Time  Per Hit   % Time  Line Contents
    36                                               def rank(self, dataset):
    37                                           
    38         1          0.0      0.0      0.0          rankings = {}
    39         1          0.0      0.0      0.0          all_posts = {}
    40                                           
    41         1          0.0      0.0      0.0          platform = dataset["session"]["platform"]
    42                                           
    43       201          0.0      0.0      0.0          for i in range(len(dataset["items"])):
    44       200          0.0      0.0      0.2              dataset['items'][i]['lang'] = self.LanguageAnalyzer.detect_language(dataset['items'][i]['text'].replace('\n', ' '))
    45                                           
    46         1        

In [6]:
v = rankers.LocalRanker()
tic = time.time()
v2 = v.rank(sample_data_twitter)
toc = time.time()
print(f"Local ranker rate: {len(sample_data_twitter['items'])/(toc-tic):.2f} posts/s ({toc-tic:.2f}s)")



Local ranker rate: 164.85 posts/s (1.21s)


In [9]:
%load_ext line_profiler

v = rankers.LocalRanker()

%lprun -u 1 -f v.rank v.rank(sample_data_twitter)




Timer unit: 1 s

Total time: 5.27185 s
File: c:\Users\Joao\repo\civirank\civirank\rankers.py
Function: rank at line 35

Line #      Hits         Time  Per Hit   % Time  Line Contents
    35                                               def rank(self, dataset):
    36                                           
    37         1          0.0      0.0      0.0          rankings = {}
    38         1          0.0      0.0      0.0          all_posts = {}
    39                                           
    40         1          0.0      0.0      0.0          platform = dataset["session"]["platform"]
    41                                           
    42         1          0.0      0.0      0.0          if platform == "twitter":
    43         1          3.8      3.8     72.2              posts = parsers.parse_twitter_posts(dataset["items"])
    44                                                   elif platform == "reddit":
    45                                                       post

In [10]:
v2[1]['reddit']['trustworthiness'].value_counts()

Series([], Name: count, dtype: int64)

In [11]:
tic = time.time()
parsers.parse_reddit_posts(sample_data_reddit["items"])
toc = time.time()
print(f"parse rate: {lim/(toc-tic):.2f} posts/s ({toc-tic:.2f}s)")

platform = sample_data_reddit["session"]["platform"]
tic = time.time()
parse_funcs = {
    "twitter":parsers.parse_twitter_posts,
    "reddit":parsers.parse_reddit_posts,
    "facebook":parsers.parse_facebook_posts
}       

posts = parse_funcs[platform](sample_data_reddit["items"])
toc = time.time()
print(f"parse rate: {lim/(toc-tic):.2f} posts/s ({toc-tic:.2f}s)")

parse rate: 132.75 posts/s (0.75s)
parse rate: 133.55 posts/s (0.75s)


In [12]:
sample_data_reddit["items"]

[{'id': '07735a67be89638837a84f9873878d3a846e55850dc15e01a5ec5af15f6e42b8',
  'post_id': '',
  'parent_id': '',
  'title': 'Stand menu name spoofing',
  'text': "I've been spoofing my name and it isn't working bc people keep calling me by my real username in chat even after turning it on and changing servers. is stand name spoofing broken or is it a problem on my end?",
  'author_name_hash': 'aa7b62dbc9d9f32e8d8ec8ee89b19e41edcafe7312df4872375532f6419fd9ef',
  'type': 'post',
  'embedded_urls': [],
  'created_at': '1970-01-01T00:00:01',
  'engagements': {'upvote': 1, 'downvote': 0, 'comment': 0, 'award': 0},
  'lang': 'en'},
 {'id': '06be9c690a05b0c1372cc76d978a3e8e640d0be1df73424f2cdb096560cbfbef',
  'post_id': '',
  'parent_id': '',
  'title': 'PunchedByGoliath',
  'text': '',
  'author_name_hash': 'acfacec3871b19b75a60d5cbcf236e77f9cc962c4f4b7bdc17339629616d2330',
  'type': 'post',
  'embedded_urls': [],
  'created_at': '1970-01-01T00:00:01',
  'engagements': {'upvote': 1, 'downvote

In [13]:
%lprun -u 1 -f parsers.parse_reddit_posts parsers.parse_reddit_posts(sample_data_reddit["items"])


Timer unit: 1 s

Total time: 5.24615 s
File: c:\Users\Joao\repo\civirank\civirank\parsers.py
Function: parse_reddit_posts at line 86

Line #      Hits         Time  Per Hit   % Time  Line Contents
    86                                           def parse_reddit_posts(posts_json, lim=False, debug=False):
    87         1          0.0      0.0      0.0      if lim:
    88                                                   posts_json = posts_json[0:lim]
    89                                                   
    90         1          0.0      0.0      0.0      IDs = [post.get("id") for post in posts_json]
    91         1          0.0      0.0      0.0      texts = [combine_reddit_text(post.get("title"), post.get("text")) for post in posts_json]
    92         1          0.0      0.0      0.0      url_lists = [extract_urls(text) for text in texts]
    93         1          0.0      0.0      0.0      domain_lists = [extract_domains(url_list) for url_list in url_lists]
    94             

In [14]:
%lprun -u 1 -f parsers.parse_twitter_posts parsers.parse_twitter_posts(sample_data_twitter["items"])


Timer unit: 1 s

Total time: 3.80512 s
File: c:\Users\Joao\repo\civirank\civirank\parsers.py
Function: parse_twitter_posts at line 36

Line #      Hits         Time  Per Hit   % Time  Line Contents
    36                                           def parse_twitter_posts(posts_json, lim=False, debug=False):
    37         1          0.0      0.0      0.0      if lim:
    38                                                   posts_json = posts_json[0:lim]
    39         1          0.0      0.0      0.0      IDs = [post.get("id") for post in posts_json]
    40         1          0.0      0.0      0.0      texts = [post.get("text") for post in posts_json]
    41                                           
    42                                               # the current data format seems to include a maximum of one "expanded_url",
    43                                               # therefore no need to deal with lists of urls/domains in posts
    44                                       

In [15]:
%lprun -u 1 -f parsers.parse_facebook_posts parsers.parse_facebook_posts(sample_data_facebook["items"])


Timer unit: 1 s

Total time: 3.00347 s
File: c:\Users\Joao\repo\civirank\civirank\parsers.py
Function: parse_facebook_posts at line 113

Line #      Hits         Time  Per Hit   % Time  Line Contents
   113                                           def parse_facebook_posts(posts_json, lim=False, debug=False):
   114         1          0.0      0.0      0.0      if lim:
   115                                                   posts_json = posts_json[0:lim]
   116                                                   
   117         1          0.0      0.0      0.0      IDs = [post.get("id") for post in posts_json]
   118         1          0.0      0.0      0.0      texts = [post.get("text") for post in posts_json]
   119         1          0.0      0.0      0.0      url_lists = [extract_urls(text) for text in texts]
   120         1          0.0      0.0      0.0      domain_lists = [extract_domains(url_list) for url_list in url_lists]
   121                                               
