# **CT-NASA** (**C**rowd**T**angle-**N**ew **A**ctor **S**earching **A**lgorithm)


**Pypi:** https://pypi.org/project/PyCrowdTangle/

**Github:** https://github.com/UPB-SS1/PyCrowdTangle

## Install PyCrowdTangle and import libraries

In [None]:
!pip install PyCrowdTangle -q

Import Libraries

In [None]:
import PyCrowdTangle as pct
import pandas as pd

In [None]:
dir(pct)

['PyCrowdTangle',
 '__author__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'ct_get_links',
 'ct_get_lists',
 'ct_get_posts']

In [None]:
# get version
pct.__version__

'0.5.0'

In [None]:
# get the api_token from https://apps.crowdtangle.com/
# you can locate your API token via your CrowdTangle dashboard
# under Settings > API Access.
# NOTE: the value below is only a placeholder; avoid sharing or committing
# a copy of this notebook that contains a real token.
token="XYZZZZZZYYYYYYXXXXXXUUUUUWWWWW" #put your token here

## Load CrowdTangle dataset 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
os.chdir("/content/drive/MyDrive/YOUR_DIRECTORY_PATH")

In [None]:
import pandas as pd
import time

In [None]:
# put the data file name below
# The file is read as ';'-separated text with '\n' line endings.
# Malformed rows are skipped with a warning: `on_bad_lines='warn'` is the
# replacement for `error_bad_lines=False`, which was deprecated in pandas 1.3
# and removed in pandas 2.0 (where it raises a TypeError).
csv_data = pd.read_csv("data_file.csv", low_memory=False, lineterminator='\n', sep=';', on_bad_lines='warn')

In [None]:
csv_data.shape

In [None]:
csv_data.columns

Index(['account.name', 'account.handle', 'platformId', 'Page Category',
       'Page Admin Top Country', 'Page Description', 'Page Created',
       'subscriberCount', 'Followers at Posting', 'date', 'Post Created Date',
       'Post Created Time', 'type', 'totalInteraction',
       'statistics.actual.likeCount', 'statistics.actual.commentCount',
       'statistics.actual.shareCount', 'statistics.actual.loveCount',
       'statistics.actual.wowCount', 'statistics.actual.hahaCount',
       'statistics.actual.sadCount', 'statistics.actual.angryCount',
       'statistics.actual.careCount', 'Video Share Status', 'Is Video Owner?',
       'statistics.actual.videoPostViewCount',
       'statistics.actual.videoTotalViewCount',
       'statistics.actual.videoAllCrosspostsViewCount', 'Video Length',
       'postUrl', 'message', 'expandedLinks.original',
       'expandedLinks.expanded', 'imageText', 'title', 'description',
       'brandedContentSponsor.platformId', 'brandedContentSponsor.name',
 

## Statistics of the dataset

In [None]:
# collect the distinct, non-null account names of the dataset and list them
actor_names = csv_data['account.name'].dropna()
actors_list = actor_names.unique()
print ("Total number of unique actors within the dataset:", actors_list.size)

# one actor name per line
for actor_name in actors_list:
  print (actor_name)

In [None]:
# top links 

csv_data ['expandedLinks.original'].value_counts()

http://www.akhandbharatimes.com/                                                                   30
https://www.facebook.com/hanumansinghsirana/videos/1951535514949751/                               28
https://www.facebook.com/pushpendrakuldelhi001/videos/332266691549294/                             15
https://sachkhabar.co.in/now-biden-wants-modis-help-immediately-only-india-can-save-the-world/     15
https://sachkhabar.co.in/modi-governments-big-blow-to-zakir-naik/                                  15
                                                                                                   ..
https://www.facebook.com/KapilMishraFans/photos/a.184870732871889/477180080307618/?type=3           1
https://www.facebook.com/unitehindumovement/photos/a.115062400186484/278440053848717/?type=3        1
https://www.facebook.com/hindurastrabharat1010/photos/a.103210884669597/261357038854980/?type=3     1
https://www.facebook.com/photo.php?fbid=1169882913483961&set=p.1169882913483961&ty

In [None]:
# top N links: how many of the most frequently shared links to keep
N=8

# value_counts() orders the links by descending frequency,
# so the first N index entries are the N most shared links
link_counts = csv_data ['expandedLinks.original'].dropna().value_counts()
URL_list = link_counts.iloc[:N].index.tolist()

import numpy as np  # kept imported here for the cells that follow
for top_url in URL_list:
  print (top_url)

http://www.akhandbharatimes.com/
https://www.facebook.com/hanumansinghsirana/videos/1951535514949751/
https://www.facebook.com/pushpendrakuldelhi001/videos/332266691549294/
https://sachkhabar.co.in/now-biden-wants-modis-help-immediately-only-india-can-save-the-world/
https://sachkhabar.co.in/modi-governments-big-blow-to-zakir-naik/
https://appearnews.com/ambesy/
https://khabarbharattak.com/due-to-this-big-belief-mukesh-ambani-got-200-years-old-olive-trees-for-his-house-know-why/
https://khabarbharattak.com/rohingya-go-back-to-myanmar/


## Use the *ct_get_links* function to retrieve a set of posts matching a certain link

In [None]:
print(pct.ct_get_links.__doc__)

 Retrieve a set of posts matching a certain link.

    Args:
        link (str): The link to query by. Required.
        platforms (str, optional): The platforms from which to retrieve links. This value can be comma-separated.
                                   options: facebook, instagram, reddit. Defaults to 'facebook'.
        count (int, optional): The number of posts to return. Defaults to 100. options [1-100]
        start_date (str, optional): The earliest date at which a post could be posted. Time zone is UTC. 
                                    Format is “yyyy-mm-ddThh:mm:ss” or “yyyy-mm-dd” 
                                    (defaults to time 00:00:00).
        end_date (str, optional):  The latest date at which a post could be posted.
                                  Time zone is UTC. Format is “yyyy-mm-ddThh:mm:ss”
                                  or “yyyy-mm-dd” (defaults to time 00:00:00).
                                  Defaults to "now".
        i

In [None]:
# function to get all posts (and therefore the accounts behind them) that are associated with a link

def get_all_posts (URL, start_date, api_token):
  """Query CrowdTangle for the Facebook posts matching a link.

  Args:
    URL: the link to query by.
    start_date: earliest post date, forwarded unchanged to `pct.ct_get_links`.
    api_token: CrowdTangle API token.

  Returns:
    A DataFrame built from the `result -> posts` list of the response.
    An empty DataFrame is returned when the response holds no posts.
  """
  # `('facebook')` in parentheses is just the string 'facebook', so pass it as such.
  data = pct.ct_get_links(link=URL, include_history='true', platforms='facebook', start_date=start_date, api_token=api_token)

  # NOTE(review): a failed request (rate limit, bad token, ...) presumably comes
  # back without a 'result' entry - confirm against PyCrowdTangle. Guarding here
  # avoids an opaque KeyError and lets the remaining links still be processed.
  result = data.get('result') if isinstance(data, dict) else None
  posts = result.get('posts') if isinstance(result, dict) else None
  if not posts:
    print ("No posts returned for link:", URL)
    return pd.DataFrame()

  return pd.DataFrame(posts)

In [None]:
# function to extract selected post fields and the account details from the posts dataframe

def get_dict (df, output_df):
  """Append one flattened row per post in `df` to `output_df`.

  Every new row holds the post's 'date', 'updated', 'message', 'link' and
  'postUrl' values followed by all keys of the post's 'account' dictionary.

  Args:
    df: DataFrame of posts, one post per row; may be empty.
    output_df: DataFrame accumulated so far; it is not modified in place.

  Returns:
    A new DataFrame containing the rows of `output_df` followed by the
    extracted rows, re-indexed from 0. When `df` has no rows, `output_df`
    itself is returned.
  """
  post_fields = ('date', 'updated', 'message', 'link', 'postUrl')
  missing = float('nan')

  records = []
  # iterrows() walks the rows positionally, so a non-default index on `df` is fine
  for _, post in df.iterrows():
    # .get() tolerates a field that is absent from every post (e.g. no 'message' at all)
    record = {field: post.get(field, missing) for field in post_fields}
    account = post.get('account')
    if isinstance(account, dict):
      # account keys are merged last, as in the original dictionary merge
      record.update(account)
    records.append(record)

  if not records:
    return output_df

  # a single concat for the whole batch instead of one concat per row
  return pd.concat ([output_df, pd.DataFrame(records)], ignore_index=True)

In [None]:
# earliest post date requested for every link
start_date = '2019-01-01'

# accumulator for the flattened posts of all links
output_df = pd.DataFrame()

last_position = len(URL_list) - 1
for i, url in enumerate(URL_list):
  df = get_all_posts (str(url), start_date, token)
  output_df = get_dict (df, output_df)

  # wait between consecutive requests; no wait is needed after the final one
  # (presumably the 31 s pause keeps the calls under the API rate limit - confirm)
  if i != last_position:
    time.sleep (31)

print (output_df)

                    date              updated  \
0    2019-01-16 10:59:44  2021-11-22 09:50:21   
1    2019-01-16 10:45:19  2021-11-22 09:50:21   
2    2019-01-16 10:44:28  2021-11-22 09:50:21   
3    2022-03-13 11:24:58  2022-03-20 19:03:11   
4    2022-03-10 09:32:46  2022-04-02 11:09:46   
..                   ...                  ...   
495  2021-06-30 07:52:44  2021-09-23 16:58:13   
496  2021-06-30 07:52:37  2021-12-05 02:17:31   
497  2021-06-30 07:52:30  2021-12-03 11:42:29   
498  2021-06-30 07:52:20  2022-03-01 12:43:14   
499  2021-06-30 07:43:15  2021-09-20 09:54:13   

                                               message  \
0    ‡§Ü‡§≤‡•ã‡§ï ‡§µ‡§∞‡•ç‡§Æ‡§æ ‡§™‡§∞ NSA ‡§Ö‡§ú‡•Ä‡§§ ‡§°‡•ã‡§≠‡§æ‡§≤ ‡§ï‡§æ ‡§´‡•ã‡§® ‡§ü‡•á‡§™ ‡§ï‡§∞‡§µ‡§æ‡§®‡•á...   
1    ‡§ó‡§ø‡§®‡§§‡•Ä ‡§ï‡•á ‡§≤‡§ø‡§è ‡§ï‡§∞‡•ç‡§®‡§æ‡§ü‡§ï ‡§ï‡§æ‡§Ç‡§ó‡•ç‡§∞‡•á‡§∏ ‡§®‡•á ‡§¨‡•Å‡§≤‡§æ‡§à ‡§¨‡•à‡§†‡§ï , ...   
2    ‡§∂‡•Ä‡§≤‡§æ ‡§¶‡•Ä‡§ï‡•ç‡§∑‡§ø‡§§ ‡§ï‡•Ä ‡§§‡§æ‡§ú‡§™‡•ã‡§∂‡•Ä ‡§Æ‡•á‡§Ç ‡§™‡§π‡•Å‡§Ç‡§ö‡•á

In [None]:
# keep only the rows whose account name does not appear in the original actors list
known_actor_mask = output_df['name'].isin(actors_list)
output_new_actors = output_df[~known_actor_mask]
print (output_new_actors)

                    date              updated  \
3    2022-03-13 11:24:58  2022-03-20 19:03:11   
4    2022-03-10 09:32:46  2022-04-02 11:09:46   
5    2022-03-10 09:32:45  2022-04-02 11:04:14   
6    2022-03-10 09:32:44  2022-03-23 13:39:52   
7    2022-03-10 09:32:43  2022-03-16 04:42:39   
..                   ...                  ...   
494  2021-06-30 07:52:51  2021-09-19 03:14:52   
496  2021-06-30 07:52:37  2021-12-05 02:17:31   
497  2021-06-30 07:52:30  2021-12-03 11:42:29   
498  2021-06-30 07:52:20  2022-03-01 12:43:14   
499  2021-06-30 07:43:15  2021-09-20 09:54:13   

                                               message  \
3                                                  NaN   
4                                                  NaN   
5                                                  NaN   
6                                                  NaN   
7                                                  NaN   
..                                                 ...   
494  

In [None]:
# save the full new actors dataframe (all rows, before duplicates are dropped) in a csv file 
output_new_actors.to_csv('/content/drive/MyDrive/PATH_TO_YOUR_DIRECTORY/new_actors_full_dataset.csv') 

In [None]:
# keep only the first row for each distinct 'url' value and renumber the index from 0
output_new_actors_unique = output_new_actors.drop_duplicates(subset=["url"], keep='first', ignore_index=True)
print (output_new_actors_unique)

                    date              updated  \
0    2022-03-13 11:24:58  2022-03-20 19:03:11   
1    2022-03-10 09:32:46  2022-04-02 11:09:46   
2    2022-03-10 09:32:45  2022-04-02 11:04:14   
3    2022-03-10 09:32:44  2022-03-23 13:39:52   
4    2022-03-10 09:32:43  2022-03-16 04:42:39   
..                   ...                  ...   
174  2021-07-01 05:34:50  2022-03-23 08:29:11   
175  2021-07-01 05:09:27  2022-03-11 15:07:04   
176  2021-06-30 11:35:28  2022-01-24 09:39:26   
177  2021-06-30 09:53:24  2022-03-13 21:55:55   
178  2021-06-30 08:20:36  2021-11-28 19:47:18   

                                               message  \
0                                                  NaN   
1                                                  NaN   
2                                                  NaN   
3                                                  NaN   
4                                                  NaN   
..                                                 ...   
174  

In [None]:
# save the de-duplicated new actors dataframe in a csv file 
output_new_actors_unique.to_csv('/content/drive/MyDrive/PATH_TO_YOUR_DIRECTORY/new_actors.csv') 