# Script to get Twitter friends

## Special requirements for the environment

In [None]:
# Version tag of the data set; later cells use it to build the input and
# output paths.
version_id = 'V14'

# Runtime selector: later cells compare this against 'local' and 'colab' to
# decide which imports to run and which base path to use.
run_in = 'local'
#run_in = 'colab'

# Install requirements via pip etc.

## Import statements

In [None]:
if run_in == 'local':
  import csv
  import os
  import pandas as pd
  import numpy as np
  import tweepy
  import time
  import json
  import sys
  import datetime
  import traceback
  print('Local runtime has packages')
  
if run_in == 'colab':
  print('Downloading packages has packages')
  !pip install tweepy
  !pip install xlrd
  !pip install XlsxWriter
  import csv
  import os
  import pandas as pd
  import numpy as np
  import tweepy
  import time
  import json
  import sys
  import datetime
  import traceback

# Get access to *your personal* GDrive

## Define your personal path to the folder we will work in

We expect two things on your *personal* GDrive:
1. We expect a csv file with startup IDs
2. We expect an output folder in which we can dump the results and the log files

In [None]:
# Resolve the working folder for the selected runtime. Later cells build
# every input and output path by appending to base_path, so it has to end
# with a slash.
if run_in == 'local':
  base_path = "/users/USERNAME/Google Drive/USERNAME/"
elif run_in == 'colab':
  base_path = "/content/drive/My Drive/USERNAME/"
  # On Colab the drive has to be mounted before the folder can be reached.
  from google.colab import drive
  drive.mount('/content/drive')
else:
  # Fail right here with a clear message instead of a NameError on
  # base_path in the print below.
  raise ValueError("run_in must be 'local' or 'colab', got: " + repr(run_in))

print('base_path set to:')
print(base_path)

### Check that we can access the GDrive and your personal folder

Make sure that we can see the two things:
1. A file called startups.csv with one column that lists all the twitter handles
2. A folder called 'outputs' in which this script will store all results and log files

In [None]:
# Show what is inside the working folder to confirm that it is reachable.
os.listdir(base_path)

In [None]:
# Show the raw-data folder; the versioned input folder read later sits below it.
os.listdir(base_path + 'Pitchbook_Crunchbase_Tracxn_Raw')

In [None]:
# Show the version-specific input folder; the Excel workbook loaded later is
# read from this location.
os.listdir(base_path + "Pitchbook_Crunchbase_Tracxn_Raw/Don't touch " + version_id)

In [None]:
# Folder that holds one sub-folder per Twitter handle.
# The components are passed to os.path.join separately so the separators are
# inserted by the library; the empty last component keeps the trailing
# separator, which later cells rely on when they build paths with plain
# string concatenation (output_path + handle).
output_path = os.path.join(base_path, 'outputs', version_id, 'friends', 'id_lists', '')
os.listdir(output_path)

## Get twitter Friends

### Authentication and generate the Tweepy API

In [None]:
# Documentation: http://docs.tweepy.org/en/v3.5.0/api.html

# The four credentials are taken from environment variables when those are
# set, so real keys do not have to be saved inside the notebook. The
# placeholder strings stay as the fallback and can still be replaced by hand.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'consumer_key') # (API key)
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'consumer_secret') # (API secret key)

access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'access_token') # (Access token)
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'access_token_secret') # (Access token secret)

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

auth.set_access_token(access_token, access_token_secret)

# Construct the API instance
# Wait upon reaching rate limit
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify=True, compression=True)

### Read all start-ups

In [None]:
# Load two columns from the versioned workbook: spreadsheet column A becomes
# 'company_name' and column O becomes 'twitter_handle'. The sheet's own
# header row is skipped (skiprows=1) and replaced by these names.
#
# The former encoding="ISO-8859-1" argument is gone: read_excel has no such
# parameter and current pandas versions reject it with a TypeError. The
# sheet name is passed by keyword so its meaning is explicit.
input_workbook = base_path + "Pitchbook_Crunchbase_Tracxn_Raw/Don't touch " + version_id + '/' + version_id + '.xlsx'
df = pd.read_excel(
    input_workbook,
    sheet_name=version_id + 'twitter_handle_scrape',
    header=None,
    skiprows=1,
    # NOTE(review): 'uid' is not one of the two column names below, so this
    # dtype entry looks like it has no effect - confirm before removing.
    dtype={'uid': str},
    names=['company_name', 'twitter_handle'],
    usecols="A,O",
)
df.head(10)

In [None]:
# (rows, columns) of the data loaded from the workbook.
df.shape

In [None]:
# Spot check: pull out the row(s) that belong to one specific handle.
is_target_handle = df['twitter_handle'].eq("aitrading_com")
find_rows = df.loc[is_target_handle]
find_rows

### Clean start-ups without Twitter handles

Filter rows so that "NOTAPPLICABLE" is not in the data

In [None]:
# Keep every row whose handle is something other than the "NOTAPPLICABLE" marker.
is_applicable = df['twitter_handle'].ne("NOTAPPLICABLE")
useful_rows = df.loc[is_applicable]

useful_rows.head(5)

In [None]:
# (rows, columns) that remain after the "NOTAPPLICABLE" filter.
useful_rows.shape

In [None]:
# Check if we have any duplicate handles:
# value_counts() lists how often each handle occurs, most frequent first,
# so any count above 1 at the top of the output is a duplicate.

useful_rows['twitter_handle'].value_counts()

In [None]:
# Remove rows that have no Twitter handle.
# dropna() returns a new DataFrame and leaves the original untouched, so the
# result has to be assigned back for the call to have any effect. Only the
# handle column is checked, so rows with gaps in other columns are kept.

useful_rows = useful_rows.dropna(subset=['twitter_handle'])
useful_rows[:10]

In [None]:
# (rows, columns) of useful_rows at this point in the notebook.
useful_rows.shape

### Extract the remaining Twitter handles

In [None]:
# Pull the handle column out of the DataFrame as a plain Python list
all_handle_values = useful_rows['twitter_handle'].tolist()

# Remove nan values: NaN is not equal to itself, so the comparison drops it
list_of_handles_from_csv = [
    handle
    for handle in all_handle_values
    if handle == handle
]

print(f"We have this many handles: {len(list_of_handles_from_csv)}")
print(list_of_handles_from_csv)

## Instead of looping through all handles, we only take those where we do not have complete results yet

* Do we have a folder already? 
  * If not, add handle to list.
  * If yes, do we have all 4 files we need? If not, add handle to list.

In [None]:
# Todo list of handles that still have to be downloaded; filled further down.
list_of_handles_we_still_need_to_process = []

# Convert list of handles from CSV into a set
# (drops duplicate handles and allows the set difference used further down)
set_of_handles_from_csv = set(list_of_handles_from_csv)
print(len(set_of_handles_from_csv))

#### Check folder names against the set of handles

In [None]:
# Names of all sub-folders of output_path; plain files are left out.
list_of_folders = [
    entry_name
    for entry_name in os.listdir(output_path)
    if os.path.isdir(os.path.join(output_path, entry_name))
]
print(list_of_folders)

In [None]:
# Turn the folder names into a set so they can be compared with the set of handles.
# Any handle that does not have a folder yet needs to be processed in any case.

set_of_folder_names_we_have_already = set(list_of_folders)
print(set_of_folder_names_we_have_already)

In [None]:
# Report how many handle folders already exist.
print(f'We have {len(set_of_folder_names_we_have_already)} folders stored already.')

In [None]:
# Handles from the input that do not have an output folder yet.
set_of_handles_without_a_folder = set_of_handles_from_csv.difference(
    set_of_folder_names_we_have_already
)

print(set_of_handles_without_a_folder)
print(f'We do not have folders for {len(set_of_handles_without_a_folder)} handles.')

In [None]:
# Start the todo list with every handle that has no folder yet.
# The assignment replaces whatever the list held before.
list_of_handles_we_still_need_to_process = list(set_of_handles_without_a_folder)

In [None]:
# Show the todo list as it stands now.
print(list_of_handles_we_still_need_to_process)

In [None]:
# Put every handle back on the todo list whose folder exists but contains no
# files directly inside it, and count how many were added.
counter = 0

for folder_name in set_of_folder_names_we_have_already:

  # Only the files directly inside the handle's folder matter, so just the
  # first (root, dirs, files) entry is taken instead of walking the whole
  # tree. The default covers a folder that can no longer be read, which
  # previously ended in an IndexError; such a folder counts as empty.
  files_in_folder = next(os.walk(output_path + str(folder_name)), (None, [], []))[2]

  if not files_in_folder:
    list_of_handles_we_still_need_to_process.append(folder_name)
    counter += 1

print('We added ' + str(counter) + ' additional handles where the download was incomplete.')

### Loop through Twitter handles and obtain the Friends IDs


* Sometimes we want to include lists.
 * Which can be indented.

1. Take one start-up Twitter handle and determine the Friends IDs

In [None]:
# Put the todo list into sorted order so that runs go through it the same way.
list_of_handles_we_still_need_to_process = sorted(list_of_handles_we_still_need_to_process)
print(list_of_handles_we_still_need_to_process)

In [None]:
# For every handle on the todo list: look the account up, download the ids of
# all accounts it follows ("friends") and store them as JSON in the handle's
# own folder. When anything fails for a handle, an _ERROR.txt marker is
# written into that folder and the loop continues with the next handle.
total_handles = len(list_of_handles_we_still_need_to_process)

# The whole list is processed; the former [1:] slice silently skipped the
# first handle on every run. enumerate() supplies the position directly,
# which avoids searching the list with .index() on each iteration and also
# reports the right position when a handle occurs more than once.
for position, twitter_id in enumerate(list_of_handles_we_still_need_to_process):
  print(twitter_id)

  result = {
      "twitter_handle": twitter_id
  }

  # Create a folder to store this twitter handle's results.
  # This happens before the first API call so that the error marker can be
  # written even when the account lookup itself fails.
  path_to_folder_for_this_handle = output_path + str(twitter_id) + '/'
  print(path_to_folder_for_this_handle)
  os.makedirs(path_to_folder_for_this_handle, exist_ok=True)

  ########################################################################
  # Get friends_ids, i.e. the ids of people that are being followed
  ########################################################################

  friends_ids = []
  page_counter_friends = 0

  try:

    # The account lookup sits inside the try block: a handle the API cannot
    # resolve would otherwise abort the run for all remaining handles.
    start_up = api.get_user(twitter_id)
    print("-------------" + str(position) + "/" + str(total_handles) + "---------------")
    print('Start-up: ' + str(twitter_id))
    print('Start-up_ID: ' + str(start_up.id))
    print('Start-up_screen_name: ' + str(start_up.screen_name))
    print('Start-up_description: ' + str(start_up.description))
    print('Start-up_url: ' + str(start_up.url))
    print('Start-up_created_at: ' + str(start_up.created_at))

    # We can use screen_name (screen_name = ) or id as a parameter (id = 'McDonalds')
    # friends_ids has a max count per page of 5000
    for page in tweepy.Cursor(api.friends_ids, id = twitter_id, count = 5000).pages():
      friends_ids.extend(page)
      print('This is page ' + str(page_counter_friends))
      page_counter_friends += 1
      # Short pause between page requests
      time.sleep(2)

    print("Number of ids in the list: " + str(len(friends_ids)))
    print("Friends IDs:")
    print(friends_ids)
    print(datetime.datetime.now())

    result['friends_ids'] = friends_ids

    with open(path_to_folder_for_this_handle + str(twitter_id)+'_friends_ids.txt', 'w') as outfile:
      json.dump(result, outfile)


  except tweepy.TweepError as error:
    # Print the exception that was actually raised, not the exception class.
    print("tweepy.TweepError=", error)
    print("This error occurred for id " + str(twitter_id) + " and page " + str(page_counter_friends))
    # checker is not read anywhere in this cell; it is still set in case
    # another cell relies on it.
    checker=True
    traceback.print_exc()

    with open(path_to_folder_for_this_handle + str(twitter_id)+'_ERROR.txt', 'w') as outfile:
      json.dump('error', outfile)

  except Exception as error:
    # Exception instead of a bare except, so that interrupting the kernel
    # (KeyboardInterrupt) still stops this long-running loop.
    print("Error: %s" % repr(error))
    checker=True
    traceback.print_exc()
    with open(path_to_folder_for_this_handle + str(twitter_id)+'_ERROR.txt', 'w') as outfile:
      json.dump('error', outfile)