# Twitter Scraper

In [1]:
import re
import csv
from getpass import getpass
from time import sleep
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome

def get_tweet_data(card):
    """Extract data from tweet card"""
    username = card.find_element_by_xpath('.//span').text
    try:
        handle = card.find_element_by_xpath('.//span[contains(text(), "@")]').text
    except NoSuchElementException:
        return
    
    try:
        postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')
    except NoSuchElementException:
        return
    
    comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text
    responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text
    text = comment + responding
    reply_cnt = card.find_element_by_xpath('.//div[@data-testid="reply"]').text
    retweet_cnt = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text
    like_cnt = card.find_element_by_xpath('.//div[@data-testid="like"]').text
    
    # get a string of all emojis contained in the tweet
    """Emojis are stored as images... so I convert the filename, which is stored as unicode, into 
    the emoji character."""
    emoji_tags = card.find_elements_by_xpath('.//img[contains(@src, "emoji")]')
    emoji_list = []
    for tag in emoji_tags:
        filename = tag.get_attribute('src')
        try:
            emoji = chr(int(re.search(r'svg\/([a-z0-9]+)\.svg', filename).group(1), base=16))
        except AttributeError:
            continue
        if emoji:
            emoji_list.append(emoji)
    emojis = ' '.join(emoji_list)
    
    tweet = (username, handle, postdate, text, emojis, reply_cnt, retweet_cnt, like_cnt)
    return tweet    

In [None]:
#원격의료 until:2021-07-18 since:2019-11-19
#uY9xKrnUSt4FAac

# application variables
user = input('username: ')
my_password = getpass('Password: ')
search_term = input('search term: ')

# create instance of web driver
driver = Chrome('/Users/monica_air/chromedriver')

# navigate to login screen
driver.get('https://www.twitter.com/login')
driver.maximize_window()

username = driver.find_element_by_xpath('//input[@name="session[username_or_email]"]')
username.send_keys(username)

password = driver.find_element_by_xpath('//input[@name="session[password]"]')
password.send_keys(password)
password.send_keys(Keys.RETURN)
sleep(1)

# find search input and search for term
search_input = driver.find_element_by_xpath('//input[@aria-label="Search query"]')
search_input.send_keys(search_term)
search_input.send_keys(Keys.RETURN)
sleep(1)

# navigate to historical 'latest' tab
driver.find_element_by_link_text('Latest').click()

In [2]:
driver = Chrome('/Users/monica_air/chromedriver')
driver.get('https://twitter.com/search?f=live&q=%EC%9B%90%EA%B2%A9%EC%9D%98%EB%A3%8C%20until%3A2021-07-18%20since%3A2021-07-01&src=typed_query')
# get all tweets on the page
data = []
tweet_ids = set()
last_position = driver.execute_script("return window.pageYOffset;")
scrolling = True

while scrolling:
    page_cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]')
    for card in page_cards[-15:]:
        tweet = get_tweet_data(card)
        if tweet:
            tweet_id = ''.join(tweet)
            if tweet_id not in tweet_ids:
                tweet_ids.add(tweet_id)
                data.append(tweet)
            
    scroll_attempt = 0
    while True:
        # check scroll position
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(1)
        curr_position = driver.execute_script("return window.pageYOffset;")
        if last_position == curr_position:
            scroll_attempt += 1
            
            # end of scroll region
            if scroll_attempt >= 3:
                scrolling = False
                break
            else:
                sleep(2) # attempt another scroll
        else:
            last_position = curr_position
            break

# close the web driver
driver.close()

In [3]:
len(data)

19

## Saving the tweet data

In [24]:
with open('0701_0718_telemedicine.csv', 'w', newline='', encoding='utf-8') as f:
    header = ['UserName', 'Handle', 'Timestamp', 'Text', 'Emojis', 'Comments', 'Likes', 'Retweets']
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(data)

In [9]:
import pandas as pd

In [10]:
data = pd.read_csv('0701_0718_telemedicine.csv')

In [11]:
data.Text[0]

'원격의료 확대 시급한데 시간을 조율 하다니~~~한심!\n전국이 7,8,9월에 엄청난 확진자가 발생하면 어찌 할건가?\n보건소는 코로나와 관계없이 바이러스 수시 검사소가 필요한 시대라는걸 인식 했으면 한다.\n원격진료 그동안 얼마나 준비했나? 하루라도 빨리 전면 시행 해라.  답답!!!'

In [12]:
link =[]
tw_link = data.Text.tolist()
for i in tw_link:
    if 'http' in i:
        i = i.split('http')[-1]
        i = 'http'+ i
        link.append(i)
    else:
        i = '링크없음'
        link.append(i)
data['link'] = link

data.to_csv('final_result.csv')

In [14]:
final_df = pd.read_csv('final_result.csv', index_col=0)
final_df.head()

Unnamed: 0,UserName,Handle,Timestamp,Text,Emojis,Comments,Likes,Retweets,link
0,윤상운,@pSo0XuO8NTfDQrZ,2021-07-10T00:43:59.000Z,"원격의료 확대 시급한데 시간을 조율 하다니~~~한심!\n전국이 7,8,9월에 엄청난...",,,,,링크없음
1,#한미FTA폐기 #민주사회복원,@myid01,2021-07-10T00:14:09.000Z,"삼성, SK 같은 굴지의 재벌들이 원격의료단말기와 프로그램 개발을 완료했고, 이들 ...",,,1.0,,링크없음
2,#한미FTA폐기 #민주사회복원,@myid01,2021-07-09T23:13:50.000Z,의료사각지대에 ‘원격의료’ 단말기 한 대를 가져다 놓고 국가의 책무를 다했다 하려는...,,,,,http://m.vop.co.kr/view.php?cid=695435… #한미FTA폐기
3,#한미FTA폐기 #민주사회복원,@myid01,2021-07-09T21:44:03.000Z,"박근혜는 노골적 민영화보다 모호하고 언뜻 좋아보이는 언사를 사용한다. 선별적복지, ...",,,4.0,5.0,http://m.vop.co.kr/view.php?cid=695435… #TPP반대...
4,#한미FTA폐기 #민주사회복원,@myid01,2021-07-09T10:44:10.000Z,"박근혜 의료민영화 추진일지-1)집권초 원격의료, 의료관광호텔 추진 2)영리자회사 추...",,,,,링크없음


In [17]:
print(link[3])

http://m.vop.co.kr/view.php?cid=695435… 
