In [1]:
from urllib.request import urlopen, Request
from lxml import html
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

In [2]:
# Request headers sent with every page fetch: a desktop-browser User-Agent.
# The value is assembled with implicit string concatenation rather than a
# backslash continuation inside the literal, which would embed the next
# line's indentation into the header as a run of spaces.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'
    )
}

In [3]:
def thread_ids(pages, verbose=True):
    """Collect the distinct thread ids listed on the given forum index pages.

    Each page number in ``pages`` is requested from
    ``forumdisplay.php?f=46`` on www.fxp.co.il, and the ``id`` attribute of
    every ``<li>`` directly under ``<ol id="threads">`` is converted to an
    integer thread id.

    Args:
        pages: Iterable of page numbers to fetch.
        verbose: When true, print a progress line and the current time after
            every tenth fetched page.

    Returns:
        A list of unique thread ids (ints). Duplicates across pages are
        collapsed through a ``set``, so the order is not the listing order.

    Side effects:
        Prints a start message and, before returning, the 20 most frequent
        ids (a quick check for ids seen on more than one page).
    """
    print('Fetching pages...')
    # Named differently from the function so the local does not shadow it.
    collected = []
    for count, page in enumerate(pages, start=1):
        request = Request(url='https://www.fxp.co.il/forumdisplay.php?f=46&page=%s' % page,
                          headers=headers)
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(request) as response:
            tree = html.fromstring(response.read().decode('utf-8'))

        # slice [7:] to eliminate the 7-character prefix of the element id
        # (presumably 'thread_' -- confirm against the page markup)
        collected += [int(x[7:]) for x
                      in tree.xpath("//ol[@id='threads']/li/@id")]

        if verbose and (count % 10 == 0):
            print('Fetched %s pages' % (count))
            print(datetime.now().time())
    print(pd.Series(collected).value_counts().head(20))
    return list(set(collected))


In [4]:
def _fetch_tree(url):
    """Download ``url`` with the notebook's request headers and parse it as HTML."""
    request = Request(url=url, headers=headers)
    # The context manager closes the HTTP response once the body is read.
    with urlopen(request) as response:
        return html.fromstring(response.read().decode('utf-8'))


def _post_texts(tree):
    """Return the direct text of every post body found on a parsed thread page."""
    bodies = tree.xpath(
        "//div[@id='postlist']//blockquote[@class='postcontent restore']")
    # './text()' selects only the blockquote's own text nodes, not nested markup.
    return [' '.join(body.xpath('./text()')) for body in bodies]


def page_content(thread_id):
    """Scrape the title and all posts of one forum thread.

    The first page of the thread is always fetched. If it contains a
    ``first_last`` navigation link, the page number in that link is taken as
    the last page and pages 2..last are fetched as well.

    Args:
        thread_id: Id of the thread, as used in ``showthread.php?t=<id>``.

    Returns:
        A DataFrame with columns ``thread`` and ``post``. The first row holds
        the thread title (when the page has one), followed by one row per
        post in page order.
    """
    thread_url = 'https://www.fxp.co.il/showthread.php?t=%s' % thread_id
    tree = _fetch_tree(thread_url)

    # Rows are gathered in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    records = []

    title = tree.xpath("//div[@class='titleshowt greengr']/h1/text()")
    if title:
        # Previously the result of append() was discarded here, so the title
        # row never reached the returned frame.
        records.append({'thread': thread_id, 'post': title[0]})

    records.extend({'thread': thread_id, 'post': text}
                   for text in _post_texts(tree))

    last_page_links = tree.xpath("//span[@class='first_last']/a/@href")
    if last_page_links:
        match = re.search('page=([0-9]+)', last_page_links[0])
        # A link without a page number is treated as "no further pages".
        last_page = int(match.group(1)) if match else 1
        for page in range(2, last_page + 1):
            page_tree = _fetch_tree('%s&page=%s' % (thread_url, page))
            records.extend({'thread': thread_id, 'post': text}
                           for text in _post_texts(page_tree))

    return pd.DataFrame(records, columns=['thread', 'post'])

In [5]:
def thread_content(thread_ids, file, verbose=True, max_workers=None):
    """Scrape many threads concurrently and write all their rows to a CSV file.

    ``page_content`` is run for every id on a thread pool; the resulting
    frames are concatenated in completion order (not input order) and
    written to ``file`` as UTF-8 CSV.

    Args:
        thread_ids: Collection of thread ids to scrape. May be empty, in
            which case a header-only CSV is written.
        file: Path or buffer passed to ``DataFrame.to_csv``.
        verbose: Accepted for interface compatibility; it does not currently
            change what is printed.
        max_workers: Size of the thread pool. ``None`` (default) uses one
            worker per thread id, capped at 32, so that a long id list does
            not start one OS thread per forum thread.

    Raises:
        Any exception raised by ``page_content`` for one of the ids is
        re-raised when its result is collected, before the CSV is written.
    """
    print('Fetching threads...')
    ids = list(thread_ids)
    if max_workers is None:
        # ThreadPoolExecutor rejects max_workers <= 0, so keep at least one
        # worker even when there is nothing to fetch.
        max_workers = max(1, min(len(ids), 32))

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(page_content, thread_id) for thread_id in ids]
        print('finish to sumbit all jobs')
        # Collect every frame first and concatenate once, instead of
        # re-copying the accumulated frame on each completed job.
        frames = [future.result()
                  for future in concurrent.futures.as_completed(futures)]

    if frames:
        df = pd.concat(frames)
    else:
        df = pd.DataFrame(columns=['thread', 'post'])
    df.to_csv(file, encoding='utf-8')

In [6]:
# Collect the thread ids listed on the requested forum index pages, scrape
# every one of those threads, write the rows to CSV, and time the whole run.
# NOTE(review): range(1, 2) covers page 1 only, while the output file name
# suggests pages 1-240 -- confirm which of the two is intended.
%time thread_content(thread_ids(range(1, 2)), '1to240.csv')

Fetching pages...
20168007    1
20168907    1
20169039    1
20157551    1
20168729    1
20169125    1
20167142    1
20163704    1
20169160    1
20165737    1
20169131    1
20169164    1
20169081    1
20167136    1
20169135    1
20168839    1
20168748    1
20168499    1
20168566    1
20168695    1
dtype: int64
Fetching threads...
finish to sumbit all jobs
Wall time: 12.9 s
