In [2]:
"""
This script scrapes the records listed on the Cryptocurrency Pump & Dump
site developed by CoinCheckup and exports them to a CSV file.
"""

__author__ = 'Georgios Goniotakis'
__email__ = 'georgios.goniotakis@outlook.com'
__license__ = 'MIT'
__date__ = 'August 13, 2018'
__version__ = '1.0'

In [3]:
import os
import time

import csv
import requests
import pandas as pd

from bs4 import BeautifulSoup

In [4]:
FILE_PATH = "data.csv" # Path of the CSV file the scraped records are written to
COIN_URL = "https://pumpdump.coincheckup.com/page/" # Base URL; the page number is appended to it
N_PAGES = 15 # Number of pages to scrape, starting at page 1 (15 means pages 1-15)

In [5]:
def create_file(path=None):
    """
    Creates the export file on disk and writes the header row into it.
    A file that already exists at the same location is overwritten.
    :param path: Destination of the CSV file; defaults to FILE_PATH
    """
    labels = ["Cryptocurrency", "Pump % Gain", "Timeframe", "Start time", "Start price: BTC", "Start price: USD",
              "End price: BTC", "End price: USD", "Publishing time (GMT +2)"]
    target = FILE_PATH if path is None else path

    # Mode 'w' already truncates an existing file, so no separate delete is
    # needed. newline='' leaves line endings to the csv module (otherwise
    # Windows writes "\r\r\n"), and the explicit encoding keeps the output
    # independent of the platform's locale.
    with open(target, 'w', newline='', encoding='utf-8') as file:
        wf = csv.writer(file, quoting=csv.QUOTE_ALL)
        wf.writerow(labels)

In [1]:
def write_file(data, path=None):
    """
    Appends one record to the end of the export file.
    :param data: Sequence of field values forming the new record
    :param path: Destination of the CSV file; defaults to FILE_PATH
    """
    target = FILE_PATH if path is None else path

    # newline='' leaves line endings to the csv module (otherwise Windows
    # writes "\r\r\n"); the explicit encoding keeps non-ASCII values from
    # depending on the platform's locale.
    with open(target, 'a', newline='', encoding='utf-8') as file:
        wf = csv.writer(file, quoting=csv.QUOTE_ALL)
        wf.writerow(data)

In [6]:
def clear_file(path=None):
    """
    Deletes the old export file, doing nothing when it does not exist.
    :param path: File to delete; defaults to FILE_PATH
    :raises OSError: If the file exists but cannot be removed
    """
    target = FILE_PATH if path is None else path

    # Only a missing file is expected and ignored; other failures (for
    # example missing permissions) are real problems and must stay visible.
    try:
        os.remove(target)
    except FileNotFoundError:
        pass

In [7]:
def replace_del(data):
    """
    Splits a row element on every delimiter it contains ("," and "/") and
    strips the surrounding whitespace from each piece. When both delimiters
    are present, the pieces of the "," split are followed by the pieces of
    the "/" split.
    :param data: Row element (string)
    :return: List of pieces, or a one-element list holding the stripped
             element when it contains no delimiter
    """
    pieces = []
    for delimiter in (",", "/"):
        if delimiter in data:
            pieces.extend(part.strip() for part in data.split(delimiter))

    if pieces:
        return pieces
    return [data.strip()]

In [8]:
def replace_keyword(data):
    """
    Removes the currency keywords "BTC" and "USD" from the given elements.
    Every element that contains a keyword yields one stripped copy per
    matching keyword; elements without a keyword are left out of that result.
    When no element contains a keyword, the input is returned unchanged.
    :param data: Iterable of row elements (strings)
    :return: List of sanitized strings, or the original input
    """
    keywords = ("BTC", "USD")
    cleaned = []
    for element in data:
        for keyword in keywords:
            if keyword in element:
                cleaned.append(element.replace(keyword, "").strip())

    return cleaned or data

In [9]:
def process_record(r):
    """
    Sanitizes the text of one row element: it is first split on its
    delimiters by replace_del and the pieces are then passed through
    replace_keyword.
    :param r: Row data (text of the element)
    :return: Result of replace_keyword for the split pieces
    """
    split_parts = replace_del(r)
    return replace_keyword(split_parts)

In [10]:
def fetch_data():
    """
    Initiates the process of data collection: recreates the export file,
    scrapes pages 1 to N_PAGES and reports the elapsed time.
    """
    create_file()

    print("Process is starting. Extracting Pump & Dump info for the first {} pages.".format(N_PAGES))
    # perf_counter() measures wall-clock time. process_time() only counts CPU
    # time and leaves out the time spent waiting for the network, which is
    # where a scraper spends most of its run.
    start_time = time.perf_counter()

    for p in range(1, N_PAGES + 1):
        perform_request(COIN_URL + str(p))

    print("Process complete! Data saved in: {}. Time elapsed: {}".format(FILE_PATH, time.perf_counter() - start_time))

In [11]:
def _extract_record(row):
    """
    Collects the CSV fields of a single "arbitrage-row" div.
    :param row: BeautifulSoup tag of the record
    :return: List of field values (empty fields are stored as None), or None
             when an expected element is missing from the record
    """
    # Name of the coin
    name_cell = row.find("div", class_="col-1")
    if name_cell is None:
        return None
    details = [name_cell.text.strip()]

    # Sanitize the remaining columns; one column can yield several fields
    for i in range(2, 6):
        cell = row.find("div", class_="col-" + str(i))
        if cell is None:
            return None
        for e in process_record(cell.text):
            # Compare by value: "is ''" tests object identity, not emptiness
            details.append(None if e is None or e == '' else e)

    # Date and time of pump and dump: second piece of the split timerow text
    time_cell = row.find("div", class_="timerow")
    if time_cell is None:
        return None
    time_parts = process_record(time_cell.text.strip())
    if len(time_parts) < 2:
        return None
    details.append(time_parts[1])

    return details


def perform_request(url, timeout=30):
    """
    Performs the scraping of a given URL and appends every complete record
    found on the page to the export file.
    :param url: URL address to be scraped
    :param timeout: Seconds to wait for the server before giving up
    :raises requests.exceptions.RequestException: If the request fails or
            times out
    """
    print("Extracting info from page: {}".format(url))
    # Without a timeout a stalled connection would block the run forever
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # An error page holds no records; report it instead of parsing it
        print("Skipping page {}: HTTP status {}".format(url, response.status_code))
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Replace the <br> tag with comma so the values it separates can be split
    for br in soup.find_all("br"):
        br.replace_with(",")

    # Find all the divs that have this class and extract them from source
    records = soup.find_all("div", class_="arbitrage-row")

    # For each div (record) in list
    for r in records:
        details = _extract_record(r)

        # Ensures that only records with full details are included into the produced file
        if details is not None and len(details) == 9 and all(v is not None for v in details):
            write_file(details)

In [12]:
# Start the scraping run: recreates the CSV file and fetches every configured page
fetch_data()

Process is starting. Extracting Pump & Dump info for the first 15 pages.
Extracting info from page: https://pumpdump.coincheckup.com/page/1
Extracting info from page: https://pumpdump.coincheckup.com/page/2
Extracting info from page: https://pumpdump.coincheckup.com/page/3
Extracting info from page: https://pumpdump.coincheckup.com/page/4
Extracting info from page: https://pumpdump.coincheckup.com/page/5
Extracting info from page: https://pumpdump.coincheckup.com/page/6
Extracting info from page: https://pumpdump.coincheckup.com/page/7
Extracting info from page: https://pumpdump.coincheckup.com/page/8
Extracting info from page: https://pumpdump.coincheckup.com/page/9
Extracting info from page: https://pumpdump.coincheckup.com/page/10
Extracting info from page: https://pumpdump.coincheckup.com/page/11
Extracting info from page: https://pumpdump.coincheckup.com/page/12
Extracting info from page: https://pumpdump.coincheckup.com/page/13
Extracting info from page: https://pumpdump.coinchec

In [None]:
# Load the exported CSV back to see that everything worked fine.
# read_csv() needs the file to read; calling it without one raises TypeError.
df = pd.read_csv(FILE_PATH)