# Scraping

In [1]:
import json
import os
import pickle
import requests
import time
import bs4 as bs
import pandas as pd
import spacy
import locale
from spacy import displacy
from datetime import datetime
from pprint import pprint
from collections import defaultdict
import iso3166
import time

In [2]:
#!python3 -m pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz


In [3]:
# Wall-clock start, used at the end of the scraping cell to report run time.
start_time = time.time()
# Quote characters stripped from aliases in get_name_and_alias.
remove = ["'", "‘", "’"]
# Root folders for downloaded images and for the exported data files.
images_path = "./images/"
data_path = "./data/"
# Every tag seen on organization / individual profiles
# (filled through the entity_tags argument of get_tags_and_country).
organizations_tags = set()
individuals_tags = set()
# Lower-cased ISO 3166 country names, used to spot country tags.
countries = [country.lower() for country in iso3166.countries_by_name.keys()]
# Countries detected in tags (filled by get_tags_and_country).
countries_set = set()
# Names of scraped entities (filled through the entity_set argument of
# get_name_and_alias).
organizations_set = set()
individuals_set = set()
# Section titles assigned, in order, to the headings met by get_details.
individual_titles = ["History", "Criminal Activities", "Geography", "Allies and Enemies", "Prospects"]
organization_titles = ["History", "Leadership", "Criminal Activity", "Geography", "Allies and Enemies", "Prospects"]
# Listing pages used as entry points by scrape_data.
individuals_url = "https://insightcrime.org/criminal-actors/?filter=personalities&country=0&orderby="
organizations_url = "https://insightcrime.org/criminal-actors/?filter=armed_groups&country=0&orderby="

In [4]:
# HTTP request headers sent with every page fetch.
# NOTE(review): the Access-Control-* entries are CORS *response* headers and
# presumably have no effect on a request; only User-Agent looks meaningful.
# They are kept as-is to avoid changing what is sent beyond the fix below.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

# Sample profile page fetched once for interactive exploration.
url = "https://insightcrime.org/caribbean-organized-crime-news/400-mawozo/"
# The dict must be passed by keyword: the second positional parameter of
# requests.get is `params`, which would append it to the URL as a query
# string instead of sending it as headers.
req = requests.get(url, headers=headers)
soup = bs.BeautifulSoup(req.content, 'html.parser')

In [5]:
#print(soup.prettify())

In [6]:
# Get url content (soup)
def get_soup(url):
    """Download *url* and return its content parsed with BeautifulSoup.

    The module-level ``headers`` dict is sent as HTTP headers.  It has to be
    passed by keyword: the second positional parameter of ``requests.get`` is
    ``params``, which would turn the dict into a query string instead.
    """
    response = requests.get(url, headers=headers)
    return bs.BeautifulSoup(response.content, 'html.parser')

In [7]:
def get_name_and_alias(soup, entity_set):
    """Return ``(name, alias)`` read from the page's first ``<h1>``.

    A title of the form ``"<name>, alias <alias>"`` is split on ``", alias"``;
    the characters listed in the module-level ``remove`` are dropped from the
    alias.  Any other title is returned whole as the name, with ``alias`` set
    to ``None``.  The name is also added to *entity_set*.

    The split is driven by the ``", alias"`` separator itself, so a title that
    merely contains the word "alias" no longer raises IndexError.  The name is
    stripped of surrounding whitespace in both cases.
    """
    target = soup.find("h1").text
    separator = ", alias"

    parts = target.split(separator)
    if len(parts) > 1:
        name = parts[0].strip()
        alias = ''.join(c for c in parts[1].strip() if c not in remove)
    else:
        name = target.strip()
        alias = None

    entity_set.add(name)
    return name, alias

def get_image(soup, name, folder):
    """Download the image(s) whose ``alt`` text contains *name*.

    Each matching image is written to
    ``<images_path><folder>/<name with ' ' and '/' replaced by '-'>.jpg``
    (whole path lower-cased).  When several images match, later ones
    overwrite earlier ones because they share the same destination.

    Returns the destination path, or ``None`` when no image matched
    (previously an UnboundLocalError).  Images without a ``src`` attribute
    are skipped, and the target folder is created if it is missing.
    """
    image_destination = None
    for image in soup.find_all('img', alt=True):
        if name not in image["alt"]:
            continue
        image_url = image.get("src")
        if not image_url:
            # Nothing to download for this tag.
            continue
        image_data = requests.get(image_url).content
        image_destination = f"{images_path}{folder}/{name.replace(' ', '-').replace('/', '-')}.jpg".lower()
        os.makedirs(os.path.dirname(image_destination), exist_ok=True)
        with open(image_destination, "wb") as file:
            file.write(image_data)
    return image_destination

def get_tags_and_country(soup, countries, entity_tags):
    """Return ``(tags, country)`` for a profile page.

    *tags* are the lower-cased texts of the ``<li>`` items inside the
    ``div.list-tags`` element; each one is also added to *entity_tags*.

    *country* is derived from the tags by substring matching against
    *countries* (the last matching tag wins):
      * a tag contained in a country name is taken as the country;
      * otherwise, a tag containing the part of a country name before its
        first comma yields the tag's first word.
    Every country found along the way is added to the module-level
    ``countries_set``.  ``None`` is no longer added when nothing has matched
    yet.  A page without a tag list returns ``([], None)``.
    """
    tag_container = soup.find("div", {"class": "list-tags"})
    tag_items = tag_container.find_all("li") if tag_container is not None else []
    tags = [item.text.lower() for item in tag_items]

    country = None
    for tag in tags:
        entity_tags.add(tag)
        if any(tag in known for known in countries):
            country = tag
        elif any(known.split(",")[0] in tag for known in countries):
            country = tag.split(" ")[0]
        if country is not None:
            countries_set.add(country)
    return tags, country

def get_details(soup, entity_set, titles):
    """Split the profile body into titled text sections.

    Walks the direct children of ``div.single-content``.  Text of ``<p>``
    elements without a ``style`` attribute is accumulated under the current
    title, starting with ``"Summary"``.  Each ``<h2>``/``<h3>`` closes the
    current section and opens the next one, named after the next entry of
    *titles* (the heading's own text is not used).

    On the "Ex-FARC Mafia" page, an element whose text is exactly
    ``"Leadership"`` is also treated as a heading.  The original check
    compared the ``(name, alias)`` tuple to a string, so it never fired; the
    name is now resolved once and compared directly.

    Returns a dict mapping section title to text; an empty dict when the
    content div is missing.

    Raises:
        IndexError: when the page has more headings than *titles* has
            entries (get_profile relies on this).
    """
    output = {}
    details = soup.find("div", {"class": "single-content"})
    if details is None:
        return output

    # Resolve the profile name once instead of once per child element.
    name, _ = get_name_and_alias(soup, entity_set)
    leadership_as_text = name.strip() == "Ex-FARC Mafia"

    title = "Summary"
    next_title = 0
    detail = ""
    for item in details:
        is_heading = item.name in ("h2", "h3") or (
            leadership_as_text and item.name is not None and item.text == "Leadership"
        )
        if is_heading:
            # Close the running section and start the next one.
            output[title] = detail
            title = titles[next_title]
            next_title += 1
            detail = ""
        elif item.name == "p" and item.get("style") is None:
            detail += item.text
        # Keep the running section stored so the last one is never lost.
        output[title] = detail

    return output
    
def ensure_page_is_profile(soup, url):
    """Follow the link inside ``div.text`` when the page has one.

    If *soup* contains a ``<div class="text">`` with an ``<a href>`` inside,
    that link is fetched with get_soup and ``(new_soup, link_url)`` is
    returned (presumably such pages are landing pages pointing to the actual
    profile; verify against the site).  Otherwise *soup* and *url* are
    returned unchanged, including when the div has no link (previously a
    TypeError).
    """
    target = soup.find("div", {"class": "text"})
    if target is None:
        return soup, url

    link = target.find("a", href=True)
    if link is None:
        return soup, url

    profile_url = link["href"]
    return get_soup(profile_url), profile_url
        

In [8]:
def get_latest_update_in_english(soup):
    """Parse the "latest update" date of a page using English month names.

    The text of ``span.autor`` is stripped of the ``"LATEST UPDATE "`` prefix,
    the ``" BY INSIGHT CRIME"`` suffix and newlines, then tried against the
    known date layouts in order.

    Side effect: sets the process-wide ``LC_TIME`` locale to ``en_US.UTF-8``,
    or to ``C`` (which also uses English month names) when that locale is
    not installed.

    Returns a ``datetime.date``.

    Raises:
        ValueError: when no layout matches.
    """
    try:
        locale.setlocale(locale.LC_TIME, 'en_US.UTF-8')
    except locale.Error:
        locale.setlocale(locale.LC_TIME, 'C')

    raw = soup.find("span", {"class": "autor"}).text.strip()
    latest_update = raw.replace("LATEST UPDATE ", "") \
                       .replace(" BY INSIGHT CRIME", "") \
                       .replace("\n", "")

    formats = (
        "%B %d, %Y",   # March 5, 2021
        "%d %b %Y",    # 5 Mar 2021
        "%Y-%m-%d",    # 2021-03-05
        "%d %B, %Y",   # 5 March, 2021
        "%b %d, %Y",   # Mar 5, 2021
    )
    for fmt in formats:
        try:
            return datetime.strptime(latest_update, fmt).date()
        except ValueError:
            continue
    raise ValueError(f"unrecognized English date: {latest_update!r}")
            
def get_latest_update_in_spanish(soup):
    """Parse the "latest update" date of a page using Spanish month names.

    The text of ``span.autor`` is cleaned the same way as in the English
    variant, then tried against the known date layouts in order while the
    ``LC_TIME`` locale is ``es_US.UTF-8``.

    The locale is always switched back to English afterwards (``en_US.UTF-8``,
    or ``C`` when that is not installed), even when parsing fails.  If the
    Spanish locale is not installed, parsing is still attempted under the
    current locale, so purely numeric layouts keep working.

    Returns a ``datetime.date``.

    Raises:
        ValueError: when no layout matches.
    """
    raw = soup.find("span", {"class": "autor"}).text.strip()
    latest_update = raw.replace("LATEST UPDATE ", "") \
                       .replace(" BY INSIGHT CRIME", "") \
                       .replace("\n", "")

    formats = (
        "%B %d, %Y",
        "%d %b %Y",
        "%Y-%m-%d",
        "%d %B DE %Y",
        "%Y-%m-%d %H:%M:%S",
        "%d %B, %Y",
    )

    try:
        locale.setlocale(locale.LC_TIME, 'es_US.UTF-8')
    except locale.Error:
        # Spanish locale unavailable: month-name layouts will simply not
        # match below, numeric ones still can.
        pass

    try:
        for fmt in formats:
            try:
                return datetime.strptime(latest_update, fmt).date()
            except ValueError:
                continue
        raise ValueError(f"unrecognized Spanish date: {latest_update!r}")
    finally:
        # Never leave the process in the Spanish locale.
        try:
            locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
        except locale.Error:
            locale.setlocale(locale.LC_TIME, "C")
            
def get_latest_update(soup):
    """Return the page's "latest update" date.

    Tries the English parser first and falls back to the Spanish one when the
    date cannot be parsed or the English locale is unavailable.  Only those
    two failures trigger the fallback, so KeyboardInterrupt and unrelated
    errors are no longer swallowed.
    """
    try:
        return get_latest_update_in_english(soup)
    except (ValueError, locale.Error):
        return get_latest_update_in_spanish(soup)

In [9]:
# Get profile
def get_profile(url, entity_set, entity_tags, titles, folder):
    """Scrape one profile page and return its fields as a dict.

    Keys, in insertion order: ``url_profile``, ``name``, ``alias``,
    ``image``, ``latest_update``, ``tags``, ``country``, followed by one key
    per text section returned by get_details.  When get_details raises
    IndexError the section keys are simply left out.
    """
    # The listing may point to an intermediate page; resolve the real one.
    listing_soup = get_soup(url)
    soup, url = ensure_page_is_profile(listing_soup, url)

    name, alias = get_name_and_alias(soup, entity_set)
    profile = {"url_profile": url, "name": name, "alias": alias}

    profile["image"] = get_image(soup, name, folder)
    profile["latest_update"] = get_latest_update(soup)

    tags, country = get_tags_and_country(soup, countries, entity_tags)
    profile["tags"] = tags
    profile["country"] = country

    try:
        sections = get_details(soup, entity_set, titles)
    except IndexError:
        # Exceptional case of an organization listed among individual
        # profiles (not a problem: the organization and its leader are
        # caught elsewhere).
        sections = {}
    profile.update(sections)

    return profile

#get_organization(url)

In [10]:
def scrape_data(url, func, entity_set, entity_tags, titles, folder):
    """Scrape every entry of a paginated listing.

    Args:
        url: first listing page (may carry a query string).
        func: callable invoked as
            ``func(target_url, entity_set, entity_tags, titles, folder)``
            for every entry; its result is stored as-is.
        entity_set, entity_tags, titles, folder: forwarded to *func*.

    Returns:
        dict mapping each entry's ``<h2>`` text to the result of *func*.

    Page numbers are read from the ``li.page-item`` elements of the first
    page.  Non-numeric pager labels are ignored, and a listing without any
    pager is treated as a single page (previously nothing was scraped).
    Pages after the first are fetched from ``<base>page/<n>/?<query>``.
    """
    output = {}
    soup = get_soup(url)

    # partition() also copes with a URL that has no query string.
    base_url, _, query = url.partition("?")

    labels = [page.text.strip() for page in soup.find_all("li", {"class": "page-item"})]
    pages = [label for label in labels if label.isdigit()] or ["1"]

    for page in pages:
        if page != "1":
            # Page 1 is the soup already fetched above.
            soup = get_soup(f"{base_url}page/{page}/?{query}")

        containers = soup.find_all("div", {"class": "row row-main"})
        if not containers:
            continue

        boxes = [box for box in containers[0].children if box != "\n"]
        for box in boxes:
            image_box = box.find("div", {"class": "image-box"})
            target_url = image_box.find("a", href=True)["href"]
            target_name = box.find("h2").text
            output[target_name] = func(target_url, entity_set, entity_tags, titles, folder)

    return output

# Scrape both listings; each call also fills the matching name set and tag set
# and stores images under the given sub-folder of images_path.
organizations = scrape_data(organizations_url, get_profile, organizations_set, organizations_tags, organization_titles, "organizations")
individuals = scrape_data(individuals_url, get_profile, individuals_set, individuals_tags, individual_titles, "individuals")
end_time = time.time()
# Elapsed wall-clock time since start_time, in minutes, rounded to 2 decimals.
execution_time = round((end_time - start_time) / 60, 2)
print(f"Execution time:{execution_time}")

Execution time:5.86


In [11]:
# Drop the "Rastrojos" entry from the individuals (presumably the organization
# that shows up in the individuals listing; see the IndexError note in
# get_profile).  pop() with a default keeps the cell re-runnable and does not
# fail if the entry is absent.
individuals.pop("Rastrojos", None)
# One row per scraped profile, indexed by the name shown in the listing.
organizations_df = pd.DataFrame.from_dict(organizations, orient="index")
individuals_df = pd.DataFrame.from_dict(individuals, orient="index")

In [12]:
# Make sure the output folder exists; to_csv does not create it.
os.makedirs(data_path or ".", exist_ok=True)
organizations_df.to_csv(f"{data_path}organizations.csv")
individuals_df.to_csv(f"{data_path}individuals.csv")

## Next
- Add a "More context" key containing all links found in the text
- Turn this into a function
- Do the same for organizations
- Add sentiment analysis for … (TODO: specify the target)

In [13]:
# Persist the collected names, tags and countries, one entry per line.
# UTF-8 is requested explicitly because names may contain accented
# characters and the platform default encoding is not guaranteed to cope.
with open(f"{data_path}organizations.txt", "w", encoding="utf-8") as f:
    for organization in organizations_set:
        f.write(organization + "\n")

with open(f"{data_path}organizations_tags.txt", "w", encoding="utf-8") as f:
    for organization_tag in organizations_tags:
        f.write(organization_tag + "\n")

with open(f"{data_path}individuals_tags.txt", "w", encoding="utf-8") as f:
    for individual_tag in individuals_tags:
        f.write(individual_tag + "\n")

with open(f"{data_path}countries.txt", "w", encoding="utf-8") as f:
    for country in countries_set:
        # Write the country itself (the loop used to write the last
        # organization name once per country).  None can end up in the set
        # when a profile has no country tag, so it is skipped.
        if country is not None:
            f.write(country + "\n")

with open(f"{data_path}individuals.txt", "w", encoding="utf-8") as f:
    for individual in individuals_df["name"].values:
        f.write(individual + "\n")

with open(f"{data_path}aliases.txt", "w", encoding="utf-8") as f:
    for alias in individuals_df["alias"].values:
        # Profiles without an alias hold None (or NaN after DataFrame
        # construction); only real strings are written.
        if isinstance(alias, str):
            f.write(alias + "\n")