In [2]:
from bs4 import BeautifulSoup
import requests 
import re
import pandas as pd

## First part - get Data: Basic functions that will be used 
Functions include:
getPagesToCrawl - a function that creates a list of pages that should be used in order to achieve the full data.
getTableDataFromPage - from a single page we get back a table that stores the data
cleanHtmlCode - the first step that we take in order to clean our data, we remove the HTML from it and leave the info.
createDf - gets 8 lists that contains data and return a df

In [3]:
def getPagesToCrawl():
    baseUrl="https://www.start.umd.edu/gtd/search/Results.aspx?page="
    amountPages = "&count=2000" 
    pageList = []
    
    for currentPageNumber in range(1,102):
        pageList.append(baseUrl + str(currentPageNumber) +amountPages)
        
    return pageList

In [4]:
def getTableDataFromPage(pageUrl):
    user_agent = {'User-agent': 'Mozilla/5.0'}
    response = requests.get(pageUrl,headers=user_agent)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find('table')
    return table

In [5]:
def cleanHtmlCode(tableData):
    GTD_ID = []
    DATE = []
    COUNTRY = []
    CITY = [] 
    PERPETRATOR_GROUP = []
    FATALITIES = []
    INJURED = []
    TARGET_TYPE = []

    listRows = []
    for tr in tableData.findAll("tr",attrs={}):
        for td in tr:
            if(td != "\n"):
                listRows.append(td.text)


    # A lot of data that we don't need (The name of the columns (8)) 
    for i in range(8):
        listRows.pop(0)
    
    for i in range(len(listRows)):
        iMod = i%8
        if(iMod == 0):
            GTD_ID.append(listRows[i])
        elif(iMod == 1):
            DATE.append(listRows[i])
        elif(iMod == 2):
            COUNTRY.append(listRows[i])
        elif(iMod == 3):
            CITY.append(listRows[i])
        elif(iMod == 4):
            PERPETRATOR_GROUP.append(listRows[i])
        elif(iMod == 5):
            FATALITIES.append(listRows[i])
        elif(iMod == 6):
            INJURED.append(listRows[i])
        elif(iMod == 7):
            TARGET_TYPE.append(listRows[i])
       
    
    return GTD_ID,DATE,COUNTRY,CITY,PERPETRATOR_GROUP,FATALITIES,INJURED,TARGET_TYPE


In [6]:
def createDf(GTD_ID,DATE,COUNTRY,CITY,PERPETRATOR_GROUP,FATALITIES,INJURED,TARGET_TYPE):
    df = pd.DataFrame({"GTD_ID":GTD_ID,
                   "DATE":DATE,
                   "COUNTRY":COUNTRY,
                   "CITY":CITY,
                   "PERPETRATOR_GROUP":PERPETRATOR_GROUP,
                   "FATALITIES":FATALITIES
                   ,"INJURED":INJURED,
                   "TARGET_TYPE":TARGET_TYPE,
                   })
    return df

## First part - get Data: Flow

In [7]:
# 8 Lists that will store the data.
GTD_ID = []
DATE = []
COUNTRY = []
CITY = [] 
PERPETRATOR_GROUP = []
FATALITIES = []
INJURED = []
TARGET_TYPE = []

In [None]:
# First we Need the URLS which will be used by the crawlling method.
pageList = getPagesToCrawl()

# For each page inside our pageList:
for page in pageList:
    
    # Get the table data from the page with getTableDataFromPage function.
    tableData = getTableDataFromPage(page)
    
    # Extract 8 lists (according to parameter) with cleanHtmlCode function.
    GTD_ID_temp,DATE_temp,COUNTRY_temp,CITY_temp,PERPETRATOR_GROUP_temp,FATALITIES_temp,INJURED_temp,TARGET_TYPE_temp = cleanHtmlCode(tableData)
    
    # The the temp lists and EXTEND the data to non-temp list.
    GTD_ID.extend(GTD_ID_temp)
    DATE.extend(DATE_temp)
    COUNTRY.extend(COUNTRY_temp)
    CITY.extend(CITY_temp)
    PERPETRATOR_GROUP.extend(PERPETRATOR_GROUP_temp)
    FATALITIES.extend(FATALITIES_temp)
    INJURED.extend(INJURED_temp)
    TARGET_TYPE.extend(TARGET_TYPE_temp)

In [29]:
# After the crawlling method is done all we have left to do is create a DF with createDf function and save the df to csv file.
df = createDf(GTD_ID,DATE,COUNTRY,CITY,PERPETRATOR_GROUP,FATALITIES,INJURED,TARGET_TYPE) 
df.to_csv('GTD_Data_Frame.csv')

In [64]:
# Reading the df once again to see that everything is ok
df = pd.read_csv('GTD_Data_Frame.csv')
df

Unnamed: 0.1,Unnamed: 0,GTD_ID,DATE,COUNTRY,CITY,PERPETRATOR_GROUP,FATALITIES,INJURED,TARGET_TYPE
0,0,201912310033,2019-12-31,China,Hong Kong,Unknown,0,0,Government (General)
1,1,201912310032,2019-12-31,India,Bagiot Dora,Unknown,0,1,Private Citizens & Property
2,2,201912310031,2019-12-31,Sudan,El Geneina,Unknown,2,0,"Government (General),Police"
3,3,201912310030,2019-12-31,Sudan,El Geneina,Unknown,2,1,Police
4,4,201912310028,2019-12-31,Iraq,Baghdad,Unknown,0,0,Private Citizens & Property
...,...,...,...,...,...,...,...,...,...
301178,301178,197001000003,1970-01-00,Japan,Fukouka,Unknown,Unknown,Unknown,Government (Diplomatic)
301179,301179,197001000002,1970-01-00,Greece,Athens,Unknown,Unknown,Unknown,Government (Diplomatic)
301180,301180,197001000001,1970-01-00,Philippines,Unknown,Unknown,1,0,Journalists & Media
301181,301181,197000000002,1970-00-00,Mexico,Mexico city,23rd of September Communist League,0,0,Government (Diplomatic)


In [None]:
df = df.drop_duplicates(subset=['GTD_ID'], keep='first')
df.to_csv('GTD_Data_Frame_Without_Duplicates.csv')

##   Second part - Multi-Crawl
###  Get more data according to GTD_ID

In [23]:
def getLinksForSecondCrawl(GTD_ID):
    baseUrl = "https://www.start.umd.edu/gtd/search/IncidentSummary.aspx?gtdid="
    urlList = []
    
    for id in GTD_ID:
        urlList.append(baseUrl +str(id))
    
    return urlList
    

In [13]:
df = pd.read_csv('GTD_Data_Frame_Without_Duplicates.csv')

In [None]:
# GTD_ID will be used in order to get the pages which we need to crawl to get the data 
GTD_ID = df.GTD_ID

In [21]:
df[df["GTD_ID"] == 201912310018]

Unnamed: 0.1,Unnamed: 0,GTD_ID,DATE,COUNTRY,CITY,PERPETRATOR_GROUP,FATALITIES,INJURED,TARGET_TYPE
13,13,201912310018,2019-12-31,Yemen,Ataq district,Al-Islah Party,Unknown,Unknown,Government (General)


In [28]:
urlList = getLinksForSecondCrawl(GTD_ID)

In [30]:
urlForCheck = "https://www.start.umd.edu/gtd/search/IncidentSummary.aspx?gtdid=201912270020"