## General imports #1

In [None]:
'''
    All needed imports for the cells to run accordingly.
'''
## General data processing and visualisation use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import os
import glob
import plotly.express as px

## For webscraping
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

## Machine learning / Deep learning classification models
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import tree
from sklearn.metrics import ConfusionMatrixDisplay

## XGBoost as extra
import xgboost as xgb

## Set the pandas display option set to max_columns
pd.set_option('display.max_columns', None)

## Natural language processing
from collections import Counter
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import spacy


## Web scraping functions prep #2

Data gathered from: nwac.us -> Washington mountains public observations

**WEBSITE IS VERY SPECIFIC**: There appears to be a variety of mash-ups between containers when looking at the detailed overview of an avalanche observation. Will need to copy the href link from the detailed observation and call that separately.

In [None]:
'''
    Function - extra_info_gather()
    input - {str:site_link, arr:date, arr:zone, arr:location, arr:recent_av, arr:cracking, arr:collapsing, arr:info_observation, arr:cloud_cover, arr:wind, arr:advanced_observation_comments}
    output - arrays appended with data from the web address "site_link"

    NOTE: all of the parameter variables are arrays that get appended with data from nwac.us. Specific to the website due to its chaotic structure of containers and divs. Haven't YET found a way to scrape it without getting all data.
'''

def extra_info_gather(site_link, date, zone, location, recent_av, cracking, collapsing, info_observation, cloud_cover, wind, advanced_observation_comments):
    """Scrape one observation detail page and append one value to every list.

    The page at ``site_link`` is fetched with ``requests`` and parsed with
    BeautifulSoup. Exactly one entry is appended to each of the ten lists on
    every call, so all of them grow in step with the number of pages scraped.
    When a heading occurs more than once on a page, the first match is kept.

    Args:
        site_link: URL of the observation detail page.
        date: receives the text next to the 'Observation Date:' heading,
            or ``np.nan`` when the heading is absent.
        zone: same for 'Zone or Region:' (``np.nan`` when absent).
        location: same for 'Location:' (``np.nan`` when absent).
        recent_av: same for 'Recent Avalanches? ' ('No' when absent).
        cracking: same for 'Cracking? ' ('None Reported' when absent).
        collapsing: same for 'Collapsing? ' ('None Reported' when absent).
        info_observation: receives the first paragraph of the div whose
            ``<h4>`` reads 'Observations' (``np.nan`` when absent).
        cloud_cover: receives the 'Cloud Cover:' column of the
            'advanced-observations' container (``np.nan`` when absent).
        wind: receives the 'Wind:' column of the same container
            (``np.nan`` when absent).
        advanced_observation_comments: receives the concatenated text of all
            'field-value' paragraphs in that container (``np.nan`` when
            there are none).

    Returns:
        The ten lists, in the order they were passed in. They are the same
        objects the caller supplied, extended in place.
    """
    response = requests.get(site_link)
    # Short pause between consecutive requests.
    time.sleep(0.1)
    # NOTE(review): the HTTP status is not checked; a page without the
    # expected markup simply yields the fallback values below.
    soup = BeautifulSoup(response.text, 'html.parser')

    # <h5> heading text -> key in `found`. The trailing spaces are part of
    # the headings being matched and must be kept.
    h5_fields = {
        'Observation Date:': 'date',
        'Zone or Region:': 'zone',
        'Location:': 'location',
        'Recent Avalanches? ': 'recent_av',
        'Cracking? ': 'cracking',
        'Collapsing? ': 'collapsing',
    }
    found = {}

    for div_instance in soup.find_all('div', class_='col-xs-12'):
        heading = div_instance.h5
        if heading is not None:
            heading_text = str(heading.text)
            field = h5_fields.get(heading_text)
            if field is not None and field not in found:
                # The value is the div's text with the heading removed.
                found[field] = div_instance.text.replace(heading_text, '').strip()
            continue

        # No <h5>: the free-text observation sits under an <h4> instead.
        heading = div_instance.h4
        if heading is None or str(heading.text) != 'Observations':
            continue
        paragraph = div_instance.p
        if paragraph is not None and 'info_observation' not in found:
            found['info_observation'] = str(paragraph.text).strip()

    # Weather summary and free-text comments share one optional container.
    advanced_section = soup.find('div', class_='advanced-observations')
    weather = {}
    comment_paragraphs = []
    if advanced_section is not None:
        comment_paragraphs = advanced_section.find_all('p', class_='field-value')
        for column in advanced_section.find_all('div', class_='col-xs-4'):
            heading = column.h5
            if heading is None:
                continue
            label = heading.text
            if label in ('Cloud Cover:', 'Wind:') and label not in weather:
                # Collapse runs of whitespace, then drop the label itself.
                flattened = " ".join(column.text.split())
                weather[label] = flattened.replace(label, '')

    cloud_cover.append(weather.get('Cloud Cover:', np.nan))
    wind.append(weather.get('Wind:', np.nan))

    if comment_paragraphs:
        advanced_observation_comments.append(
            ''.join(str(paragraph.text).strip() for paragraph in comment_paragraphs)
        )
    else:
        advanced_observation_comments.append(np.nan)

    # Append the scraped value, or a placeholder that keeps the list lengths
    # consistent when the page did not provide the field.
    fallbacks = (
        ('date', date, np.nan, "date added was not found"),
        ('zone', zone, np.nan, "zone added was not found"),
        ('location', location, np.nan, "location added was not found"),
        ('recent_av', recent_av, 'No', "av added was not found"),
        ('cracking', cracking, 'None Reported', "cracking added was not found"),
        ('collapsing', collapsing, 'None Reported', "collapsing added was not found"),
        ('info_observation', info_observation, np.nan, "observation comments added was not found"),
    )
    for field, target, default, message in fallbacks:
        if field in found:
            target.append(found[field])
        else:
            print(message)
            target.append(default)

    return date, zone, location, recent_av, cracking, collapsing, info_observation, cloud_cover, wind, advanced_observation_comments


In [None]:
'''
    Function - links_extra()
    Input - {webdriver:browser, arr:links, arr:links_arr}
    Output - arr:links arr, filled array with all links to each instance of an avalanche

    NOTE: uses webdriver to get data and click accordingly to get more data, then passed to beautiful soup for extraction, does this repeatedly for every entry of the page to gather url links to extra information about avalanches. again specific to nwac.us.
'''
def links_extra(browser, links, links_arr):
    """Collect the "open in new tab" URL behind each clickable row element.

    Args:
        browser: Selenium webdriver showing the observations page.
        links: clickable elements found on the current page; every element
            except the last one is clicked.
        links_arr: list the extracted ``href`` values are added to.

    Returns:
        ``links_arr`` with one ``href`` added per clicked element.
    """
    total = len(links)
    print(total)

    # The final element is skipped on purpose: per the original note the
    # last index is an instagram link rather than an observation.
    for position in range(total - 1):
        links[position].click()

        # Parse the page as it looks after the click and read the target of
        # the anchor carrying the 'obs-open-new' class.
        page = BeautifulSoup(browser.page_source, 'html')
        anchor = page.find('a', class_='obs-open-new')
        links_arr += [anchor.get('href')]

        # Click the element with class 'close' before moving to the next row.
        browser.find_element(By.CLASS_NAME, 'close').click()

    return links_arr


In [None]:
## Collect data finds all the needed data from website using beautiful soup and appends
''' '''
def base_collect_data(soup, dates, observer, region, location, avalanche, instability):
    """Append the observation-table columns found in ``soup`` to the lists.

    Args:
        soup: already-parsed page; queried for ``<td>`` cells by class.
        dates: receives the text of the ``<a>`` inside each 'date' cell.
        observer: receives the text of each 'observer' cell.
        region: receives the text of each 'zone' cell.
        location: receives the text of each 'location' cell.
        avalanche: receives the text of each 'avalanches' cell.
        instability: receives the text of each 'instability' cell.

    Returns:
        The six lists, extended in place, in the order they were passed in.
    """
    # Date cells wrap their text in a link, so it is read from the anchor.
    dates.extend([cell.a.text for cell in soup.findAll('td', class_='date')])

    # The remaining columns carry their text directly in the cell.
    plain_text_columns = (
        ('observer', observer),
        ('zone', region),
        ('location', location),
        ('avalanches', avalanche),
        ('instability', instability),
    )
    for css_class, target in plain_text_columns:
        target.extend([cell.text for cell in soup.findAll('td', class_=css_class)])

    return dates, observer, region, location, avalanche, instability

## WEB SCRAPING EXTRACTION #3

1. It seems that the website returns a 500 error if you hand it too large a date range; it seemed to work really well for 4 years of data at a time as the absolute max. Anything above that will return error 500.

In [None]:
'''
 By using WebDriver, this cell can be upgraded to fit a variety of inputted seasons.
 Future improvements, if I have enough time left...
 The cell is very specific to nwac, as its way of getting data via next buttons works slightly differently.
 Use of all previous created functions, this is the main cell that runs the web scrape.

 IMPORTANT NOTES:
    - If wanting to scrape the observations from the nwac.us website, you have a 45 second window after the browser window opens, during which this cell sleeps using time; this is so that you can input the wanted date range in the web driver instance. After 45 seconds the cell will start scraping the entire section and its pages accordingly.
    - The website gets a request error 500 when exceeding a certain range of extracted dates, if wanting to extract everything:
        - 1st extract from 1st JAN 2020 till 30th NOV 2022
        - 2nd extract from 1st DEC 2022 till CURRENT (if you put a date above current, should still work)
        - Thus needs to be run twice due to the server error that occurs when attempting to extract the full range.
'''

# Lists filled from the paginated observations table, one entry per row.
dates = []
observer = []
region = []
location = []
avalanche = []
instability = []
links = []

# Lists filled from each observation's detail page, one entry per link.
date_arr = []
zone_arr = []
location_arr = []
recent_av_arr = []
cracking_arr = []
collapsing_arr = []
info_observation_arr = []
image_links_arr = []
cloud_cover_arr = []
wind_arr = []
advanced_observation_comments_arr = []


# Initialise the browser
browser = webdriver.Chrome()
site = 'https://nwac.us/observations/#/obs'
browser.get(site)
# Give the user time to pick the date range within the chrome instance.
time.sleep(45)

# Find all button indexes for pages
selects = browser.find_elements(By.CLASS_NAME, 'paginate_button')

# Minus 1 due to the last increment not changing the html layout, called twice
len_run = int(selects[6].text) - 1
# Layout of index pages changes on click 5
first_change_limit = 5

# Walk through the index pages; how many there are depends on the date range
# chosen in the browser.
for i in range(len_run):

    # Decide which pagination button(s) to press on this step.
    if i < first_change_limit:
        # Early steps: the button to press sits at the step's own index.
        button_indexes = [i]
    elif i <= len_run - 2:
        # After the layout change the button to press stays at index 4.
        button_indexes = [4]
    else:
        # Last step: the layout does not update for the last two entries,
        # so indexes 5 and 6 are pressed one after the other.
        button_indexes = [5, 6]
    is_last_step = len(button_indexes) == 2

    try:
        for button_index in button_indexes:
            # Re-query the buttons before every click.
            selects = browser.find_elements(By.CLASS_NAME, 'paginate_button')
            print(f'--- CURRENT PAGE BEING SCRAPED {selects[button_index].text}')
            selects[button_index].click()
            # Wait after every click, including the final one, before the
            # page is read.
            time.sleep(1)

            # Getting extra links for extra info
            extra_info = browser.find_elements(By.CLASS_NAME, 'strong')
            links = links_extra(browser, extra_info, links)

            # Get the source of the page and soup it, append arrays of data
            html = browser.page_source
            soup = BeautifulSoup(html, 'html')
            dates, observer, region, location, avalanche, instability = base_collect_data(
                soup, dates, observer, region, location, avalanche, instability
            )
    except Exception as e:
        # Only the last step is best-effort; earlier failures still stop
        # the run.
        if not is_last_step:
            raise
        print(f'Raised exception - {e}')

    if is_last_step:
        print("--- EOW - End of web index - FINISHED ---")

# Visit every collected detail link and extend the detail lists.
for site_link in links:
    print(f"SCRAPING EXTRA DETAILS - {site_link}")

    (
        date_arr,
        zone_arr,
        location_arr,
        recent_av_arr,
        cracking_arr,
        collapsing_arr,
        info_observation_arr,
        cloud_cover_arr,
        wind_arr,
        advanced_observation_comments_arr,
    ) = extra_info_gather(
        site_link,
        date_arr,
        zone_arr,
        location_arr,
        recent_av_arr,
        cracking_arr,
        collapsing_arr,
        info_observation_arr,
        cloud_cover_arr,
        wind_arr,
        advanced_observation_comments_arr,
    )



In [None]:
# With collected data, import into pandas dataframe.
# One column per scraped list. 'zone', 'location_ele', 'cracking',
# 'collapsing', 'info_observation' and 'advanced_observations' come from the
# detail pages; the remaining columns come from the index table.
df = pd.DataFrame.from_dict({
    'dates': dates,
    'observer': observer,
    'region': region,
    'zone': zone_arr,
    'location': location,
    'avalanche_Y/N': avalanche,
    'instability': instability,
    'links': links,
    'location_ele': location_arr,
    'cracking': cracking_arr,
    'collapsing': collapsing_arr,
    'info_observation': info_observation_arr,
    'advanced_observations': advanced_observation_comments_arr,
})

In [None]:
# Usually save, however already been done so commented out
# Remember, would need to be run twice on different ranges to get full data due to 500 web address error.

In [None]:
# Save
# A '/' in the name is a directory separator, so 'avalanche_occurance_1/2.csv'
# would point at a file '2.csv' inside a folder 'avalanche_occurance_1'.
# Save the first date range as '..._1.csv'; change the name to
# 'avalanche_occurance_2.csv' when running the second date range.
df.to_csv('avalanche_occurance_1.csv') # Change to whatever name you want

**Remember that the script needs to be run at least 2x, as requesting too large a range of observations at nwac.us fails with error code 500** -> Will need to concat the data together to get one complete dataframe.

**IN MY CASE:** They have been saved as **"avalanche_occurance_1.csv"** and **"avalanche_occurance_2.csv"**

**MORE IN SECOND FILE**