## WITHIN THIS: Basic cleaning is conducted and visualised; key bigrams/trigrams are then extracted and visualised.

In [None]:
## General data processing and visualisation use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import os
import glob
import plotly.express as px

## For webscraping
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

## Machine learning / Deep learning classification models
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import tree
from sklearn.metrics import ConfusionMatrixDisplay

## XGBoost as extra
import xgboost as xgb

## Set the pandas display option set to max_columns
pd.set_option('display.max_columns', None)

## Natural language processing
from collections import Counter
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import spacy


**Remember that the scraping script needs to be run at least twice, because requesting too large a date range of observations from nwac.ac fails with error code 500** -> The resulting files need to be concatenated to get one complete dataframe.

**IN MY CASE:** They have been saved as **"avalanche_occurance_1.csv"** and **"avalanche_occurance_2.csv"**

**MORE IN SECOND FILE**

## Base Data Cleaning (01)

**The data is split across two dataframes (obtained by running the script twice over two different date ranges), so they need to be joined together.**

1. Null values are handled

Not much cleaning was needed for visualisation, so only null values were affected


In [None]:
def location_to_elevation(entry):
    """Get the elevation, if it exists, out of a location string.

    Parameters
    ----------
    entry : str or any
        Raw location text. Non-string values (for example the float NaN that
        pandas uses for a missing cell) are treated as "no elevation found".

    Returns
    -------
    list of str
        Every standalone run of digits found in ``entry``, in order of
        appearance; an empty list when there is none.
    """
    # Missing cells arrive as float NaN, which re.findall cannot scan.
    if not isinstance(entry, str):
        return []
    return re.findall(r'\b\d+\b', entry)

In [None]:
# Load the two scraped CSV files and merge them into one dataset on disk.
# Less recent dates
df1 = pd.read_csv('avalanche_occurance_1.csv')
# More recent dates
df2 = pd.read_csv('avalanche_occurance_2.csv')

# Concat both the datasets (more recent rows first)
df = pd.concat([df2, df1])
# 'Unnamed: 0' is presumably the index column written when the scraped
# files were saved - TODO confirm against the scraping script.
df.drop(columns=['Unnamed: 0'], inplace=True)
# Written with the default index, so the file gains its own 'Unnamed: 0'
# column, which the next cell drops after re-reading.
df.to_csv('main_avalanche_observations_dataset.csv')
df.head()

In [None]:
# Read the csv of avalanche observations and add to variable df
df = pd.read_csv('main_avalanche_observations_dataset.csv')
# Drop the index column that to_csv wrote alongside the data.
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head()

In [None]:
# Given info_observation, fill in missing values with advanced_observations
# In turn giving one column, drop the advanced_observations col.
# The result is assigned back instead of using a chained
# `df[col].fillna(..., inplace=True)`, which newer pandas versions warn about
# and which does not update `df` under Copy-on-Write.
# The guard makes the cell safe to re-run after the column has been dropped.
if 'advanced_observations' in df.columns:
    df['info_observation'] = df['info_observation'].fillna(df['advanced_observations'])
    df.drop(columns=['advanced_observations'], inplace=True)

In [None]:
# Change nulls in [avalanche_Y/N, instability] to No.
# Assigned back rather than filled through a chained inplace call, which
# newer pandas versions warn about and ignore under Copy-on-Write.
df['avalanche_Y/N'] = df['avalanche_Y/N'].fillna('No')
df['instability'] = df['instability'].fillna('No')
df.head()


In [None]:
# Given info_observation, fill in missing values with advanced_observations
# In turn giving one column, drop the advanced_observations col.
# This cell repeats an earlier one: once advanced_observations has been
# dropped, the unguarded version raised KeyError, so it is now a no-op then.
# The fill is assigned back because a chained inplace fillna does not update
# `df` under pandas Copy-on-Write.
if 'advanced_observations' in df.columns:
    df['info_observation'] = df['info_observation'].fillna(df['advanced_observations'])
    df.drop(columns=['advanced_observations'], inplace=True)

In [None]:
# Change nulls in [avalanche_Y/N, instability] to No.
# Assigned back rather than filled through a chained inplace call, which
# newer pandas versions warn about and ignore under Copy-on-Write.
# Running it again on already-filled columns changes nothing.
df['avalanche_Y/N'] = df['avalanche_Y/N'].fillna('No')
df['instability'] = df['instability'].fillna('No')
df.head()

In [None]:
# Get all the possible elevations given a location
# - some have been added but not many -> thus not included.
# Each cell of 'elevation' becomes the list of digit strings that
# location_to_elevation finds in 'location_ele'.
df['elevation'] = df['location_ele'].apply(location_to_elevation)
df['elevation'].head()

In [None]:
# Probably cannot be used directly as too many values are missing;
# could the elevation be recovered with NLP instead?
df.elevation.value_counts()

## EDA - Base data cleaning (02)

**This section was a bit rushed; the code needs to be refactored slightly**

In [None]:
def calc_null(dataframe):
    """Calculate the amount of NULLs in a dataframe per column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose missing values are counted.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataframe`` (the column name is the index),
        with ``missing_vals`` (count of NA cells) and ``percentage`` (share of
        rows that are NA, rounded to 3 decimals), sorted by ``percentage`` in
        descending order.
    """
    total_count = len(dataframe)

    # Count on the argument itself; the earlier version read the global `df`
    # and therefore ignored whatever frame was passed in.
    missing_df = pd.DataFrame(dataframe.isna().sum(), columns=['missing_vals'])
    missing_df['percentage'] = (missing_df['missing_vals'] / total_count * 100).round(3)

    return missing_df.sort_values('percentage', ascending=False)

In [1]:
''' Shows count of the avalanche occures categorised into Y/N'''

# avalanche_Y/N has no nulls at this point: they were filled with 'No'
# during the base cleaning above.
# Count the Yes/No values inside each region and flatten to a plain frame.
disp_df = df.groupby('region')[['avalanche_Y/N']].value_counts().reset_index().copy()
# The third column holds the counts; give it a fixed name and sort by it.
disp_df = disp_df.rename(columns={ disp_df.columns[2]: "count" }).sort_values(['count'], ascending=False)

# Initialise figure
plt.figure(figsize = (5,5))
# One bar per region and Yes/No value.
sns.barplot(
    data=disp_df,
    x='region',
    y='count',
    hue='avalanche_Y/N'
)
# Rotate the region names so they do not overlap.
plt.xticks(
    rotation=45,
    horizontalalignment = 'right',
    fontweight = 'light',
    fontsize = 'large'
)
plt.title('Y/N occurances of avalanches per region')
# plt.savefig('Y/N_avalanches_region.png', dpi=300, bbox_inches='tight')
plt.show()

NameError: name 'df' is not defined

In [None]:
''' Shows the missing percentage df '''

# Use previously created function to check amount of missing values and %
df_missing_original = calc_null(df)

# Reset the index so the column names become a regular 'index' column
df_missing_original_disp = df_missing_original.reset_index()

# Initialise the figure and run seaborn for visualisations.
plt.figure(figsize = (6, 6))

# One bar per dataframe column, height = percentage of missing values.
sns.barplot(
    data = df_missing_original_disp,
    x = 'index',
    y = 'percentage'
)
# Rotate the column names so they do not overlap.
plt.xticks(
    rotation=45,
    horizontalalignment='right',
    fontweight ='light',
    fontsize ='large'
)
plt.title('percentage of nulls per column')
plt.show()

In [None]:
''' Unique display of avalanches per given feature '''

# Take the 'unique' row of describe(): number of distinct values per column.
# NOTE(review): describe() only has a 'unique' row when it summarises
# non-numeric columns - confirm this holds for the scraped data.
u_df = df.describe().loc['unique'].copy()
temp_df = pd.DataFrame({'unique_count':u_df})
temp_df = temp_df.sort_values('unique_count', ascending=False)
# Kept separately so the raw numbers can be printed under the chart.
values = temp_df.unique_count
plt.figure(figsize=(5,5))
# One bar per column, height = number of distinct values.
sns.barplot(
    data=temp_df,
    x=temp_df.index,
    y='unique_count'
)
# Rotate the column names so they do not overlap.
plt.xticks(
    rotation=45,
    horizontalalignment = 'right',
    fontweight = 'light',
    fontsize = 'large'
)
plt.title('unique counts per column')
# plt.savefig('Y/N_avalanches_region.png', dpi=300, bbox_inches='tight')
plt.show()
print(values)

In [None]:
# Check describe: summary statistics of the cleaned dataframe
df.describe()

In [None]:
# Check info: column dtypes and non-null counts
df.info()

# NLP break down and testing for enrichment of dataset (03)

**A FURTHER IMPROVEMENT FOR THIS SECTION IS TO SPLIT THE YES AND NO AVALANCHES AND THEN DO EXACTLY WHAT I DID BELOW; THAT MAKES IT EASIER TO FIND OUT WHAT KIND OF TEXTURE OCCURS PER GIVEN INSTANCE OF AVALANCHE**

**A further step is to figure out whether the text is in the past tense or not: find the most common adjectives and go from there? - obviously this needs to be done for both yes and no, as the snow texture can be thought of as leading up to the avalanche**

In [None]:
def clean_text(text, token=False):
    """Clean and preprocess the input text.

    The text is lower-cased, stripped of URLs, HTML tags, @mentions and
    t.co link remnants, and every remaining non-word character is replaced by
    a space. It is then tokenised and filtered against the NLTK English stop
    words, extended with question words, a few social-media leftovers and
    punctuation characters. 'no' is deliberately kept.

    Parameters
    ----------
    text : any
        Raw observation text; it is converted with ``str`` first.
    token : bool, default False
        When True, return the cleaned text as a list of tokens instead of a
        single space-joined string.

    Returns
    -------
    str or list of str
        The cleaned text, or its tokens when ``token`` is True.
    """
    text = str(text).lower()

    # Removal of:
    text = re.sub(r'http\S+', '', text)  # Urls
    text = re.sub(r'<.*?>', '', text)  # Html tags
    text = re.sub(r'(@.+?)\s', '', text)  # @mentions
    text = re.sub(r'(//t.co/.+?)\s', '', text)  # t.co link remnants
    text = re.sub(r'(//t.co/.+?)', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)  # Special characters

    # Tokenise sentence by sentence, then flatten into one list of words.
    words = []
    for sentence in sent_tokenize(text):
        words.extend(word_tokenize(sentence))

    # Build the stop-word set. The punctuation is a raw string so that the
    # backslash is a literal character rather than an invalid escape.
    stop_words = set(stopwords.words('english'))
    stop_words.update(['who', 'what', 'where', 'when', 'why', 'how', 'which'])
    stop_words.update(['rt', '#', 'fav', '', ':', '@', '!', ';', '…', '...', '(', ')', '~'])
    stop_words.update(r'!"#$%&()*+, -./:;<=>?@[\]^_`{|}~”“' + "'")

    # 'no' carries meaning in the observations, so it must survive.
    stop_words.discard('no')

    cleaned = ' '.join(word for word in words if word not in stop_words)

    if token:
        return word_tokenize(cleaned)
    return cleaned

In [None]:
# Check to see how well it works: clean the first observation and print it
# next to the untouched original for comparison.
string_original = df['info_observation'].head(1).values
string = string_original[0]
string_stem_test = clean_text(string)

print(string_stem_test)
print("\n")
print(string_original)

In [None]:
# Apply to the whole dataset now that it works:
# 'observation_cleaned' holds the cleaned string per row and
# 'observation_cleaned_tokens' the same text split into tokens.
df['observation_cleaned'] = df['info_observation'].apply(clean_text)
df['observation_cleaned_tokens'] = df['observation_cleaned'].apply(word_tokenize)

In [None]:
# Rows and columns after adding the cleaned-text columns
df.shape

## Bigrams/Trigrams extraction & evaluation (04)

**EXPLANATION:**
1. **extract_ngrams_k():** function where k is the number of grams; it extracts pairings of words (here 2 to 4) and is called first so the pairings can be stored in a new column.
2. **collect_dict_bi_tri_quad()** function is called. It checks whether the right columns exist and, if they don't, attempts to make them. Counters are set up to be used later to find the most dominant pairings in the text. Calls full_dictionary_words_count()
3. **full_dictionary_words_count()** function is called. It creates empty dictionaries that are filled with pairings, calling dict_word_collection() to collect all of the pairings. The updated dictionaries are returned.
4. The filled dictionaries with the most dominant word pairings based on **limit** are passed to the **show_word_cloud_via_dict()** function to create and display a word cloud.

The process is called on all existing columns of bigrams, trigrams and quadgrams; it just needs to be called on the dataframe.

In [None]:
# Idea is that the text is already cleaned
# Using bigrams and tri grams functions to apply to dataframe
def _extract_ngrams(text, n):
    """Return every run of ``n`` consecutive tokens of ``text``.

    Each run is stored as the ``str`` of its tuple, e.g. "('wet', 'snow')".
    Texts with fewer than ``n`` tokens give an empty list.
    """
    doc = word_tokenize(text)
    return [str(tuple(doc[i:i + n])) for i in range(len(doc) - n + 1)]


def extract_ngrams_2(text):
    """Return the bigrams of ``text`` as a list of tuple-strings."""
    return _extract_ngrams(text, 2)


def extract_ngrams_3(text):
    """Return the trigrams of ``text`` as a list of tuple-strings."""
    return _extract_ngrams(text, 3)


def extract_ngrams_4(text):
    """Return the quadgrams of ``text`` as a list of tuple-strings."""
    return _extract_ngrams(text, 4)


In [None]:
def dict_word_collection(obs_list, obs_dict, limit):
    """Copy the entries whose count exceeds ``limit`` into ``obs_dict``.

    Input: ``obs_list`` - mapping of n-gram to its count,
           ``obs_dict`` - dictionary that receives the kept entries (mutated),
           ``limit`` - a count must be strictly greater than this to be kept.
    Output: the same ``obs_dict`` object, after the update.
    """
    frequent = {gram: count for gram, count in obs_list.items() if count > limit}
    obs_dict.update(frequent)
    return obs_dict

In [None]:
def full_dictionary_words_count(counter_bi, counter_tri, counter_quad, limit=5):
    """Keep only the frequent n-grams of each counter, for visualisation.

    Input: ``counter_bi``, ``counter_tri``, ``counter_quad`` - count mappings
           (for example ``collections.Counter``) of bigrams, trigrams and
           quadgrams; ``limit`` - an n-gram is kept only when its count is
           strictly greater than this value (default 5).
    Output: tuple of three dictionaries (bigrams, trigrams, quadgrams) that
            map each kept n-gram to its count.
    """
    # Each counter is filtered into its own fresh dictionary.
    return tuple(
        dict_word_collection(dict(counter), {}, limit=limit)
        for counter in (counter_bi, counter_tri, counter_quad)
    )


In [None]:
def collect_dict_bi_tri_quad(df):
    """Count the n-grams of a dataframe and return the frequent ones.

    Input: dataframe with the columns 'bigrams', 'trigrams' and 'quadgrams'
           (each cell an iterable of n-grams). Any of these columns that is
           missing is created in place from 'observation_cleaned'.
    Output: tuple of three dictionaries (bigrams, trigrams, quadgrams) as
            returned by full_dictionary_words_count().
    Raises: whatever error creating a missing column raised (typically
            KeyError when 'observation_cleaned' is absent), after printing
            a hint.
    """
    ngram_cols = ('bigrams', 'trigrams', 'quadgrams')
    missing = [col for col in ngram_cols if col not in df.columns]

    if missing:
        print(f"cols in DF must include - [['bigrams','trigrams','quadgrams']]. \n Attempting to create one for user. \n Make sure text col is named observation_cleaned (does not need to be processed using NLP) \n-> missing: {missing}")

        # If not created, create it for the user
        builders = {
            'bigrams': extract_ngrams_2,
            'trigrams': extract_ngrams_3,
            'quadgrams': extract_ngrams_4,
        }
        try:
            for col in missing:
                df[col] = df['observation_cleaned'].apply(builders[col])
        except Exception as e2:
            print(f"Attempted to create one. You sure the text field is called observation_cleaned?. \n-> {e2.__context__} // {e2}")
            # Callers unpack three dictionaries, so returning nothing would
            # only fail later with a less helpful error.
            raise

    # Count only once every column is known to exist, so that no counter is
    # updated twice.
    counters = []
    for col in ngram_cols:
        counter = Counter()
        df[col].apply(counter.update)
        counters.append(counter)

    return full_dictionary_words_count(*counters)

In [None]:
def show_word_cloud_via_dict(dict_col):
    """Draw a word cloud from a dictionary of frequencies.

    Input: ``dict_col`` - mapping of word or n-gram to its count.
    Output: None; the figure is shown with matplotlib. When ``dict_col`` is
            empty a message is printed and nothing is drawn.
    """
    # WordCloud cannot lay out zero words, so skip empty collections
    # (for example when no n-gram passed the frequency limit).
    if not dict_col:
        print("No entries to display in the word cloud.")
        return

    # Word cloud for observations or bi-grams
    wc = WordCloud(width=800, height=400, max_words=50).generate_from_frequencies(dict_col)
    plt.figure(figsize=(24, 20))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
## Apply: one column of bigrams, trigrams and quadgrams per observation,
## each built from the cleaned text.
df['bigrams'] = df['observation_cleaned'].apply(extract_ngrams_2)
df['trigrams'] = df['observation_cleaned'].apply(extract_ngrams_3)
df['quadgrams'] = df['observation_cleaned'].apply(extract_ngrams_4)

In [None]:
# Get dictionary of total occurances count of pairs of words
# (three dictionaries: bigrams, trigrams, quadgrams over the whole dataset)
bi_collection_dict, tri_collection_dict, quad_collection_dict = collect_dict_bi_tri_quad(df)

# Show the word clouds
print("----- OVERALL COLLECTION -----")
show_word_cloud_via_dict(bi_collection_dict)
show_word_cloud_via_dict(tri_collection_dict)
show_word_cloud_via_dict(quad_collection_dict)

## EDA - NLP: BIGRAMS/TRIGRAMS -> AVALANCHE/AREA OCCURANCE (05)

In [None]:
# Initialise conditions: boolean masks for rows with / without an avalanche
condition_yes = df['avalanche_Y/N'] == 'Yes'
condition_no = df['avalanche_Y/N'] == 'No'

# Get the df copies, keeping only the label and the text-derived columns
df_yes = df[condition_yes][['avalanche_Y/N', 'observation_cleaned_tokens', 'bigrams', 'trigrams', 'quadgrams']].copy()
df_no = df[condition_no][['avalanche_Y/N', 'observation_cleaned_tokens', 'bigrams', 'trigrams', 'quadgrams']].copy()

# Get dictionary of total occurances count of pairs of words,
# separately for the 'Yes' and the 'No' rows
bi_collection_y_dict, tri_collection_y_dict, quad_collection_y_dict = collect_dict_bi_tri_quad(df_yes)
bi_collection_n_dict, tri_collection_n_dict, quad_collection_n_dict = collect_dict_bi_tri_quad(df_no)

# Show the word clouds
print("----- YES COLLECTION -----")
show_word_cloud_via_dict(bi_collection_y_dict)
show_word_cloud_via_dict(tri_collection_y_dict)
show_word_cloud_via_dict(quad_collection_y_dict)

print("\n----- NO COLLECTION -----")
show_word_cloud_via_dict(bi_collection_n_dict)
show_word_cloud_via_dict(tri_collection_n_dict)
show_word_cloud_via_dict(quad_collection_n_dict)


In [None]:
## Delete occurances of bigrams and trigrams that are empty as that means there is nothing interesting to deal with
# NOTE(review): the extract_ngrams_* functions always return a list, and an
# empty list is not NA, so notna() presumably keeps every row here - confirm
# whether rows with empty lists were meant to be removed.
df = df[df['trigrams'].notna()]
df = df[df['bigrams'].notna()]

**IMPORTANT**: an interesting way of texturising snow:

Via the bigram and trigram dictionary word clouds that were made, I am able to find the most dominant terms and then categorise them slightly better. Using the public snow observation sentences that were gathered from nwac.ac, take the most dominant combinations and categorise them into the following:
1. Strong
2. Weak
3. Wet
4. Frozen
5. FRESH
6. STORM



In [None]:
# Re-order the Yes/No bigram and trigram dictionaries by count, most
# frequent first (the quadgram dictionaries are left as they are).
bi_collection_y_dict = dict(sorted(bi_collection_y_dict.items(), key=lambda item: item[1], reverse=True))
bi_collection_n_dict = dict(sorted(bi_collection_n_dict.items(), key=lambda item: item[1], reverse=True))
tri_collection_y_dict = dict(sorted(tri_collection_y_dict.items(), key=lambda item: item[1], reverse=True))
tri_collection_n_dict = dict(sorted(tri_collection_n_dict.items(), key=lambda item: item[1], reverse=True))

In [None]:
## Make multi-labeled instead of just one.
def texturise_snow_bi(observation_arr):
    """Label an observation with the snow textures its bigrams mention.

    Input: ``observation_arr`` - iterable of bigrams; each one is compared,
           via ``str``, against the module-level lookup lists wet_bi_arr,
           dry_bi_arr, weak_bi_arr, strong_bi_arr, hoar_bi_arr and
           storm_bi_arr.
    Output: list of labels out of 'Wet', 'Dry', 'Strong', 'Weak', 'Frozen'
            and 'Storm'; empty when nothing matched.

    Every label is added at most once. 'Wet'/'Dry' exclude each other, as do
    'Weak'/'Strong': whichever is matched first wins. The labels come back
    in a fixed order so that equal combinations always look the same.
    """
    # (label, lookup list, label that excludes it, debug message)
    rules = (
        ('Wet', wet_bi_arr, 'Dry', 'Returning wet'),
        ('Dry', dry_bi_arr, 'Wet', 'returning dry'),
        ('Weak', weak_bi_arr, 'Strong', 'returning weak'),
        ('Strong', strong_bi_arr, 'Weak', 'returning strong'),
        ('Frozen', hoar_bi_arr, None, 'returning frozen'),
        ('Storm', storm_bi_arr, None, None),
    )

    found = []
    for gram in observation_arr:
        gram = str(gram)
        # Only the first rule that accepts the bigram is applied.
        for label, lookup, rival, message in rules:
            if gram in lookup and label not in found and rival not in found:
                if message is not None:
                    print(message)
                found.append(label)
                break

    # Fixed order: moisture, strength, then 'Frozen', then 'Storm'. This also
    # covers 'Dry' + 'Frozen', which previously came back in encounter order.
    rank = {'Wet': 0, 'Dry': 0, 'Strong': 1, 'Weak': 1, 'Frozen': 2, 'Storm': 3}
    labels = sorted(found, key=rank.get)

    # Two combinations have always been emitted with 'Storm' before
    # 'Frozen'; keep that so existing label lists stay comparable.
    if set(labels) in ({'Storm', 'Frozen'}, {'Wet', 'Weak', 'Storm', 'Frozen'}):
        labels[-2], labels[-1] = labels[-1], labels[-2]

    return labels


In [None]:
def texturise_snow(observation_arr):
    """Label an observation with the snow textures its single words mention.

    Input: ``observation_arr`` - iterable of tokens; each one is compared,
           via ``str``, against the module-level lookup lists wet_arr,
           dry_arr, weak_arr, strong_arr, hoar_arr and storm_arr.
    Output: list of labels out of 'Wet', 'Dry', 'Strong', 'Weak', 'Frozen'
            and 'Storm'; empty when nothing matched.

    Every label is added at most once. 'Wet'/'Dry' exclude each other, as do
    'Weak'/'Strong': whichever is matched first wins. The labels come back
    in a fixed order so that equal combinations always look the same.
    """
    # (label, lookup list, label that excludes it, debug message)
    rules = (
        ('Wet', wet_arr, 'Dry', 'Returning wet'),
        ('Dry', dry_arr, 'Wet', 'returning dry'),
        ('Weak', weak_arr, 'Strong', 'returning weak'),
        ('Strong', strong_arr, 'Weak', 'returning strong'),
        ('Frozen', hoar_arr, None, 'returning frozen'),
        ('Storm', storm_arr, None, None),
    )

    found = []
    for word in observation_arr:
        word = str(word)
        # Only the first rule that accepts the word is applied.
        for label, lookup, rival, message in rules:
            if word in lookup and label not in found and rival not in found:
                if message is not None:
                    print(message)
                found.append(label)
                break

    # Fixed order: moisture, strength, then 'Frozen', then 'Storm'. This also
    # covers 'Dry' + 'Frozen', which previously came back in encounter order.
    rank = {'Wet': 0, 'Dry': 0, 'Strong': 1, 'Weak': 1, 'Frozen': 2, 'Storm': 3}
    labels = sorted(found, key=rank.get)

    # Two combinations have always been emitted with 'Storm' before
    # 'Frozen'; keep that so existing label lists stay comparable.
    if set(labels) in ({'Storm', 'Frozen'}, {'Wet', 'Weak', 'Storm', 'Frozen'}):
        labels[-2], labels[-1] = labels[-1], labels[-2]

    return labels


In [None]:

# Initialisation of categorical snow textures
# Each *_bi_arr entry is the str() form of a bigram tuple, the same format
# that extract_ngrams_2 produces, so membership can be tested directly.
# Some entries are listed twice; that does not affect membership tests.
wet_bi_arr =[
    "('rain', 'crust')","('wet', 'snow')","('melt', 'forms')","('loose', 'wet')","('moist', 'snow')","('small', 'wet')","('wet', 'avalanches')","('snow', 'wet')","('snow', 'moist')","('light', 'rain')","('wet', 'slide')" ,"('thick', 'melt')","('hard', 'melt')","('wet', 'loose')", "('wet', 'slides')","('wet', 'avalanches')","('wet', 'slabs')", "('remained', 'wet')", "('wetter', 'snow')"
]
dry_bi_arr = [
    "('dry', 'snow')","('dry', 'loose')","('soft', 'snow')","('loose', 'dry')","('cold', 'dry')","('dry', 'powder')","('dry', 'avalanches')","('dry', 'powder')", "('warm', 'weather')"
]
weak_bi_arr = [
    "('low', 'density')","('weak', 'snow')","('thin', 'crust')","('weaker', 'snow')","('loose', 'activity')","('poorly', 'bonded')","('loose', 'snow')","('weak', 'layers')", "('weak', 'layer')", "('softer', 'surface')", "('sun', 'crust')", "('new', 'snow')"
]
strong_bi_arr = [
    "('well', 'bonded')","('density', 'snow')","('hard', 'crust')","('firm', 'crust')","('thick', 'crust')","('bonded', 'crust')","('bonding', 'well')","('snow', 'hard')", "('snow', 'heavy')", "('stable', 'snow')"
]
## Separate category
storm_bi_arr =[
    "('recent', 'storm')","('new', 'storm')","('mid', 'storm')","('storm', 'layer')"
]
# aka frozen: matches from this list are labelled 'Frozen'
hoar_bi_arr = [
    "('melt', 'freeze')", "('freeze', 'crust')","('freezing', 'rain')","('ice', 'crust')","('ice', 'layer')","('hoar', 'layer')","('hoar', 'observed')","('hoar', 'near')","('hoar', 'frost')", "('hoary', 'surface')", "('hoar', 'surface')", "('surface', 'hoar')"
]

# Single words for the nans after bigram check
# (looked up by texturise_snow against the cleaned tokens)
wet_arr = ['wet']
dry_arr = ['dry', 'powder']
weak_arr = ['weak']
strong_arr = ['strong', 'durable', 'firm', 'consolidated', 'stable']
storm_arr = ['storm']
hoar_arr = ['frozen', 'cold']

**Save to a column per each bigram/trigram extraction**

In [None]:
# Label every observation from its bigrams, then show the share of each
# label combination.
df['snow_condition_per_bigram'] = df['bigrams'].apply(texturise_snow_bi)
df['snow_condition_per_bigram'].value_counts(normalize=True)

In [None]:
# Label every observation from its single tokens, then show the share of
# each label combination.
df['snow_condition_per_word'] = df['observation_cleaned_tokens'].apply(texturise_snow)
df['snow_condition_per_word'].value_counts(normalize=True)

In [None]:
def make_na(x):
    """Turn an empty list into NaN; return every other value unchanged."""
    return np.nan if [] == x else x

def make_fresh(x):
    """Turn an empty list into ['Fresh']; return every other value unchanged."""
    return ['Fresh'] if [] == x else x

**FILL NEW NA FROM OBSERVATION EXTRACTIONS**

In [None]:
# Some came out as empty so setting them as nan
# (this lets them be filled from the per-word labels afterwards)
df['snow_condition_per_bigram'] = df['snow_condition_per_bigram'].apply(make_na)

In [None]:
## SAVE IT! - checkpoint of the dataframe before the bigram labels are
## back-filled from the per-word labels
df.to_csv('avalanche_types.csv')

In [None]:
# Where the bigrams gave no label, fall back to the per-word label.
# Assigned back rather than filled through a chained inplace call, which
# newer pandas versions warn about and ignore under Copy-on-Write.
df['snow_condition_per_bigram'] = df['snow_condition_per_bigram'].fillna(df['snow_condition_per_word'])
df['snow_condition_per_bigram'].value_counts(normalize=True)
df.dtypes

In [None]:
# Make empty == null again (the per-word fallback can itself be an empty
# list) and keep only the rows that are not null
df['snow_condition_per_bigram'] = df['snow_condition_per_bigram'].apply(make_na)
df = df[df['snow_condition_per_bigram'].notna()]
# Overwrites the checkpoint file with the labelled rows only.
df.to_csv('avalanche_types.csv')

In [None]:
# Replace any remaining empty label list with ['Fresh'] and show the share
# of each label combination.
# NOTE(review): if the preceding cell has run, empty lists were already
# turned into NaN and dropped, so this presumably changes nothing - confirm.
df['snow_condition_per_bigram'] = df['snow_condition_per_bigram'].apply(make_fresh)
df['snow_condition_per_bigram'].value_counts(normalize=True)

In [None]:
# Save the final labelled dataset
df.to_csv('avalanche_type_fresh.csv')