# Table of Contents
1. [download/read csv file](#sec1)
2. [Convert the labeled data set tags into the new individual tag columns](#sec2)
3. [NLTK](#sec3)
4. [Classifier Building](#sec4)

In [5]:
#packages
#%pip install numpy
#%pip install pandas
import numpy as np
import pandas as pd

<a id = "sec1"></a>
## **Download/Read CSV file**

In [6]:
# read csv
github_csv_url = "https://raw.githubusercontent.com/JL72005/PIT-UN-Project4/refs/heads/main/NEW_TAGS_dc_md_va_flash_floods_1996_present.xlsx%20-%20dc_md_va_flash_floods_1996_pres.csv?token=GHSAT0AAAAAADEFGP75TFWP7C46X2NUKJCQ2CAU3HQ"
df = pd.read_csv(github_csv_url)
# print first 5 rows
print(df.head())

   BEGIN_YEARMONTH  BEGIN_DAY  BEGIN_TIME  END_YEARMONTH  END_DAY  END_TIME  \
0           199601         19        1100         199601       19      1300   
1           199606         18          30         199606       18       200   
2           199606         20        2200         199606       20      2300   
3           199609          6        1200         199609        6      1800   
4           199611          8        1600         199611        8      1715   

   EPISODE_ID  EVENT_ID                 STATE  STATE_FIPS  ...  END_LOCATION  \
0     2403644   5541358  DISTRICT OF COLUMBIA          11  ...           ALL   
1     1014286   5561204  DISTRICT OF COLUMBIA          11  ...     N PORTION   
2     2040906   5561207  DISTRICT OF COLUMBIA          11  ...     NORTHWEST   
3     2049837   5572405  DISTRICT OF COLUMBIA          11  ...      DOWNTOWN   
4     2049872   5572790  DISTRICT OF COLUMBIA          11  ...      CITYWIDE   

  BEGIN_LAT BEGIN_LON END_LAT  END_LON  \
0 

<a id = "sec2"></a>
## **Convert the labeled data set tags into the new individual tag columns**

In [7]:
#list of tags
tag_names = ['death','injury','evacuation','rescue','car_crash','home_damage','infrastructure_damage','soft_infrastructure_damage','road_closure','power_outage','tree_damage','vehicle_loss','agricultural_damage','campground_damage']
#list of weather events
weather_names = ['nor_easter','thunderstorm','hurricane','tornado','lightning','mudslide']

In [8]:
#populate columns for OG tags
for tag in tag_names:
    df[tag] = df['IMPACT_PREDICTORS'].apply(str).apply(lambda x: 1 if tag in x else 0)
for tag in weather_names:
    df[tag] = df['weather_events'].apply(str).apply(lambda x: 1 if tag in x else 0)

In [9]:
#check columns
df[tag_names+weather_names].head(15)

Unnamed: 0,death,injury,evacuation,rescue,car_crash,home_damage,infrastructure_damage,soft_infrastructure_damage,road_closure,power_outage,tree_damage,vehicle_loss,agricultural_damage,campground_damage,nor_easter,thunderstorm,hurricane,tornado,lightning,mudslide
0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
7,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0


<a id="sec3"></a>
## **NLTK**

In [10]:
#imports
#%pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

### Tokenization and Stop Word Removal

In [11]:
# TOKENIZATION & REMOVING STOP WRODS

#download necessary NLTK data
nltk.download('punkt_tab')
nltk.download('stopwords')
#download stopwords
stop_words = set(stopwords.words('english'))

def remove_stopword(text):
  tk = RegexpTokenizer(r'\w+')
  word_tokens = tk.tokenize(text)
  filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
  filtered_text = " ".join(filtered_sentence)
  return filtered_text

df['TOKEN_EPISODE_NARRATIVE'] = df['EPISODE_NARRATIVE'].apply(str).apply(remove_stopword)
df['TOKEN_EVENT_NARRATIVE'] = df['EVENT_NARRATIVE'].apply(str).apply(remove_stopword)
df[['TOKEN_EPISODE_NARRATIVE', 'TOKEN_EVENT_NARRATIVE']].head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\emmab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emmab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,TOKEN_EPISODE_NARRATIVE,TOKEN_EVENT_NARRATIVE
0,unusually intense squall line feeding unseason...,
1,Intense thunderstorms moved northeast half Was...,
2,rapidly developing thunderstorm knocked numero...,
3,Feeder bands torrential tropical rains associa...,
4,Several roads briefly closed torrential rains ...,


### Stemming

In [12]:
# STEMMING

stemmer = PorterStemmer()

def stem_narrative(text):
    token_list = text.split(" ")
    stemmed_list = []
    for word in token_list:
        stemmed_list.append(stemmer.stem(word))
    return stemmed_list

df['STEM_EPISODE_NARRATIVE'] = df['TOKEN_EPISODE_NARRATIVE'].apply(str).apply(stem_narrative).apply(lambda x: " ".join(x)) #where x is the list of stemmed words
df['STEM_EVENT_NARRATIVE'] = df['TOKEN_EVENT_NARRATIVE'].apply(str).apply(stem_narrative).apply(lambda x: " ".join(x)) #where x is the list of stemmed words
df['STEM_EPISODE_NARRATIVE'].head()


0    unusu intens squall line feed unseason warm mo...
1    intens thunderstorm move northeast half washin...
2    rapidli develop thunderstorm knock numer tree ...
3    feeder band torrenti tropic rain associ remnan...
4    sever road briefli close torrenti rain associ ...
Name: STEM_EPISODE_NARRATIVE, dtype: object

### Lemmatization: Reducing word to base form

In [15]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_narrative(text):
    token_list = text.split(" ")
    lemmatized_list = []
    for word in token_list:
        lemmatized_list.append(lemmatizer.lemmatize(word))
    lemmatized_sentence = " ".join(lemmatized_list)
    return lemmatized_sentence
df['LEM_EPISODE_NARRATIVE'] = df['TOKEN_EPISODE_NARRATIVE'].apply(str).apply(lemmatize_narrative)
df['LEM_EVENT_NARRATIVE'] = df['TOKEN_EVENT_NARRATIVE'].apply(str).apply(lemmatize_narrative)
df[['LEM_EPISODE_NARRATIVE', 'LEM_EVENT_NARRATIVE']].head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emmab\AppData\Roaming\nltk_data...


Unnamed: 0,LEM_EPISODE_NARRATIVE,LEM_EVENT_NARRATIVE
0,unusually intense squall line feeding unseason...,
1,Intense thunderstorm moved northeast half Wash...,
2,rapidly developing thunderstorm knocked numero...,
3,Feeder band torrential tropical rain associate...,
4,Several road briefly closed torrential rain as...,


<a id ="sec4"></a>
## **Classifier Building**

### Corpus Creation

In [16]:
#take spreadsheet of tags and their key words and generate a list/corpus for every tag
import csv
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')


# Helper: Convert NLTK POS tag to WordNet POS
def get_wordnet_pos(word):
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)  # default to noun

lemmatizer2 = WordNetLemmatizer()
with open('tag_corpus.csv','r') as inputFile:
    data = csv.DictReader(inputFile)
    tag_corpus = {row['Tag']: row['Similar Words'].split(',') for row in data}
    for tag in tag_corpus:
        syns_list = tag_corpus[tag]
        lemmatized_list = []
        for syn,pos in zip(syns_list, pos_tag(syns_list)):
            lemmatized_syn = lemmatizer2.lemmatize(syn, pos)
            lemmatized_list.append(lemmatized_syn)
            print(lemmatized_syn)


        """for i in range(len(tag_corpus[tag])):
            tag_corpus[tag][i] = lemmatizer2.lemmatize(tag_corpus[tag][i])
            print(tag_corpus[tag][i])"""
        """for i in range(len(tag_corpus[tag])):
            word = tag_corpus[tag][i].strip()
            pos = get_wordnet_pos(word)
            tag_corpus[tag][i] = lemmatizer.lemmatize(word, pos)
            print(tag_corpus[tag][i])"""
#print(tag_corpus)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emmab\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\emmab\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


KeyError: ('death', 'NN')

### Create Function that adds these tags

In [None]:
def label_tag(text, tag): 
    for syns in tag_corpus[tag]: #the list in the tag 
        if syns in text: #if the text is in the list
            return 1
    return 0

# apply the label_tag function to each row in the dataframe for each tag
for tag in tag_corpus:
    df[tag] = df["STEM_EPISODE_NARRATIVE"].apply(lambda text: label_tag(text, tag))

Unnamed: 0,death,injury,evacuation,rescue,car_crash,home_damage,infrastructure_damage,soft_infrastructure_damage,road_closure,power_outage,tree_damage,vehicle_loss,agricultural_damage,campground_damage,nor_easter,thunderstorm,hurricane,tornado,lightning,mudslide
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
