<a href="https://colab.research.google.com/github/MatthewSchofield25/Weather-Emergency-Application/blob/main/CS4485_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Main Steps:
# Data Preprocessing: Tokenize data and remove punctuation. Don't forget to .lower
# NLP: Process Data; Detect frequency of words, n-grams (BOW), TF-IDF, possibly Glove Vectors.
# Model: We can use different models and test their accuracies. LTSM possibly

### DATA PREPROCESSING ###

### DATA IS MISSING. MUST INPUT 'test.csv' AND 'train.csv' MANUALLY. ###

import pandas as pd
import numpy as np

import spacy
from geopy.geocoders import Nominatim

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer

# Sentiment analysis
nltk.download("vader_lexicon")
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score

# Labels on the data
#id : A unique identifier for each tweet.
#keyword : A particular keyword from the tweet (may be blank).
#location: The location the tweet was sent from (may be blank).
#text : The text of the tweet.
#target : This denotes whether a tweet is about a real disaster (1) or not (0).

common_words = ['via','like','build','get','would','one','two','feel','lol','fuck','take','way','may','first','latest'
                'want','make','back','see','know','let','look','come','got','still','say','think','great','pleas','amp']

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

ps = PorterStemmer()
lm = WordNetLemmatizer()

X = train.drop(columns=["target"],axis=1)
y = train["target"]

# Load the SpaCy NLP model
nlp = spacy.load('en_core_web_sm')

# Initialize the GeoPy geocoder
geolocator = Nominatim(user_agent='project_app')

def text_cleaning(data):
    return ' '.join(i for i in data.split() if i not in common_words)

def preprocess_data(data):
    '''
    Input: Data to be cleaned.
    Output: Cleaned Data.

    '''
    review =re.sub(r'https?://\S+|www\.\S+|http?://\S+',' ',data) #removal of url
    review =re.sub(r'<.*?>',' ',review) #removal of html tags
    review = re.sub("["
                           u"\U0001F600-\U0001F64F"  # removal of emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+",' ',review)
    review = re.sub('[^a-zA-Z]',' ',review) # filtering out miscellaneous text.
    review = review.lower() # Lowering all the words in text
    review = review.split() # split into a list of words
    review = [lm.lemmatize(words) for words in review if words not in stopwords.words('english')] # Turn words into their stems/roots
    review = [i for i in review if len(i)>2] # Removal of words with length<2
    review = ' '.join(review) # Put back to single string with a space separator
    return review

def sentiment_ana(data):
    sid_obj = SentimentIntensityAnalyzer()
    sentiment_dict = sid_obj.polarity_scores(data)
    return sentiment_dict['compound']

def sentiment_ana_label(data):
    sid_obj = SentimentIntensityAnalyzer()
    sentiment_dict = sid_obj.polarity_scores(data)
    compound_score = sentiment_dict['compound']
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Define a function to extract location names from a text using SpaCy NER
def extract_locations(text):
    doc = nlp(text)
    #print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])
    return [ent.text for ent in doc.ents if ent.label_ in ['LOC', 'GPE']]

def top_ngrams(data,n,grams):

    if grams == 1:
        count_vec = CountVectorizer(ngram_range=(1,1)).fit(data)
        bow = count_vec.transform(data)
        add_words = bow.sum(axis=0)
        word_freq = [(word, add_words[0, idx]) for word, idx in count_vec.vocabulary_.items()]
        word_freq = sorted(word_freq, key = lambda x: x[1], reverse=True)
    elif grams == 2:
        count_vec = CountVectorizer(ngram_range=(2,2)).fit(data)
        bow = count_vec.transform(data)
        add_words = bow.sum(axis=0)
        word_freq = [(word,add_words[0,idx]) for word,idx in count_vec.vocabulary_.items()]
        word_freq = sorted(word_freq, key = lambda x: x[1], reverse=True)
    elif grams == 3:
        count_vec = CountVectorizer(ngram_range=(3,3)).fit(data)
        bow = count_vec.transform(data)
        add_words = bow.sum(axis=0)
        word_freq = [(word,add_words[0,idx]) for word,idx in count_vec.vocabulary_.items()]
        word_freq = sorted(word_freq, key = lambda x: x[1], reverse=True)

    return word_freq[:n]

train["Cleaned_text"] = train["text"].apply(preprocess_data)
test["Cleaned_text"] = test["text"].apply(preprocess_data)

# Find common words and get rid of words that are unneeded
train["Cleaned_text"] = train["Cleaned_text"].apply(text_cleaning)
test["Cleaned_text"] = test["Cleaned_text"].apply(text_cleaning)

train["Sentiment"] = train["text"].apply(sentiment_ana)
test["Sentiment"] = test["text"].apply(sentiment_ana)

train["Sentiment_Label"] = train["text"].apply(sentiment_ana_label)
test["Sentiment_Label"] = test["text"].apply(sentiment_ana_label)

train["Location_Test"] = train["text"].apply(extract_locations)


common_words_uni = top_ngrams(train["Cleaned_text"],20,1)
common_words_bi = top_ngrams(train["Cleaned_text"],20,2)
common_words_tri = top_ngrams(train["Cleaned_text"],20,3)

print(common_words_uni)
print(common_words_bi)
print(common_words_tri)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


[('fire', 356), ('new', 228), ('news', 213), ('people', 201), ('time', 181), ('year', 178), ('video', 175), ('disaster', 162), ('emergency', 159), ('body', 155), ('day', 151), ('home', 144), ('police', 143), ('building', 141), ('life', 132), ('family', 132), ('storm', 128), ('crash', 125), ('california', 121), ('burning', 121)]
[('suicide bomber', 60), ('burning building', 59), ('body bag', 51), ('youtube video', 43), ('liked youtube', 42), ('northern california', 41), ('cross body', 40), ('oil spill', 39), ('suicide bombing', 36), ('california wildfire', 35), ('year old', 35), ('mass murder', 33), ('heat wave', 31), ('full read', 31), ('natural disaster', 31), ('mass murderer', 31), ('forest fire', 30), ('prebreak best', 30), ('bomber detonated', 30), ('home razed', 29)]
[('liked youtube video', 42), ('suicide bomber detonated', 30), ('northern california wildfire', 29), ('latest home razed', 28), ('home razed northern', 28), ('pkk suicide bomber', 28), ('bomber detonated bomb', 28), 

In [None]:
train.head(50)

Unnamed: 0,id,keyword,location,text,target,Cleaned_text,Sentiment,Sentiment_Label,Location_Test
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive,0.2732,positive,[]
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near ronge sask canada,-0.34,negative,[Canada]
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...,-0.296,negative,[]
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...,0.0,neutral,[California]
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,sent photo ruby alaska smoke wildfire pours sc...,0.0,neutral,"[Ruby, Alaska]"
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,rockyfire update california hwy closed directi...,-0.34,negative,[Lake County]
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,flood disaster heavy rain cause flash flooding...,0.0,neutral,"[Manitou, Colorado Springs]"
7,13,,,I'm on top of the hill and I can see a fire in...,1,top hill fire wood,-0.1531,negative,[]
8,14,,,There's an emergency evacuation happening now ...,1,emergency evacuation happening building across...,-0.3818,negative,[]
9,15,,,I'm afraid that the tornado is coming to our a...,1,afraid tornado coming area,0.0,neutral,[]


In [4]:
### CAN DETECT DISASTER TYPE ### Not a predictive model ###

# Define a dictionary of disaster types (change later)
disaster_keywords = {
    'earthquake': ['earthquake', '#earthquake'],
    'flood': ['flood', '#flood'],
    'fire': ['fire', '#fire'],
    'storm': ['storm', '#storm'],
    'hurricane': ['hurricane', '#hurricane'],
    'tornado': ['tornado', '#tornado'],
    'tsunami': ['tsunami', '#tsunami'],
    'wildfire': ['wildfire', '#wildfire'],
    'drought': ['drought', '#drought'],
    'avalanche': ['avalanche', '#avalanche'],
    # Add more disaster types as needed
}

def get_disaster_type(text):
    """ Return the type of disaster based on the tweet's content """
    for disaster, keywords in disaster_keywords.items():
        for keyword in keywords:
            if keyword.lower() in text.lower():
                return disaster
    return 'other'  # Default if no match

# Add a column for disaster type
train['disaster_type'] = train['text'].apply(get_disaster_type)

# Convert the disaster_type column to numeric labels for multi-class classification
disaster_types = train['disaster_type'].unique()
disaster_type_dict = {disaster: idx for idx, disaster in enumerate(disaster_types)}
train['disaster_type_label'] = train['disaster_type'].map(disaster_type_dict)

train.head(100)


Unnamed: 0,id,keyword,location,text,target,Cleaned_text,Sentiment,Sentiment_Label,Location_Test,disaster_type,disaster_type_label
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake allah forgive,0.2732,positive,[],earthquake,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near ronge sask canada,-0.3400,negative,[Canada],fire,1
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...,-0.2960,negative,[],other,2
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...,0.0000,neutral,[California],fire,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,sent photo ruby alaska smoke wildfire pours sc...,0.0000,neutral,"[Ruby, Alaska]",fire,1
...,...,...,...,...,...,...,...,...,...,...,...
95,137,accident,Charlotte,9 Mile backup on I-77 South...accident blockin...,1,mile backup south accident blocking right lane...,-0.3818,negative,"[NC, NC, NC]",other,2
96,138,accident,"Baton Rouge, LA",Has an accident changed your life? We will hel...,0,accident changed life help determine option fi...,0.6705,positive,[],other,2
97,139,accident,"Hagerstown, MD",#BREAKING: there was a deadly motorcycle car a...,1,breaking deadly motorcycle car accident happen...,-0.4767,negative,[Hagerstown],other,2
98,141,accident,"Gloucestershire , UK",@flowri were you marinading it or was it an ac...,0,flowri marinading accident,-0.4767,negative,[@flowri],other,2


In [6]:
### TF_IDF AND LSTM ###

## ACCURACY IS INCONSISTENT ##

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.layers import Embedding,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM,Bidirectional,GRU,MaxPooling1D,Conv1D
from tensorflow.keras.layers import Dense
from keras.optimizers import Adam,SGD
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import *
n_epoch = 30

def encoding(train_data,test_data):
    tfidf = TfidfVectorizer(
          ngram_range=(1, 1), use_idf=True, smooth_idf=True, sublinear_tf=True
    )
    tf_df_train = tfidf.fit_transform(train_data).toarray()
    train_df = pd.DataFrame(tf_df_train,columns=tfidf.get_feature_names_out())
    tf_df_test = tfidf.transform(test_data).toarray()
    test_df = pd.DataFrame(tf_df_test,columns=tfidf.get_feature_names_out())

    return train_df,test_df

x_final,x_test_final = encoding(train["Cleaned_text"],test["Cleaned_text"])
y_final = np.array(y)

x_final.shape,y_final.shape,x_test_final.shape

# Dividing the data into training, validation and testing
from sklearn.model_selection import train_test_split
# for bow and tf-idf
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.1, random_state=42, stratify = y_final)
X_train, x_valid, Y_train, y_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=42, stratify = y_train)
x_test_final = x_test_final

embedding_feature_vector = 200 # Since we used glove vector embedding of dim 200.
#model = Sequential()
#model.add(Embedding(vocab_size,embedding_feature_vector,input_length=max_length,weights = [word_vector_matrix], trainable = False))
model = Sequential()
model.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))  # Input layer with TF-IDF features
model.add(Dropout(0.35))  # Dropout layer for regularization
model.add(Dense(128, activation='relu'))  # Hidden layer
model.add(Dropout(0.35))  # Dropout layer for regularization
model.add(Dense(32, activation='relu'))  # Hidden layer
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1,
                           mode='min', restore_best_weights=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5,
                              verbose=1, mode='min')

history = model.fit(X_train,Y_train,validation_data=(x_valid,y_valid),callbacks=[reduce_lr,early_stop],epochs=n_epoch,batch_size= 64)

predictions = model.predict(x_valid)

# Convert probabilities to binary values (0 or 1)
binary_predictions = (predictions > 0.5).astype(int)

# Print the first 10 predictions
print("First 10 predictions on validation set (1 = disaster, 0 = not disaster):")
print(binary_predictions[:10])

# If you want to see the actual prediction probabilities (between 0 and 1)
print("First 10 raw prediction probabilities:")
print(predictions[:10])

accuracy = accuracy_score(y_valid, binary_predictions)

# Output the overall accuracy
print(f"Validation accuracy: {accuracy * 100:.2f}%")

predictions = model.predict(x_test_final)

# Convert probabilities to binary values (0 or 1)
binary_predictions = (predictions > 0.5).astype(int)

print("First 10 predictions on test set (1 = disaster, 0 = not disaster):")
print(binary_predictions[:10])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None
Epoch 1/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 67ms/step - accuracy: 0.6042 - loss: 0.6453 - val_accuracy: 0.8134 - val_loss: 0.4311 - learning_rate: 0.0010
Epoch 2/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 65ms/step - accuracy: 0.8954 - loss: 0.2760 - val_accuracy: 0.8090 - val_loss: 0.4698 - learning_rate: 0.0010
Epoch 3/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 61ms/step - accuracy: 0.9563 - loss: 0.1366 - val_accuracy: 0.7974 - val_loss: 0.6058 - learning_rate: 0.0010
Epoch 4/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 70ms/step - accuracy: 0.9763 - loss: 0.0714 - val_accuracy: 0.7901 - val_loss: 0.6902 - learning_rate: 0.0010
Epoch 5/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 74ms/step - accuracy: 0.9803 - loss: 0.0534 - val_accuracy: 0.7638 - val_loss: 0.7732 - learning_rate: 0.0010
Epoch 6/30
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [