In [122]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# Deep learning
import tqdm
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau


from sklearn.metrics import (
    precision_score, 
    recall_score, 
    f1_score, 
    classification_report,
    accuracy_score,
    confusion_matrix, 
    metrics
)
from sklearn.model_selection import train_test_split

# Utility
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)



ImportError: cannot import name 'metrics' from 'sklearn.metrics' (C:\Users\micha\AppData\Roaming\Python\Python312\site-packages\sklearn\metrics\__init__.py)

In [71]:
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    print("Downloading required NLTK data...")
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    print("Download complete!")

In [72]:
# First we want to crack into the data and see what we're working with
train_data_frame = pd.read_csv('train.csv')

train_data_frame.head()


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


With this in mind lets check back in on the purpose of the Kaggle Competition.</br>
</br>
https://www.kaggle.com/c/nlp-getting-started/overview</br>
</br>
In this competition, you're challenged to build a machine learning model that predicts which Tweets are about real disasters and which one's aren't. You'll have access to a dataset of 10,000 tweets that were hand classified. If this is your first time working on an NLP problem, we've created a quick tutorial to get you up and running.</br>
</br>
Disclaimer: The dataset for this competition contains text that may be considered profane, vulgar, or offensive.</br>
</br>
Now we want to state the class guidelines:</br>
</br>
Deliverable 1</br>
</br>
A Jupyter notebook with a description of the problem/data, exploratory data analysis (EDA) procedure, analysis (model building and training), result, and discussion/conclusion.</br>
</br>
Suppose your work becomes so large that it doesn't fit into one notebook (or you think it will be less readable by having one large notebook). In that case, you can make several notebooks or scripts in a GitHub repository (as deliverable 3) and submit a report-style notebook or pdf instead.</br>
</br>
If your project doesn't fit into Jupyter notebook format (E.g., you built an app that uses ML), write your approach as a report and submit it in a pdf form.</br>
</br>
Deliverable 2</br>
</br>
A public project GitHub repository with your work (please also include the GitHub repo URL in your notebook/report).</br>
</br>
Deliverable 3</br>
</br>
A screenshot of your position on the Kaggle competition leaderboard for your top-performing model.</br>
</br>
Step Breakdown:</br>
</br>
Step 1</br>
Brief description of the problem and data (5 pts)</br>
</br>
Step 2</br>
Exploratory Data Analysis (EDA) — Inspect, Visualize and Clean the Data (15 pts)</br>
</br>
Step 3</br>
Model Architecture (25 pts)</br>
</br>
Step 4</br>
Results and Analysis (35 pts)</br>
</br>
Step 5</br>
Discussion and Conclusion (15 pts)</br>
</br>
Extra Rules described above in the deliverables section</br>
Produce Deliverables: High-Quality, Organized Jupyter Notebook Report, GitHub Repository, and screenshot of Kaggle leaderboard (30 points)


For my approach I am going to do some simple Data exploration and make sure everythings all clean. Then we'll clean out the filler words and run the standard data statistics on our variables. Then we'll construct a word cloud for an extra layer of analysis.
</br>

Next we'll split up our data and then prepare for oru LSTM model. Once this is finished we will explore a TF-IDF model and see how it compares.


In [73]:
# quick clean
print(len(train_data_frame))
# 1. remove duplicates
train_data_frame = train_data_frame.drop_duplicates()
print(len(train_data_frame))
# 2. remove empty rows
train_data_frame = train_data_frame[train_data_frame['text'].notna()]
print(len(train_data_frame))

7613
7613
7613


From this point it looks like the data matches the implied level of quality infered by the Kaggle Contest page and the data sets 'data card'.
Next I want to address the two rows displayed above in the head(). The 'keyword' and 'location' columns display NaN, and this could complicate things. Lets chek and see if any data is present. 



In [74]:
# check the distribution of values in the keyword column
kayword_counts = pd.DataFrame(train_data_frame['keyword'].value_counts())
kayword_counts = kayword_counts.sort_values(by='count', ascending=False)
print(kayword_counts.head(10))

# check the distribution of values in the location column
location_counts = pd.DataFrame(train_data_frame['location'].value_counts())
location_counts = location_counts.sort_values(by='count', ascending=False)
print(location_counts.head(10))

# percentage of missing values of rows in 'keyword' and 'location' columns 
print((221/len(train_data_frame))*100, '% of rows contain values in the keyword column')
print((3341/len(train_data_frame))*100, '% of rows contain values in the location column')


             count
keyword           
fatalities      45
deluge          42
armageddon      42
damage          41
body%20bags     41
harm            41
sinking         41
evacuate        40
outbreak        40
fear            40
                 count
location              
USA                104
New York            71
United States       50
London              45
Canada              29
Nigeria             28
UK                  27
Los Angeles, CA     26
India               24
Mumbai              22
2.902929200052542 % of rows contain values in the keyword column
43.885459083147246 % of rows contain values in the location column


As we can see there is some issues with the fact that only 2% of the rows contain values in the keyword column. 

In [75]:
#

Cleaning text

In [76]:
test_data_frame = pd.read_csv('test.csv')
test_data_frame.head()


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [77]:
# function to process and clean tweets

def preprocess_data(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenize
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords and stem
    stemmer = nltk.stem.porter.PorterStemmer()
    text = ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)
    
    return text

In [78]:
# next we handle emojis
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


In [79]:
def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text

In [80]:
# helper function to display the mean number of words in a tweet in each row of the dataset
def mean_words(text):
    return len(text.split(' '))

# calculate the mean number of words in the dataset
train_data_frame['mean_words']=train_data_frame['text'].apply(lambda x: mean_words(x))
test_data_frame['mean_words']=test_data_frame['text'].apply(lambda x: mean_words(x))


In [81]:

# Download required NLTK data
nltk.download('punkt_tab')
train_data_frame['clean_text']=train_data_frame['text'].apply(lambda x: remove_emoji(x))

# remove stopwords  
train_data_frame['clean_text']=train_data_frame['clean_text'].apply(lambda x: preprocess_data(x))

# remove extra whitespace
train_data_frame['clean_text']=train_data_frame['clean_text'].apply(lambda x: re.sub(r'\s+', ' ', x))

train_data_frame.head()


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\micha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,id,keyword,location,text,target,mean_words,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,13,deed reason earthquak may allah forgiv us
1,4,,,Forest fire near La Ronge Sask. Canada,1,7,forest fire near la rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,22,resid ask shelter place notifi offic evacu she...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,9,peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,17,got sent photo rubi alaska smoke wildfir pour ...


In [82]:
test_data_frame['clean_text']=test_data_frame['text'].apply(lambda x: remove_emoji(x))

# remove stopwords  
test_data_frame['clean_text']=test_data_frame['clean_text'].apply(lambda x: preprocess_data(x))

# remove extra whitespace
test_data_frame['clean_text']=test_data_frame['clean_text'].apply(lambda x: re.sub(r'\s+', ' ', x))

test_data_frame.head()


Unnamed: 0,id,keyword,location,text,mean_words,clean_text
0,0,,,Just happened a terrible car crash,6,happen terribl car crash
1,2,,,"Heard about #earthquake is different cities, s...",9,heard earthquak differ citi stay safe everyon
2,3,,,"there is a forest fire at spot pond, geese are...",19,forest fire spot pond gees flee across street ...
3,9,,,Apocalypse lighting. #Spokane #wildfires,4,apocalyps light spokan wildfir
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,8,typhoon soudelor kill china taiwan


In [83]:
# next we remove the stopwords
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords


# remove stopwords  
train_data_frame['clean_text']=train_data_frame['clean_text'].apply(lambda x: remove_stopwords(x))
test_data_frame['clean_text']=test_data_frame['clean_text'].apply(lambda x: remove_stopwords(x))
train_data_frame.head()


Unnamed: 0,id,keyword,location,text,target,mean_words,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,13,deed reason earthquak may allah forgiv us
1,4,,,Forest fire near La Ronge Sask. Canada,1,7,forest fire near la rong sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,22,resid ask shelter place notifi offic evacu she...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,9,peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,17,got sent photo rubi alaska smoke wildfir pour ...


In [84]:
train_tweets = train_data_frame['clean_text'].values
test_tweets = test_data_frame['clean_text'].values
train_target = train_data_frame['target'].values

In [85]:
# Tokenize
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train_tweets)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

14094

In [97]:
def create_corpus(df):
    corpus=[]
    for tweet in tqdm.tqdm(df['clean_text']):
        words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop_words))]
        corpus.append(words)
    return corpus

In [98]:
train_corpus = create_corpus(pd.DataFrame({'clean_text': train_tweets}))


100%|██████████| 7613/7613 [00:01<00:00, 7192.90it/s] 


In [99]:
def embed(corpus): 
    return word_tokenizer.texts_to_sequences(corpus)

In [None]:
longest_train = max(train_tweets, key=lambda sentence: len(word_tokenize(sentence)))
length_long_sentence = len(word_tokenize(longest_train))
padded_sentences = pad_sequences(embed(train_tweets), length_long_sentence, padding='post')
test_sentences = pad_sequences(
    embed(test_tweets), 
    length_long_sentence,
    padding='post'
)

In [104]:
embeddings_dict = dict()
embedding_dim = 100
with open('G:/My Drive/Academia/MSDS/Machine Learning Specialization/DTSA5511 Deep Learning/glove.twitter.27B/glove.twitter.27B.100d.txt', encoding='utf-8') as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dict [word] = vector_dimensions


In [105]:
# Glove dictionary. Others will be initialized to 0.

embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.21063   , -0.010992  , -0.17552   , ..., -0.37547001,
         0.58029002,  0.16067   ],
       [ 0.066373  ,  1.09249997, -0.59674001, ...,  0.040076  ,
        -0.12083   , -0.1785    ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.20125   , -0.091671  ,  0.51243001, ..., -0.19316   ,
         0.33122   ,  0.25007999],
       [-0.076711  , -0.77710998, -0.75962001, ...,  0.23106   ,
         0.09527   , -0.15951   ]])

In [108]:
X_train, X_test, y_train, y_test = train_test_split(
    padded_sentences, 
    train_target, 
    test_size=0.25
)

In [113]:
def glove_lstm():
    model = Sequential()
    
    # Add embedding layer
    model.add(Embedding(
        vocab_length,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=length_long_sentence,
        trainable=False
    ))
    
    # Add LSTM layers
    model.add(LSTM(128, return_sequences=True, dropout=0.2))
    model.add(LSTM(64, dropout=0.2))
    
    # Add Dense layers
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [115]:
model = glove_lstm()
# ... existing code ...

# Update the checkpoint to save in .keras format (newer TensorFlow versions prefer this)
checkpoint = ModelCheckpoint(
    'disaster_tweets_model.keras',  # More descriptive name
    monitor='val_loss',
    verbose=1,
    save_best_only=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    verbose=1,
    patience=5,
    min_lr=0.001
)

# Add saving of training history
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[reduce_lr, checkpoint]
)

# Save training history to a file
import json
with open('training_history.json', 'w') as f:
    json.dump(history.history, f)

# Save the final model (in addition to checkpoints)
model.save('disaster_tweets_model_final.keras')

Epoch 1/10
[1m177/179[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.7041 - loss: 0.5839
Epoch 1: val_loss improved from inf to 0.48476, saving model to disaster_tweets_model.keras
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step - accuracy: 0.7049 - loss: 0.5832 - val_accuracy: 0.7810 - val_loss: 0.4848 - learning_rate: 0.0010
Epoch 2/10
[1m177/179[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 23ms/step - accuracy: 0.7720 - loss: 0.5118
Epoch 2: val_loss improved from 0.48476 to 0.47387, saving model to disaster_tweets_model.keras
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.7720 - loss: 0.5116 - val_accuracy: 0.7931 - val_loss: 0.4739 - learning_rate: 0.0010
Epoch 3/10
[1m178/179[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.7776 - loss: 0.4909
Epoch 3: val_loss improved from 0.47387 to 0.46769, saving model to disaster_tweets_model.keras


In [125]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Loss:', loss)
print('Accuracy:', accuracy)

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8006 - loss: 0.5159
Loss: 0.5117337703704834
Accuracy: 0.8014705777168274


In [127]:
submission = pd.read_csv('sample_submission.csv')


In [131]:
# Get predictions as probabilities
predictions = model.predict(test_sentences)
# Convert to binary (0 or 1) predictions using 0.5 threshold
submission.target = (predictions > 0.5).astype(int)
submission.to_csv("submission.csv", index=False)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


References

This one was a
https://www.kaggle.com/code/mariapushkareva/nlp-disaster-tweets-with-glove-and-lstm