# NLP Tensorflow Project

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

In [2]:
# Read the CSV file into a DataFrame
df = pd.read_csv('train.csv')

# Print the first few rows of the DataFrame (head)
print("First few rows of the DataFrame:")
print(df.head())

# Print the shape of the DataFrame
print("\nDataFrame shape:")
print(df.shape)

# Print the number of entries labeled as 'Disaster' (target == 1)
print("\nNumber of entries labeled as 'Disaster':")
print((df.target == 1).sum())

# Print the number of entries labeled as 'No Disaster' (target == 0)
print("\nNumber of entries labeled as 'No Disaster':")
print((df.target == 0).sum())

First few rows of the DataFrame:
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  

DataFrame shape:
(7613, 5)

Number of entries labeled as 'Disaster':
3271

Number of entries labeled as 'No Disaster':
4342


In [3]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Convert text in 'text' column to lowercase
df['text'] = df['text'].apply(lambda x: x.lower())

# Define preprocessing functions
def remove_URL(text):
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

def remove_punct(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.isalnum()]
    return ' '.join(filtered_words)
    # Define a function to remove stopwords from text

def remove_stopwords(text):
    # Tokenize the text into words
    words = text.split()
    # Remove stopwords
    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]
    # Join the filtered words back into a string
    return ' '.join(filtered_words)


# 'text' is the column containing text data in your DataFrame 'df'
df["text"] = df.text.map(remove_URL) # map(lambda x: remove_URL(x))
df["text"] = df.text.map(remove_punct)
df["text"] = df.text.map(remove_stopwords)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
# Function to count unique words in a text column
def counter_word(text_col):
    # Initialize a Counter object to count word occurrences
    count = Counter()
    # Iterate over each text entry in the text column
    for text in text_col.values:
        # Split the text into words and update the counter
        for word in text.split():
            count[word] += 1
    return count

# Call the counter_word function with the 'text' column of DataFrame 'df'
counter = counter_word(df['text'])

# Print the total number of unique words
print("Total number of unique words:", len(counter))

# Print the resulting word count dictionary
print("\nCounter:", counter)

# Print the most common words and their frequencies
print("\nMost Common words:", counter.most_common(5))

Total number of unique words: 15950


Most Common words: [('like', 346), ('amp', 344), ('fire', 249), ('get', 228), ('new', 223)]


In [5]:
# Define features (text) and labels
X = df['text'].values
y = df['target'].values

# Split the dataset into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(X, y,
                                                                       test_size=0.2, random_state=42)

train_sentences.shape, val_sentences.shape

((6090,), (1523,))

In [6]:
# Calculate the number of unique words from the counter
num_unique_words = len(counter)

# Initialize the Tokenizer with the number of unique words
tokenizer = Tokenizer(num_words=num_unique_words)

# Fit the Tokenizer on the training sentences
tokenizer.fit_on_texts(train_sentences)

# Obtain the word index dictionary from the Tokenizer
word_index = tokenizer.word_index

# Print the word index dictionary
print(word_index)



In [7]:
# Tokenize text data in the training and validation sets
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)

# Print a subset of original sentences from the training set
print("Subset of original sentences from the training set:")
print(train_sentences[10:15])

# Print the corresponding sequences of integers for the subset of sentences
print("\nCorresponding sequences of integers:")
print(train_sequences[10:15])

Subset of original sentences from the training set:
['one direction pick fan army directioners x1402'
 'inws alert spsgsp pm pm edt pickens county strong thunderstorm impact'
 'sadly windows 10 reveals microsoft ethics armageddon'
 'aurora theater shooting trial gunman expected notoriety mass murder nothing else'
 'officials say quarantine place birmingham home possible ebola case edward khan']

Corresponding sequences of integers:
[[9, 992, 653, 536, 101, 1606, 5375], [5376, 1845, 5377, 90, 90, 1607, 3575, 303, 1079, 189, 1608], [1302, 1846, 626, 2190, 2191, 5378, 414], [5379, 859, 415, 1847, 1303, 993, 5380, 83, 207, 441, 654], [442, 62, 443, 567, 2738, 70, 359, 734, 627, 3576, 3577]]


In [8]:
# Define the maximum length for sequences
max_length = 20

# Pad sequences for training and validation sets
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")

# Print the shapes of padded sequences
print("Shape of padded training sequences:", train_padded.shape)
print("Shape of padded validation sequences:", val_padded.shape)

# Print an example of a padded training sequence
print("\nExample of a padded training sequence:")
print(train_padded[10])

# Print corresponding original sentence, its sequence, and padded sequence
print("\nOriginal sentence:")
print(train_sentences[10])
print("\nSequence representation:")
print(train_sequences[10])
print("\nPadded sequence:")
print(train_padded[10])

Shape of padded training sequences: (6090, 20)
Shape of padded validation sequences: (1523, 20)

Example of a padded training sequence:
[   9  992  653  536  101 1606 5375    0    0    0    0    0    0    0
    0    0    0    0    0    0]

Original sentence:
one direction pick fan army directioners x1402

Sequence representation:
[9, 992, 653, 536, 101, 1606, 5375]

Padded sequence:
[   9  992  653  536  101 1606 5375    0    0    0    0    0    0    0
    0    0    0    0    0    0]


In [9]:
# Check reversing the indices

# Create a dictionary to map integer indices to words
reverse_word_index = dict([(idx, word) for (word, idx) in word_index.items()])

# Print the reversed word index
print("Reversed word index:", reverse_word_index)



In [10]:
def decode(sequence):
    # Use list comprehension to map integer indices to words using reverse_word_index dictionary
    # If the index is not found in the reverse_word_index, replace it with "?"
    return " ".join([reverse_word_index.get(idx, "?") for idx in sequence])

# Decode the sequence of indices back to text
decoded_text = decode(train_sequences[10])

# Print the original sequence of indices and the decoded text
print("Original sequence of indices:")
print(train_sequences[10])
print("\nDecoded text:")
print(decoded_text)

Original sequence of indices:
[9, 992, 653, 536, 101, 1606, 5375]

Decoded text:
one direction pick fan army directioners x1402


In [11]:
from tensorflow.keras import layers

# Create a Sequential model
model = keras.models.Sequential()

# Add an Embedding layer to the model
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
# Add an LSTM layer to the model
model.add(layers.LSTM(64, dropout=0.1))
# Add a Dense layer to the model
model.add(layers.Dense(1, activation="sigmoid"))

# Print a summary of the model architecture
model.summary()



In [13]:
# Define the loss function
loss = keras.losses.BinaryCrossentropy(from_logits=False)

# Define the optimizer
optim = keras.optimizers.Adam(learning_rate=0.001)

# Define evaluation metrics
metrics = ["accuracy"]

# Compile the model
model.compile(loss=loss, optimizer=optim, metrics=metrics)

In [14]:
# Train the model
model.fit(
    train_padded,            # Training data: padded sequences
    train_labels,            # Training labels
    epochs=20,               # Number of training epochs
    validation_data=(val_padded, val_labels),  # Validation data for evaluation during training
    verbose=2                # Verbosity mode: 0 (silent), 1 (progress bar), 2 (one line per epoch)
)

Epoch 1/20
191/191 - 9s - 46ms/step - accuracy: 0.7051 - loss: 0.5448 - val_accuracy: 0.8011 - val_loss: 0.4580
Epoch 2/20
191/191 - 3s - 18ms/step - accuracy: 0.8798 - loss: 0.3097 - val_accuracy: 0.7787 - val_loss: 0.5006
Epoch 3/20
191/191 - 3s - 17ms/step - accuracy: 0.9379 - loss: 0.1799 - val_accuracy: 0.7695 - val_loss: 0.6423
Epoch 4/20
191/191 - 4s - 22ms/step - accuracy: 0.9596 - loss: 0.1265 - val_accuracy: 0.7794 - val_loss: 0.5712
Epoch 5/20
191/191 - 3s - 16ms/step - accuracy: 0.9711 - loss: 0.0951 - val_accuracy: 0.7715 - val_loss: 0.9540
Epoch 6/20
191/191 - 5s - 26ms/step - accuracy: 0.9759 - loss: 0.0812 - val_accuracy: 0.7669 - val_loss: 0.7914
Epoch 7/20
191/191 - 4s - 19ms/step - accuracy: 0.9762 - loss: 0.0701 - val_accuracy: 0.7617 - val_loss: 0.9192
Epoch 8/20
191/191 - 4s - 21ms/step - accuracy: 0.9800 - loss: 0.0566 - val_accuracy: 0.7735 - val_loss: 0.7390
Epoch 9/20
191/191 - 5s - 26ms/step - accuracy: 0.9801 - loss: 0.0523 - val_accuracy: 0.7452 - val_loss:

<keras.src.callbacks.history.History at 0x799505d30cd0>

In [15]:
# Make predictions using the trained model on the padded training sequences
predictions = model.predict(train_padded)

# Convert predicted probabilities to binary labels using a threshold of 0.5
predictions = [1 if p > 0.5 else 0 for p in predictions]

[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [16]:
# Print a subset of training sentences to inspect the original text data
print("Subset of Training Sentences:")
print(train_sentences[:10])

# Print a subset of true labels to inspect the ground truth
print("\nSubset of True Labels:")
print(train_labels[:10])

# Print a subset of predicted labels to inspect the model's binary predictions
print("\nSubset of Predicted Labels:")
print(predictions[:10])

Subset of Training Sentences:
['courageous honest analysis need use atomic bomb 1945 hiroshima70 japanese military refused surrender'
 'zachzaidman 670thescore wld b shame golf cart became engulfed flames boycottbears'
 'tell barackobama rescind medals given us soldiers massacre wounded knee sign amp rt'
 'worried ca drought might affect extreme weather dampen economy'
 'youngheroesid lava blast amp power red pantherattack jamilazzaini alifaditha'
 'wreckage confirmed mh370 malaysia pm investigators families'
 'builder dental emergency ruined plan emotionally blackmail afternoon bump'
 'bmx issues areal flood advisory shelby al till aug 5 pm cdt'
 '360wisenews china stock market crash gems rubble'
 'robertoneill31 getting hit foul ball sitting hardly freak accident war zone']

Subset of True Labels:
[1 0 1 1 0 1 1 1 1 0]

Subset of Predicted Labels:
[1, 0, 1, 1, 0, 1, 1, 1, 0, 0]
