In [None]:
# Import necessary libraries
import pickle

import pandas as pd
import spacy
from google.colab import drive
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Attach Google Drive to the runtime; the dataset is read from it and the model is written back to it
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Read the tweets CSV stored on the mounted Drive into a DataFrame
path = "/content/drive/MyDrive/ML_Project/tweets.csv"
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Keep only the rows whose 'text' value is present; rows with missing text are discarded
df = df.loc[df['text'].notna()]

In [None]:
# Build the word index from the tweet texts
tokenizer = Tokenizer()  # Tokenizer created with its default settings
tokenizer.fit_on_texts(df['text'])  # Assign an integer index to every word seen in the texts

# Vocabulary size handed to the embedding layer: number of indexed words plus one
total_words = 1 + len(tokenizer.word_index)

In [None]:
# Encode every tweet as a list of word indices, then pad them all to one common length
sequences = tokenizer.texts_to_sequences(texts=df['text'])
padded_sequences = pad_sequences(sequences=sequences)

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    df['target'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Define the LSTM-based binary classifier: token ids -> embeddings -> LSTM -> sigmoid score
embedding_dim = 50  # Dimension of the word embeddings
max_sequence_length = padded_sequences.shape[1]  # Length shared by every padded sequence

model = Sequential()
# Declare the input shape with an explicit Input layer instead of Embedding's
# `input_length` argument: that argument is deprecated in Keras 3 (it only
# triggers a warning there), whereas an Input layer is accepted by both Keras 2
# and Keras 3 and lets the model be built before training starts.
model.add(Input(shape=(max_sequence_length,)))
model.add(Embedding(input_dim=total_words, output_dim=embedding_dim))
model.add(LSTM(100))  # Add an LSTM layer with 100 units
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy tracked as the metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Fit for 5 epochs, passing the held-out split as validation data
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
)

In [None]:
# Measure loss and accuracy on the held-out split and report accuracy as a percentage
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Classify one hand-written sentence with the trained model
test_sentence = ["No cows today but our local factory is sadly still ablaze"]

# Encode with the fitted tokenizer, then pad to the same length as the training sequences
test_sequences = tokenizer.texts_to_sequences(test_sentence)
padded_test_sequences = pad_sequences(test_sequences, maxlen=padded_sequences.shape[1])

# Score the sentence; predictions[0, 0] is the model output for the only input row
predictions = model.predict(padded_test_sequences)

# A score at or above the threshold is reported as a disaster
threshold = 0.5
is_disaster = predictions[0, 0] >= threshold
if not is_disaster:
    print(f'Test sentence: "{test_sentence[0]}" is not a disaster.')
else:
    print(f'Test sentence: "{test_sentence[0]}" is a disaster.')

Test sentence: "No cows today but our local factory is sadly still ablaze" is a disaster.


In [None]:
# Pull place names out of a sentence with spaCy's named-entity recognition
test_sentence = "There is a cyclone in Florida"
nlp = spacy.load('en_core_web_sm')  # spaCy English pipeline, loaded by name
doc = nlp(test_sentence)  # Run the pipeline over the sentence

# Collect the text of every entity labelled GPE (geo-political entity)
locations = []
for ent in doc.ents:
    if ent.label_ == 'GPE':
        locations.append(ent.text)
print("Disaster Locations:", locations)

Disaster Locations: ['Florida']


In [None]:
# Save the trained LSTM model using pickle (same file as before, so existing consumers keep working).
# NOTE(review): Keras documents `model.save(...)` as its native persistence API; consider
# migrating once nothing depends on lstm.pkl. Only unpickle these files from trusted sources.
with open('/content/drive/MyDrive/ML_Project/lstm.pkl', 'wb') as f:
    pickle.dump(model, f)

# The model alone cannot score new text: inference also needs the fitted tokenizer
# (word -> index mapping) and the padding length used during training. Persist both
# next to the model so a fresh session can reproduce the preprocessing.
preprocessing = {
    'tokenizer': tokenizer,
    'max_sequence_length': int(padded_sequences.shape[1]),
}
with open('/content/drive/MyDrive/ML_Project/lstm_preprocessing.pkl', 'wb') as f:
    pickle.dump(preprocessing, f)