# 1. Importing Libraries

In [None]:
pip install --update pandas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Fetch the tokenizer data and the stop-word list used during preprocessing.
# Recent NLTK releases (3.9 and later) make word_tokenize load 'punkt_tab'
# instead of the older pickled 'punkt' models, so request both to work on
# either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, classification_report

# 2. Importing Dataset

In [None]:
# Read the three pre-split CSV files (train / validation / test), in that
# order, into their own DataFrames.
df, df_val, df_test = map(
    pd.read_csv,
    (
        'disaster-train-consolidated.csv',
        'disaster-val-consolidated.csv',
        'disaster-test-consolidated.csv',
    ),
)

# 3. Preprocessing the Tweets

In [None]:
# Lazily-filled cache for the objects that are identical on every call.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return ``(stop_words, stemmer)``, building them on first use only."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Processing order:
      1. remove URLs (``http...``, ``https...``, ``www...``),
      2. remove ``@mentions`` and ``#hashtags`` (the whole token),
      3. lowercase,
      4. strip ASCII punctuation (``string.punctuation``),
      5. remove digit runs,
      6. tokenize with NLTK ``word_tokenize``,
      7. drop English stop words,
      8. Porter-stem the remaining tokens.

    Args:
        text: Raw tweet text. A value that is not a ``str`` (for example the
            NaN that pandas produces for an empty CSV cell) is treated as an
            empty tweet instead of raising inside ``re.sub``.

    Returns:
        The stemmed tokens joined by single spaces; ``""`` when nothing
        survives the filtering or the input was not a string.
    """
    if not isinstance(text, str):
        return ""

    # The stop-word set and the stemmer are shared across calls; rebuilding
    # them for every row of a DataFrame.apply is pure overhead.
    stop_words, stemmer = _preprocess_resources()

    # None of these patterns use ^ or $, so no regex flags are needed.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)

    tokens = word_tokenize(text)
    return " ".join(stemmer.stem(word) for word in tokens if word not in stop_words)

In [None]:
# Replace the raw tweet text of every split with its cleaned, stemmed form.
for frame in (df, df_val, df_test):
    frame['text'] = frame['text'].apply(preprocess_text)

# 4. TF-IDF and Data Assignment

In [None]:
# The TF-IDF vocabulary and weights are learned from the training tweets
# only; validation and test tweets are projected onto that fitted vocabulary.
tf_idf = TfidfVectorizer()
X_train = tf_idf.fit_transform(df['text'])
X_val, X_test = (tf_idf.transform(frame['text']) for frame in (df_val, df_test))

# Targets are the 'label' column of each split.
y_train, y_val, y_test = (frame['label'] for frame in (df, df_val, df_test))

In [None]:
# Count how often each label value occurs in every split.
train_counts = df['label'].value_counts()
val_counts = df_val['label'].value_counts()
test_counts = df_test['label'].value_counts()

# One pie chart per split, side by side: (counts, slice colours, title).
fig, axes = plt.subplots(1, 3, figsize=(15, 10))
pie_specs = [
    (train_counts, ['lightblue', 'green'], 'Training Set Label Distribution'),
    (val_counts, ['orange', 'red'], 'Validation Set Label Distribution'),
    (test_counts, ['yellow', 'purple'], 'Test Set Label Distribution'),
]
for ax, (counts, palette, title) in zip(axes, pie_specs):
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', colors=palette, startangle=180)
    ax.set_title(title)
plt.show()

# 5. Model Building

### 5.1 XGBoost

In [None]:
# Wrap each split's TF-IDF matrix and labels in the DMatrix container that
# the native XGBoost API (xgb.train / Booster.predict) operates on.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=target)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
# Booster settings handed to xgb.train: binary classification with a
# probability output, trees up to depth 6, learning rate 0.3, and log loss
# as the metric reported for the evaluation sets.
parameters = dict(
    objective='binary:logistic',
    max_depth=6,
    eta=0.3,
    eval_metric='logloss',
)

In [None]:
# Report the metric on both splits every round. XGBoost uses the last entry
# of the evaluation list ('val') for early stopping: training ends once it
# has not improved for 10 consecutive rounds, or after 100 rounds at most.
model_eval = [(dtrain, 'train'), (dval, 'val')]
bst = xgb.train(
    params=parameters,
    dtrain=dtrain,
    num_boost_round=100,
    evals=model_eval,
    early_stopping_rounds=10,
)

In [None]:
# Training used early stopping, so the booster may hold trees grown after
# the best validation round (whether predict() ignores them by default
# depends on the XGBoost version). Pin the prediction to the trees up to and
# including the best iteration so the test scores reflect the selected model.
y_predictions = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))

# Turn predicted probabilities into hard 0/1 labels at the 0.5 threshold.
y_pred = (y_predictions > 0.5).astype(int)
print("Accuracy Score for XGBoost: ", accuracy_score(y_test, y_pred))
print("F1-Score: ", f1_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

### 5.2 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 500 trees; class_weight='balanced' reweights the classes inversely to
# their frequency in y_train; the fixed seed keeps runs repeatable.
forest_settings = {
    'n_estimators': 500,
    'class_weight': 'balanced',
    'random_state': 25,
}
rfc = RandomForestClassifier(**forest_settings)
rfc.fit(X_train, y_train)

In [None]:
# Score the fitted forest on the test split.
y_pred = rfc.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
rf_f1 = f1_score(y_test, y_pred)
print("Accuracy of Random Forest: ", rf_accuracy)
print("F1_score: ", rf_f1)
print("Classification Report")
print(classification_report(y_test, y_pred))

### 5.3 Convolutional Neural Network

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Build the word -> integer index from the (already preprocessed) training
# tweets only; validation/test text is encoded with this same index later.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
word_index = tokenizer.word_index
# Number of distinct words seen in the training text.
vocabulary_size = len(word_index)
print(f"Vocabulary size: {vocabulary_size}")

In [None]:
# Encode every split's tweets as lists of word indices using the tokenizer
# fitted on the training text.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(frame['text'])
    for frame in (df, df_val, df_test)
)

In [None]:
# Zero-pad at the end of each sequence. Without maxlen, pad_sequences pads
# to the longest sequence of the list it is given, so padding each split
# independently can yield arrays of different widths. Use the training
# width for validation and test so all three arrays share one shape.
X_train_padded = pad_sequences(X_train_seq, padding='post')
train_width = X_train_padded.shape[1]
X_val_padded = pad_sequences(X_val_seq, padding='post', maxlen=train_width)
X_test_padded = pad_sequences(X_test_seq, padding='post', maxlen=train_width)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

In [None]:
# Token count of every encoded tweet, per split and pooled.
sequence_lengths_train = list(map(len, X_train_seq))
sequence_lengths_val = list(map(len, X_val_seq))
sequence_lengths_test = list(map(len, X_test_seq))
all_sequence_lengths = [
    *sequence_lengths_train,
    *sequence_lengths_val,
    *sequence_lengths_test,
]

# Histogram of the pooled lengths, used to choose a padding length.
plt.hist(all_sequence_lengths)
plt.title('Distribution of Sequence Lengths')
plt.xlabel('Sequence Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Fix the model input length at the 95th percentile of the pooled sequence
# lengths; int() drops any fractional part of the percentile.
percentile = np.percentile(all_sequence_lengths, 95)
print("95th percentile of sequence length: ", percentile)
maxlen = int(percentile)

# Re-pad every split to exactly maxlen columns (zeros appended at the end).
X_train_padded, X_val_padded, X_test_padded = (
    pad_sequences(seqs, padding='post', maxlen=maxlen)
    for seqs in (X_train_seq, X_val_seq, X_test_seq)
)

In [None]:
# 1D CNN: embedding -> one convolution (128 filters, window 5) -> global
# max pool -> two small dense layers -> single sigmoid output.
# input_dim is vocabulary_size + 1 because Tokenizer word indices start at 1
# and 0 is the padding value.
model = Sequential([
    Embedding(input_dim=vocabulary_size + 1, output_dim=120, input_length=maxlen),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(20, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam optimiser with binary cross-entropy loss (matches the single sigmoid
# output); accuracy is tracked as the reported metric.
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Train for 20 epochs in mini-batches of 32, evaluating on the validation
# split after each epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_val_padded, y_val),
)

In [None]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels.
test_probabilities = model.predict(X_test_padded)
y_predictions = (test_probabilities > 0.5).astype('int32')
print("F1 Score for CNN 1D: ", f1_score(y_test, y_predictions))

# Loss and accuracy as computed by Keras on the test split.
test_loss, test_acc = model.evaluate(X_test_padded, y_test)
print("Test Loss: ", test_loss)
print("Test Accuracy: ", test_acc)
print(classification_report(y_test, y_predictions))

In [None]:
# Write the current model to disk as a single .keras file.
model.save("1dcnndisaster.keras")

In [None]:
import keras
# Load the saved model back from disk into a separate variable.
new_model = keras.models.load_model("1dcnndisaster.keras")

In [None]:
import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Push one sample tweet through the same pipeline as the training data:
# clean -> encode with the fitted tokenizer -> pad to maxlen.
preprocessed_text = preprocess_text("How much money would you ride out Hurricane Irma for? Umm no.")
sequence = tokenizer.texts_to_sequences([preprocessed_text])

# padding='post' must match how the training arrays were padded; the
# pad_sequences default ('pre') would put the zeros in front of the tokens,
# so the model would see the words at positions it was never trained on.
padded_sequence = pad_sequences(sequence, padding='post', maxlen=maxlen)

new_model = keras.models.load_model("1dcnndisaster.keras")
# Output is the sigmoid probability for the sample.
prediction = new_model.predict(padded_sequence)
print(prediction)

### 5.4 Recurrent Neural Networks

In [None]:
from tensorflow.keras.layers import SimpleRNN

In [None]:
# Simple RNN classifier: embedding -> SimpleRNN with 64 units returning only
# its final state -> two small dense layers -> single sigmoid output.
model = Sequential([
    Embedding(input_dim=vocabulary_size + 1, output_dim=120, input_length=maxlen),
    SimpleRNN(units=64, return_sequences=False),
    Dense(20, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam optimiser with binary cross-entropy loss (matches the single sigmoid
# output); accuracy is tracked as the reported metric.
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Train for 20 epochs in mini-batches of 32, evaluating on the validation
# split after each epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_val_padded, y_val),
)

In [None]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels.
test_probabilities = model.predict(X_test_padded)
y_predictions = (test_probabilities > 0.5).astype('int32')
print("F1 Score for RNN: ", f1_score(y_test, y_predictions))

# Loss and accuracy as computed by Keras on the test split.
test_loss, test_acc = model.evaluate(X_test_padded, y_test)
print("Test Loss: ", test_loss)
print("Test Accuracy: ", test_acc)
print(classification_report(y_test, y_predictions))

### 5.5 Long Short-Term Memory

In [None]:
from tensorflow.keras.layers import LSTM

In [None]:
# LSTM classifier: embedding -> LSTM with 64 units -> two small dense
# layers -> single sigmoid output.
model = Sequential([
    Embedding(input_dim=vocabulary_size + 1, output_dim=120, input_length=maxlen),
    LSTM(64),
    Dense(20, activation='relu'),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Adam optimiser with binary cross-entropy loss (matches the single sigmoid
# output); accuracy is tracked as the reported metric.
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Train for 20 epochs in mini-batches of 32, evaluating on the validation
# split after each epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_val_padded, y_val),
)

In [None]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels.
y_predictions = (model.predict(X_test_padded) > 0.5).astype('int32')
# The label names the model evaluated in this section (LSTM), so its score
# is not confused with the SimpleRNN result printed earlier.
print("F1 Score for LSTM: ", f1_score(y_test, y_predictions))

# Loss and accuracy as computed by Keras on the test split.
test_loss, test_acc = model.evaluate(X_test_padded, y_test)
print("Test Loss: ", test_loss)
print("Test Accuracy: ", test_acc)
print(classification_report(y_test, y_predictions))