# CS 441 Final Project - Disaster Tweets

## 1. Import

In [63]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
# Load the dataset
train_df = pd.read_csv("train.csv")

# Split the dataset (80% train, 20% validation)
train_data, val_data = train_test_split(train_df, test_size = 0.2, random_state = 441)

print("Training set:", train_data.shape)
print("Validation set:", val_data.shape)

print("Example:\n", train_df.iloc[32])

Training set: (6090, 5)
Validation set: (1523, 5)
Example:
 id                                                         49
keyword                                                ablaze
location                        Est. September 2012 - Bristol
text        We always try to bring the heavy. #metal #RT h...
target                                                      0
Name: 32, dtype: object


# 2. Preprocess

In [58]:
# Preprocess text, remove part of speech, stopwords, and http links
# https://stackoverflow.com/questions/17390326/getting-rid-of-stop-words-and-document-tokenization-using-nltk
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha() and token not in stop_words]
    return " ".join(tokens)

train_data["clean_text"] = train_data["text"].apply(preprocess_text)
val_data["clean_text"] = val_data["text"].apply(preprocess_text)

tfidf_vectorizer = TfidfVectorizer(max_features = 5000)

X_train = tfidf_vectorizer.fit_transform(train_data["clean_text"])
y_train = train_data["target"]

X_val = tfidf_vectorizer.transform(val_data["clean_text"])
y_val = val_data["target"]

print("Original:", train_data.iloc[1]["text"])
print("Processed:", train_data.iloc[1]["clean_text"])

Original: British diver Neil Anthony Fears found dead by the wreck of a steamship - Daily Mail http://t.co/QP3GVvfoFq
Processed: british diver neil anthony fear found dead wreck steamship daily mail http


In [62]:
print(X_train)

  (0, 3790)	0.4528132146058555
  (0, 4188)	0.6304602258185331
  (0, 1052)	0.6304602258185331
  (1, 2111)	0.0783485762315847
  (1, 2629)	0.3755054057380762
  (1, 1074)	0.3234361332079501
  (1, 4938)	0.2867904162218649
  (1, 1105)	0.25773202663579503
  (1, 1741)	0.2844201476156351
  (1, 1603)	0.2696431982036788
  (1, 170)	0.4083574590241537
  (1, 2921)	0.4083574590241537
  (1, 565)	0.3353473047758339
  (2, 2597)	0.49900367146439967
  (2, 2027)	0.6185146148096988
  (2, 3788)	0.6069884736400349
  (3, 384)	0.25601239218389893
  (3, 1831)	0.15656023521192458
  (3, 2929)	0.20923834054253226
  (3, 4919)	0.1840471674116045
  (3, 4858)	0.22375717070468867
  (3, 2227)	0.24435971857921673
  (3, 2456)	0.22853864159345988
  (3, 1696)	0.5467947578070591
  (3, 3248)	0.46811688928390544
  :	:
  (6086, 781)	0.7445823875382646
  (6086, 2102)	0.25512528788254585
  (6086, 1247)	0.2903709775038372
  (6086, 990)	0.24308554360957427
  (6086, 2873)	0.2886177297671586
  (6086, 2111)	0.07389615676752616
  (6087,