In [1]:
# Binary Classification on Text Data

In [26]:
# Imports
import pandas as pd

train_presplit = pd.read_csv("./data-from-NLP-disaster-tweets/train.csv")
test = pd.read_csv("./data-from-NLP-disaster-tweets/test.csv")
print(train_presplit.shape)
test[10:20]

(7613, 5)


Unnamed: 0,id,keyword,location,text
10,30,,,No I don't like cold!
11,35,,,NOOOOOOOOO! Don't do that!
12,42,,,No don't tell me that!
13,43,,,What if?!
14,45,,,Awesome!
15,46,ablaze,London,Birmingham Wholesale Market is ablaze BBC News...
16,47,ablaze,Niall's place | SAF 12 SQUAD |,@sunkxssedharry will you wear shorts for race ...
17,51,ablaze,NIGERIA,#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriag...
18,58,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...
19,60,ablaze,"Los Angeles, Califnordia",PSA: IÛªm splitting my personalities.\n\n?? t...


In [9]:
# Data exploration
print(train_presplit.shape)
print(test.shape)

(7613, 5)
(3263, 4)


In [12]:
# Percent of training tweets are disasters
count_disasters = (train_presplit["target"] == 1).sum()
print(count_disasters)

3271


In [8]:
# Split training data
from sklearn.model_selection import train_test_split

train, validate = train_test_split(
    train_presplit, test_size=0.3, random_state=42, shuffle=True
)
print(train.shape)
train.head()

(5329, 5)


Unnamed: 0,id,keyword,location,text,target
1186,1707,bridge%20collapse,,Ashes 2015: AustraliaÛªs collapse at Trent Br...,0
4071,5789,hail,"Carol Stream, Illinois",GREAT MICHIGAN TECHNIQUE CAMP\nB1G THANKS TO @...,1
5461,7789,police,Houston,CNN: Tennessee movie theater shooting suspect ...,1
5787,8257,rioting,,Still rioting in a couple of hours left until ...,1
7445,10656,wounds,Lake Highlands,Crack in the path where I wiped out this morni...,0


In [22]:
# Preprocess decision-making
# YES: Convert all words to lowercase
# YES: Lemmatize all words
# YES: Strip punctuation
# YES: Strip stop words

# Maybe: Strip @s
count_ats = train_presplit[train_presplit["text"].str.contains("@")]
count_ats_disaster = (count_ats["target"] == 1).sum() / (train_presplit["target"] == 1).sum()
count_ats_nondisaster = (count_ats["target"] == 0).sum() / (train_presplit["target"] == 0).sum()
print(f"% Disasters with @: {count_ats_disaster}, % Nondisasters with @: {count_ats_nondisaster}")

% Disasters with @: 0.20666462855395903, % Nondisasters with @: 0.31391064025794563


In [21]:
# Maybe: Strip #s
count_ats = train_presplit[train_presplit["text"].str.contains("#")]
count_ats_disaster = (count_ats["target"] == 1).sum() / (train_presplit["target"] == 1).sum()
count_ats_nondisaster = (count_ats["target"] == 0).sum() / (train_presplit["target"] == 0).sum()
print(f"% Disasters with #: {count_ats_disaster}, % Nondisasters with #: {count_ats_nondisaster}")

% Disasters with #: 0.2675022928767961, % Nondisasters with #: 0.20405343159834177


In [27]:
# Maybe: Strip Û symbols that did not get recorded properly
count_ats = train_presplit[train_presplit["text"].str.contains("Û")]
count_ats_disaster = (count_ats["target"] == 1).sum() / (train_presplit["target"] == 1).sum()
count_ats_nondisaster = (count_ats["target"] == 0).sum() / (train_presplit["target"] == 0).sum()
print(f"% Disasters with Û: {count_ats_disaster}, % Nondisasters with Û: {count_ats_nondisaster}")

% Disasters with Û: 0.09110363803118313, % Nondisasters with Û: 0.07231690465223399


In [34]:
# Maybe: Strip urls
count_ats = train_presplit[train_presplit["text"].str.contains("http:")]
count_ats_disaster = (count_ats["target"] == 1).sum() / (train_presplit["target"] == 1).sum()
count_ats_nondisaster = (count_ats["target"] == 0).sum() / (train_presplit["target"] == 0).sum()
print(f"% Disasters with http: {count_ats_disaster}, % Nondisasters with http: {count_ats_nondisaster}")

% Disasters with http: 0.6285539590339346, % Nondisasters with http: 0.35651773376324275


In [32]:
# Maybe: Strip urls HTTPS
count_ats = train_presplit[train_presplit["text"].str.contains("https")]
count_ats_disaster = (count_ats["target"] == 1).sum() / (train_presplit["target"] == 1).sum()
count_ats_nondisaster = (count_ats["target"] == 0).sum() / (train_presplit["target"] == 0).sum()
print(f"% Disasters with https: {count_ats_disaster}, % Nondisasters with https: {count_ats_nondisaster}")

% Disasters with https: 0.040966065423417915, % Nondisasters with https: 0.06287425149700598


In [None]:
# Final preprocess list
# Convert all words to lowercase
# Lemmatize all words
# Strip punctuation (but keep @s, #s)
# Strip stop words
# Strip Û symbols and other non-standard characters
# Strip link specifics BUT keep the fact that a link was included (And also distinguishing between http: and https:)

In [94]:
# Preprocessing function definitions
from nltk.stem import *
import nltk
from nltk.tokenize import word_tokenize
import re

# Adapted from code generated by ChatGPT (OpenAI, 2025)
# Prompt: "how to use nltk stem to take a string with multiple words and convert them to the stems"
# Prompt: "how to only keep abc characters in a string"
# Prompt: "create a set of common stop words like the, and, or"

nltk.download("punkt")
nltk.download('punkt_tab')
ps = PorterStemmer()

common_stops = {
    # Standard stop words
    "a", "an", "the", "and", "or", "but", "if", "while", "with",
    "of", "at", "by", "for", "to", "in", "on", "off", "out", "up",
    "down", "over", "under", "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "can", "will", "just", "don", "should", "now",

    # Pronouns
    "i", "me", "my", "mine", "you", "your", "yours",
    "he", "him", "his", "she", "her", "they", "them", "their",
    "we", "us", "our", "its", "it",

    # Auxiliary verbs
    "is", "am", "are", "was", "were", "be", "being", "been",
    "do", "does", "did", "have", "has", "had",
    "will", "would", "shall", "should", "can", "could", "may", "might", "must",

    # Informal/slang tokens from dataset
    "im", "idk", "u", "wa", "gon", "na", "tho", "thats", "ur",

    # Single letters/numbers that often aren't meaningful
    "a", "i", "u", "2", "3", "4", "5", "6", "7", "8", "9", "0",

    # extra added to deal with links or that I needed to add manually
    ":", "as", "into", "until", "among", "like", "dont", "from", "doesnt", "that", "be", "ha", "thi",
}

def word_based_processing(text):
    # stems text and removes common stop words
    # and removes details of https and http links (but keeps the http or https)
    # and removes username tags after the @ (but keeps the @)
    temp_words = []
    for word in text.split(" "):
        if word.startswith("https:"):
            temp_words.append(word[:5])
        elif word.startswith("http:"):
            temp_words.append(word[:4])
        elif word.startswith("@"):
            temp_words.append(word[:1])
        else:
            temp_words.append(word)
    text_links_handled = " ".join(temp_words)
    words = word_tokenize(text_links_handled) 
    words_processed = []
    for word in words:
        if isinstance(word, str):
            stem = ps.stem(word)
        # remove common stops and : that lingers
        if stem not in common_stops:
            words_processed.append(stem)
    return " ".join(words_processed)

def filter_for_alphanumeric_and_more(text):
    # filters to only keep a-z, 0-9, @, #, and spaces and :
    return re.sub(r'[^a-z0-9@# :]', '', text)

def preprocess(df):
    new_df = df.copy()
    print("Original:")
    print(new_df.head())
    new_df["text"] = new_df["text"].str.lower()
    print("Lowercase:")
    print(new_df.head())
    new_df["text"] = new_df["text"].apply(filter_for_alphanumeric_and_more)
    print("Filtered:")
    print(new_df.head())
    new_df["text"] = new_df["text"].apply(word_based_processing)
    print("Filtered wordbased:")
    print(new_df.head())
    return new_df

[nltk_data] Downloading package punkt to
[nltk_data]     /home/jiwonjjeong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/jiwonjjeong/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [95]:
# Preprocess test
data = {
    "Id": [1,2,3,4,5],
    "text": ["SOOOO PUMPED FOR ABLAZE ???? @southridgelife",
             "Check these out: http://t.co/rOI2NSmEJJ http://t.co/3Tj8ZjiN21 http://t.co/YDUiXEfIpE http://t.co/LxTjc87KLS #nsfw",
             "Rene Ablaze &amp; Jacinta - Secret 2k13 (Fallen Skies Edit) - Mar 30 2013  https://t.co/7MLMsUzV1Z",
             "@flowri were you marinading it or was it an accident?",
             "105	320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/yNXnvVKCDA | @djicemoon | #Dubstep #TrapMusic #DnB #EDM #Dance #IcesÛ_ http://t.co/weQPesENku"]
}

df = pd.DataFrame(data)
preprocess(df)

Original:
   Id                                               text
0   1       SOOOO PUMPED FOR ABLAZE ???? @southridgelife
1   2  Check these out: http://t.co/rOI2NSmEJJ http:/...
2   3  Rene Ablaze &amp; Jacinta - Secret 2k13 (Falle...
3   4  @flowri were you marinading it or was it an ac...
4   5  105\t320 [IR] ICEMOON [AFTERSHOCK] | http://t....
Lowercase:
   Id                                               text
0   1       soooo pumped for ablaze ???? @southridgelife
1   2  check these out: http://t.co/roi2nsmejj http:/...
2   3  rene ablaze &amp; jacinta - secret 2k13 (falle...
3   4  @flowri were you marinading it or was it an ac...
4   5  105\t320 [ir] icemoon [aftershock] | http://t....
Filtered:
   Id                                               text
0   1           soooo pumped for ablaze  @southridgelife
1   2  check these out: http:tcoroi2nsmejj http:tco3t...
2   3  rene ablaze amp jacinta  secret 2k13 fallen sk...
3   4  @flowri were you marinading it or was it an ac...


Unnamed: 0,Id,text
0,1,soooo pump ablaz @
1,2,check these http http http http # nsfw
2,3,rene ablaz amp jacinta secret 2k13 fallen sky ...
3,4,@ marinad accid
4,5,105320 ir icemoon aftershock http @ # dubstep ...


In [96]:
# Preprocess all and save
preprocess(train).to_csv("data-from-NLP-disaster-tweets/preprocessed/train.csv", index=False)
preprocess(validate).to_csv("data-from-NLP-disaster-tweets/preprocessed/validate.csv", index=False)
preprocess(test).to_csv("data-from-NLP-disaster-tweets/preprocessed/test.csv", index=False)

Original:
         id            keyword                location  \
1186   1707  bridge%20collapse                     NaN   
4071   5789               hail  Carol Stream, Illinois   
5461   7789             police                Houston    
5787   8257            rioting                     NaN   
7445  10656             wounds          Lake Highlands   

                                                   text  target  
1186  Ashes 2015: AustraliaÛªs collapse at Trent Br...       0  
4071  GREAT MICHIGAN TECHNIQUE CAMP\nB1G THANKS TO @...       1  
5461  CNN: Tennessee movie theater shooting suspect ...       1  
5787  Still rioting in a couple of hours left until ...       1  
7445  Crack in the path where I wiped out this morni...       0  
Lowercase:
         id            keyword                location  \
1186   1707  bridge%20collapse                     NaN   
4071   5789               hail  Carol Stream, Illinois   
5461   7789             police                Houston    
57

In [97]:
# SUMMARY OF PREPROCESSING
# Lowercase all letters
# Only keep alphanumerics, # and @
# Keep the http or https part of links, get rid of the rest of the link
# Keep the @ part of @usernames, get rid of the actual username
# Keep the # of #topics, keep both topic and # but separate them
# Lemmatize all words
# Remove common stop words (and, or, the ..)