<a href="https://colab.research.google.com/github/JericCantos/DisasterTweetPredictions/blob/main/notebooks/DisasterTweetPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Load Libraries

In [173]:
!pip install ftfy



In [174]:
import requests
import io
from ftfy import fix_text

import numpy as np
import pandas as pd
import random
import nltk
import re

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer # more appropriate for tweets than word_tokenizer
from nltk.corpus import wordnet

# Fetch every NLTK resource this notebook relies on, in one pass
for nltk_resource in (
    'punkt_tab',
    'stopwords',                       # List of common stop words in English
    'punkt',                           # Pre-trained tokenizer models
    'wordnet',                         # WordNet lemmatizer dataset
    'averaged_perceptron_tagger',      # Needed for POS tagging
    'averaged_perceptron_tagger_eng',  # Needed for POS tagging
):
    nltk.download(nltk_resource)

# Libraries for text feature extraction and model training
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Libraries for model evaluation
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


## Load Dataset

In [175]:
# Build the direct download URL from a file ID
def make_drive_url(file_id):
    """Return the Google Drive ``uc`` URL for the file with the given ID."""
    return "https://drive.google.com/uc?id={}".format(file_id)

# Helper function to load a CSV from a direct URL
def load_csv_from_url(url, timeout=30):
    """Download a CSV file over HTTP and parse it into a DataFrame.

    Args:
        url: Direct download URL of the CSV file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        pandas.DataFrame: The parsed CSV content.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raises an error if the request fails
    # NOTE(review): response.text uses the encoding requests determines for the
    # response. If mojibake persists, consider decoding response.content as
    # UTF-8 explicitly instead.
    return pd.read_csv(io.StringIO(response.text))

In [176]:
# Google Drive file IDs of the dataset files, keyed by split name.
# train: https://drive.google.com/file/d/1a8kB3M_-ZTQFJemnNY_g9M3d6yIoJERL/view?usp=drive_link
file_ids = dict(train="1a8kB3M_-ZTQFJemnNY_g9M3d6yIoJERL")

In [177]:
# Build the download URL for the training split, then fetch and parse it
train_csv_url = make_drive_url(file_ids["train"])
df_train = load_csv_from_url(train_csv_url)

### Fix Double Encoding Errors

In [178]:
# Let ftfy repair mis-encoded characters in every tweet
df_train['text'] = df_train['text'].map(fix_text)

In [179]:
def fix_double_encoding(text):
    """Undo mojibake caused by UTF-8 bytes being decoded as Latin-1.

    The string is re-encoded as Latin-1 to recover the raw bytes and those
    bytes are decoded as UTF-8. If either step fails, the text was not
    double-encoded in this way and is returned unchanged.

    Args:
        text: The string to repair. Non-string values (e.g. None or NaN for
            missing data) are returned untouched.

    Returns:
        The repaired string, or the original value when no repair applies.
    """
    if not isinstance(text, str):
        # Missing values have no .encode(); pass them through instead of raising
        return text
    try:
        return text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text


In [180]:
# Run ftfy first, then the Latin-1 -> UTF-8 round trip, on every tweet
df_train['text'] = (
    df_train['text']
    .apply(fix_text)
    .apply(fix_double_encoding)
)

In [181]:
def repair_text(s):
    """Undo mojibake caused by UTF-8 bytes being decoded as Windows-1252.

    The visible characters are re-encoded as cp1252 to recover the raw bytes
    and those bytes are decoded as UTF-8. Both steps are strict: if the string
    is not a clean cp1252 rendering of valid UTF-8, it is returned unchanged.
    (Using ``errors='ignore'`` here would silently delete legitimate
    non-ASCII characters such as accented letters, curly quotes and emoji.)

    Args:
        s: The value to repair. Non-string values are returned untouched.

    Returns:
        The repaired string, or the original value when no repair applies.
    """
    if not isinstance(s, str):
        return s
    try:
        # step 1: encode the visible garbage characters as if they were cp1252 bytes
        # step 2: decode them back as UTF-8
        return s.encode('cp1252').decode('utf-8')
    except UnicodeError:
        # Not double-encoded text; keep the original instead of dropping characters
        return s


In [182]:
# Final pass: reverse any remaining cp1252/UTF-8 mix-ups
df_train['text'] = df_train['text'].map(repair_text)

In [183]:
# Preview the first 20 rows after the encoding repairs
df_train.head(20)

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
5,#RockyFire Update => California Hwy. 20 closed...,1
6,#flood #disaster Heavy rain causes flash flood...,1
7,I'm on top of the hill and I can see a fire in...,1
8,There's an emergency evacuation happening now ...,1
9,I'm afraid that the tornado is coming to our a...,1


# Data Preprocessing

In [184]:
# Function to map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    return wordnet.NOUN  # Default to noun

In [185]:
def fix_encoding(text):
    """Reverse mojibake produced by decoding UTF-8 bytes as Latin-1.

    Args:
        text: The string to repair. Non-string values (e.g. None or NaN for
            missing data) are returned untouched.

    Returns:
        The repaired string, or the original value when the Latin-1 -> UTF-8
        round trip is not possible.

    NOTE(review): this has the same logic as ``fix_double_encoding`` defined
    earlier in this notebook; consider keeping only one of them.
    """
    if not isinstance(text, str):
        # Missing values have no .encode(); pass them through instead of raising
        return text
    try:
        # Encode as latin1 and decode as utf-8 to reverse mojibake
        return text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # If it fails, return original
        return text

In [186]:
# Define the preprocessing function
def preprocess_text(sentences):
    """Clean, tokenize and lemmatize a collection of tweets.

    Every tweet goes through the same steps, in this order: encoding repair,
    lowercasing, full-width "＠" normalisation, punctuation stripping (keeping
    apostrophes, hashtags and mentions), URL and digit removal, whitespace
    collapsing, tweet tokenization, stop-word / single-character filtering and
    POS-aware WordNet lemmatization.

    Args:
        sentences: Iterable of raw tweet strings.

    Returns:
        list[str]: One space-joined string of lemmas per input tweet, in the
        same order as the input.
    """
    tokenizer = TweetTokenizer(preserve_case=False)
    stop_words = set(stopwords.words('english'))  # Load English stop words
    lemmatizer = WordNetLemmatizer()

    processed = []
    for raw_tweet in sentences:
        # Fix encoding to avoid words like "America\x89Ûªs", then lowercase
        text = fix_encoding(raw_tweet).lower()

        # Replace Unicode full-width @ with normal @
        text = re.sub(r"＠", "@", text)

        # Strip punctuation but keep apostrophes, hashtags and mentions.
        # This runs before URL removal, so "http://t.co/x" has already become
        # "httptcox" by then; the URL pattern still catches it via "http".
        text = re.sub(r"[^\w\s'#@]", "", text)

        # Remove numbers and URLs starting with http or www
        text = re.sub(r"http\S+|www\S+|\d+", "", text)

        # Collapse runs of whitespace into single spaces
        text = " ".join(text.split())

        # Tokenize, then drop stop words and single-letter words
        tokens = [
            token
            for token in tokenizer.tokenize(text)
            if token not in stop_words and len(token) > 1
        ]

        # Lemmatize each remaining token using its POS tag
        lemmas = [
            lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
            for token, tag in nltk.pos_tag(tokens)
        ]
        processed.append(' '.join(lemmas))

    return processed

In [187]:
# Shuffle the training data with a fixed seed, then renumber the rows from 0
df_shuffled = (
    df_train
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df_shuffled.head(20)

Unnamed: 0,text,target
0,So you have a new weapon that can cause un-ima...,1
1,The f$&@ing things I do for #GISHWHES Just got...,0
2,DT @georgegalloway: RT @Galloway4Mayor: The Co...,1
3,Aftershock back to school kick off was great. ...,0
4,in response to trauma Children of Addicts deve...,0
5,@Calum5SOS you look like you got caught in a r...,0
6,my favorite lady came to our volunteer meeting...,1
7,@brianroemmele UX fail of EMV - people want to...,1
8,Can't find my ariana grande shirt this is a f...,0
9,The Murderous Story Of America۪s First Hijacki...,1


In [188]:
# Separate the raw tweets from their labels
tweets, targets = df_shuffled['text'], df_shuffled['target']

# Run the full cleaning pipeline and keep the result as a Series
pre_processed_tweets = pd.Series(preprocess_text(tweets))

In [189]:
# Preview the first 20 cleaned tweets
pre_processed_tweets.head(20)

Unnamed: 0,0
0,new weapon cause unimaginable destruction
1,@ing thing #gishwhes get soaked deluge go pad ...
2,dt @georgegalloway rt @gallowaymayor col polic...
3,aftershock back school kick great want thank e...
4,response trauma child addict develop defensive...
5,@calumsos look like get catch rainstorm amaze ...
6,favorite lady come volunteer meet hopefully jo...
7,@brianroemmele ux fail emv people want insert ...
8,can't find ariana grande shirt fuck tragedy
9,murderous story america first hijack


In [190]:
# Spot check: cleaned version of the tweet at position 9 (compare with the raw text below)
pre_processed_tweets.iloc[9]

'murderous story america first hijack'

In [191]:
# Spot check: raw text of the same tweet (position 9) before preprocessing
tweets.iloc[9]

'The Murderous Story Of America۪s First Hijacking http://t.co/EYUGk6byxr'

# Train-Test Split

In [192]:
# Pair every cleaned tweet with its label in a single frame
df = pd.DataFrame(dict(text=pre_processed_tweets, target=targets))
df.head()

Unnamed: 0,text,target
0,new weapon cause unimaginable destruction,1
1,@ing thing #gishwhes get soaked deluge go pad ...,0
2,dt @georgegalloway rt @gallowaymayor col polic...,1
3,aftershock back school kick great want thank e...,0
4,response trauma child addict develop defensive...,0


In [193]:
# Hold out 20% of the tweets for testing; stratify on the label so both
# splits keep the same class balance, and fix the seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    stratify=df['target'],
    random_state=42,
)

# Report how many tweets ended up in each split
print("Size of training set:", len(X_train))
print("Size of test set:", len(X_test))

Size of training set: 6090
Size of test set: 1523


# Feature Extraction

In [197]:
# Tokenize with NLTK's TweetTokenizer rather than the vectorizer's own token pattern
tweet_tokenizer = TweetTokenizer(preserve_case=False)
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tweet_tokenizer.tokenize,
    preprocessor=None,
    token_pattern=None,
)

# Learn the vocabulary and IDF weights from the training tweets only,
# and turn them into a TF-IDF matrix
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Rows are tweets, columns are vocabulary terms
num_samples, num_features = X_train_tfidf.shape
print("#Samples: {}, #Features: {}".format(num_samples, num_features))

#Samples: 6090, #Features: 13656


In [198]:
# Inspect the vocabulary terms the fitted vectorizer uses as feature names
tfidf_vectorizer.get_feature_names_out()

array(['##book', '##fukushima', '##youtube', ..., 'zss', 'zumiez', 'zzzz'],
      dtype=object)

In [196]:
# Sanity check: the sample contains both an ASCII "@" mention and a full-width "＠"
# mention, to show how preprocess_text handles each of them
sample = "RT @__ohhmyjoshh and also ＠__fakeunicode"
print("Before preprocessing:", sample)
print("After preprocessing:", preprocess_text([sample])[0])


Before preprocessing: RT @__ohhmyjoshh and also ＠__fakeunicode
After preprocessing: rt @__ohhmyjoshh also @__fakeunicode
