In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("disastertweets-train.csv")
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
df = df.drop(['keyword', 'location'], axis=1)

In [5]:
df

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,Police investigating after an e-bike collided ...,1


In [30]:
df.isna().sum()

id        0
text      0
target    0
dtype: int64

removing the following:
- Numbers (might change but numbers could indicate anything)- all letters should be made lowercase - punctuation
- Special characters
- URLs and @s (mentions)

TOKENIZE then:

- stopwords
- short words (2 letters and below) in case not covered by stop words
- lemmatization to normalize text

In [58]:
from nltk.corpus import stopwords
from string import punctuation
import re
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hhalo\AppData\Roaming\nltk_data...


True

punctuation, numbers and lowercase

In [34]:
def clean_tweet(tweet):
    cleaned_tweet = ''.join([char.lower() for char in tweet if char.isalpha() or char.isspace()])
    return cleaned_tweet

should remove @s and URLs

In [45]:
def remove_mention_url(tweet):
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)

    tweet = re.sub(r'@\w+', '', tweet)
    
    return tweet

In [43]:
(df['text'][2])

"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"

In [54]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    cleaned_tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    return cleaned_tokens

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

In [63]:
tweet = (df['text'][3])
tweet = clean_tweet(tweet)
tweet = remove_mention_url(tweet)
tweet = word_tokenize(tweet)
tweet = remove_stopwords(tweet)
tweet = lemmatize(tweet)
tweet

['people', 'receive', 'wildfire', 'evacuation', 'order', 'california']

In [66]:
def process_tweet(tweet):
    tweet = clean_tweet(tweet)
    tweet = remove_mention_url(tweet)
    tweet = re.sub(r'(.)\1+', r'\1\1', tweet) # also remove repeated char from elongated words
    tweet = word_tokenize(tweet)
    tweet = remove_stopwords(tweet)
    tweet = lemmatize(tweet)
    return tweet

In [67]:
df['processed'] = df['text'].apply(process_tweet)
df

Unnamed: 0,id,text,target,processed
0,1,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquake, may, allah, forgive]"
1,4,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, ronge, sask, canada]"
2,5,All residents asked to 'shelter in place' are ...,1,"[resident, asked, shelter, place, notified, of..."
3,6,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfire, evacuation, order,..."
4,7,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."
...,...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1,"[two, giant, crane, holding, bridge, collapse,..."
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1,"[ariaahrary, thetawniest, control, wild, fire,..."
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[utckm, volcano, hawaii]"
7611,10872,Police investigating after an e-bike collided ...,1,"[police, investigating, ebike, collided, car, ..."


word embeddings

In [70]:
import numpy as np

# Assuming you have already loaded GloVe embeddings into this dictionary
glove_file = 'glove.6B.100d.txt'
embeddings_index = {}

with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector


In [78]:
embeddings_index['cream']

array([-0.75739  ,  0.16928  , -0.78391  , -0.10915  ,  0.0082292,
        0.72339  ,  1.4583   , -0.06999  ,  0.01214  , -0.10243  ,
       -0.47295  , -0.37129  , -0.17431  ,  0.92555  ,  0.55883  ,
        0.26866  ,  0.5315   , -0.8269   ,  0.070044 , -0.16351  ,
       -0.41403  ,  0.83683  , -0.3771   , -0.31498  , -0.14327  ,
        1.3757   ,  0.25528  , -0.83953  , -0.45377  , -0.68199  ,
        0.72951  ,  0.67171  , -0.29706  , -0.76983  , -0.16529  ,
        0.65401  ,  0.39921  ,  0.46134  ,  0.12598  , -1.4694   ,
        0.94455  , -1.7318   , -0.48173  , -1.0355   ,  0.13409  ,
        0.43274  , -0.20636  ,  0.0086919,  0.62417  , -0.94417  ,
       -0.24822  , -0.32836  , -0.17971  ,  1.2036   , -0.88055  ,
       -1.0946   , -0.48351  ,  0.73395  ,  0.58274  ,  0.37251  ,
        0.60411  ,  0.45338  ,  0.038755 , -0.16675  ,  0.2082   ,
       -0.53577  ,  0.64532  , -0.19959  , -0.061568 , -0.87592  ,
       -0.23337  , -0.03427  , -0.017372 ,  0.62228  ,  0.6371

In [81]:
def apply_embeddings(tokenized_tweet):
    tweet_vector = [embeddings_index.get(token, np.zeros(100)) for token in tokenized_tweet]
    return tweet_vector

In [80]:
df['processed'][2]

['resident',
 'asked',
 'shelter',
 'place',
 'notified',
 'officer',
 'evacuation',
 'shelter',
 'place',
 'order',
 'expected']

In [82]:
apply_embeddings(df['processed'][2])

[array([ 0.29695  ,  0.099894 ,  0.47566  , -0.37281  ,  0.26364  ,
         0.37181  ,  0.16255  ,  0.58769  ,  0.15589  ,  0.24411  ,
        -0.89198  , -0.45334  ,  0.78667  , -0.43635  ,  0.25158  ,
        -0.68829  ,  0.35426  , -0.23437  , -0.57745  ,  0.66571  ,
        -0.31958  ,  0.55651  , -0.01792  , -0.12152  , -0.067306 ,
        -0.3646   ,  0.64946  , -1.4211   , -0.008817 ,  0.4048   ,
        -0.46976  , -0.12827  ,  0.42479  ,  0.81011  , -0.0073016,
        -0.42876  , -0.23824  ,  0.35281  ,  1.438    , -0.55432  ,
        -0.47091  ,  0.2951   ,  0.43151  , -0.14291  ,  1.1853   ,
        -0.43876  , -0.37454  , -0.075578 , -0.090728 ,  0.30303  ,
        -0.40244  , -1.0643   ,  0.33005  , -0.076731 ,  0.052418 ,
        -1.1262   , -0.66846  , -0.4765   ,  0.81525  ,  0.25164  ,
         0.28933  ,  0.0718   ,  0.033929 , -0.20505  ,  0.5149   ,
        -0.22925  ,  0.54417  ,  0.25427  ,  0.17549  ,  0.6556   ,
        -0.63258  , -0.05621  , -0.12629  , -0.1

nice nut bad

In [83]:
df['embedded'] = df['processed'].apply(apply_embeddings)
df

Unnamed: 0,id,text,target,processed,embedded
0,1,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquake, may, allah, forgive]","[[0.48937, -0.44453, -0.36711, -0.18025, -0.67..."
1,4,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, ronge, sask, canada]","[[-0.51682, 0.49154, 0.66964, 0.40753, 0.03444..."
2,5,All residents asked to 'shelter in place' are ...,1,"[resident, asked, shelter, place, notified, of...","[[0.29695, 0.099894, 0.47566, -0.37281, 0.2636..."
3,6,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfire, evacuation, order,...","[[0.29019, 0.80497, 0.31187, -0.32706, -0.4723..."
4,7,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi...","[[0.54635, 0.19018, 0.51298, -0.76729, -0.2382..."
...,...,...,...,...,...
7608,10869,Two giant cranes holding a bridge collapse int...,1,"[two, giant, crane, holding, bridge, collapse,...","[[-0.20154, 0.32739, 0.0004758, -0.22452, 0.44..."
7609,10870,@aria_ahrary @TheTawniest The out of control w...,1,"[ariaahrary, thetawniest, control, wild, fire,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7610,10871,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,"[utckm, volcano, hawaii]","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7611,10872,Police investigating after an e-bike collided ...,1,"[police, investigating, ebike, collided, car, ...","[[0.1049, -0.86604, -0.18197, -0.72754, -0.390..."


In [85]:
pip install torch torchvision


Collecting torch
  Using cached torch-2.4.0-cp38-cp38-win_amd64.whl.metadata (27 kB)
Collecting torchvision
  Using cached torchvision-0.19.0-1-cp38-cp38-win_amd64.whl.metadata (6.1 kB)
Collecting sympy (from torch)
  Using cached sympy-1.13.2-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.4.0-cp38-cp38-win_amd64.whl (198.1 MB)
   ---------------------------------------- 0.0/198.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/198.1 MB ? eta -:--:--
   ---------------------------------------- 0.1/198.1 MB 812.7 kB/s eta 0:04:04
   ---------------------------------------- 0.1/198.1 MB 1.3 MB/s eta 0:02:31
   ---------------------------------------- 0.2/198.1 MB 1.5 MB/s eta 0:02:10
   ---------------------------------------- 0.3/198.1 MB 1.4 MB/s eta 0:02:20
   ---------------------------------------- 0.3/198.1 MB 1.3 MB/s eta 0:02:32
   -------------