In [1]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from spacy.tokenizer import Tokenizer
import torch
import torch.nn as nn

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
smaple = pd.read_csv('sample_submission.csv')
print(train.shape)
print(test.shape)

(7613, 5)
(3263, 4)


# Exploration

In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Data Cleaning

In [5]:
print(train.info())
print("=================")
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
None


In [6]:
train.drop(['location', 'keyword'], axis=1, inplace=True)
test.drop(['location', 'keyword'], axis=1, inplace=True)

In [7]:
train.shape, test.shape

((7613, 3), (3263, 2))

In [8]:
for i in range(50):
    print(train['text'][i])
    print("------------")

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
------------
Forest fire near La Ronge Sask. Canada
------------
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
------------
13,000 people receive #wildfires evacuation orders in California 
------------
Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
------------
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
------------
#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas
------------
I'm on top of the hill and I can see a fire in the woods...
------------
There's an emergency evacuation happening now in the building across the street
------------
I'm afraid that the tornado is coming to our area...
------------
Three people died from the heat wave so far
------------
Haha South Tamp

In [9]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', 
          '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_', 
          '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×',
          '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', 
          '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', 
          '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 
          'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', 
          '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 
          'Ø', '¹', '≤', '‡', '√', ]


def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x


def clean_numbers(x):
    x = re.sub('[0-9]{5, }', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

In [10]:
mispell_dict = {"aren't": "are not", "can't": "cannot", 
                "couldn't": "could not", "didn't": "did not",
                "doesn't": "does not", "don't": "do not", 
                "hadn't": "had not", "hasn't": "has not",
                "haven't": "have not", "he'd": "he would", 
                "he'll": "he will", "he's": "he is",
                "i'd": "I would", "i'd": "I had", "i'll": 
                "I will", "i'm" : "I am", "isn't": "is not",
                "it's": "it is", "it'll": "it will", 
                "i've" : "I have", "let's": "let us", 
                "mightn't": "might not", "mustn't": "must not", 
                "shan't" : "shall not", "she'd": "she would",
                "she'll": "she will", "she's": "she is", 
                "shouldn't": "should not", "that's": "that is", 
                "there's": "there is","they'd": "they would", 
                "they'll": "they will", "they're": "they are",
                "they've": "they have", "we'd": "we would", 
                "we're": "we are", "weren't": "were not",
                "we've": "we have", "what'll": "what will", 
                "what're": "what are", "what's": "what is", 
                "what've": "what have", "where's": "where is", 
                "who'd": "who would", "who'll": "who will",
                "who're": "who are", "who's": "who is", 
                "who've": "who have", "won't": "will not",
                "wouldn't" : "would not", "you'd": "you would", 
                "you'll": "you will", "you're": "you are",
                "you've": "you have", "'re": " are", 
                "wasn't": "was not", "we'll": " will", 
                "didn't": "did not", "tryin'": "trying", 
                "colour": "color", "centre": "center",
                "didnt": "did not", "doesnt": "does not",
                "isnt": "is not", "shouldnt": "should not",
                "favourite": "favorite", "travelling": "traveling",
                "counselling": "counseling", "theatre": "theater",
                "cancelled": "canceled", "labour": "labor",
                "organisation": "organization", "wwii": "world war 2",
                "citicise": "criticize", "instagram": "social medium",
                "whatsapp": "social medium", "snapchat": "social medium"}


def get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

In [11]:
mispellings, mispellings_re = get_mispell(mispell_dict)
mispellings, mispellings_re

({"aren't": 'are not',
  "can't": 'cannot',
  "couldn't": 'could not',
  "didn't": 'did not',
  "doesn't": 'does not',
  "don't": 'do not',
  "hadn't": 'had not',
  "hasn't": 'has not',
  "haven't": 'have not',
  "he'd": 'he would',
  "he'll": 'he will',
  "he's": 'he is',
  "i'd": 'I had',
  "i'll": 'I will',
  "i'm": 'I am',
  "isn't": 'is not',
  "it's": 'it is',
  "it'll": 'it will',
  "i've": 'I have',
  "let's": 'let us',
  "mightn't": 'might not',
  "mustn't": 'must not',
  "shan't": 'shall not',
  "she'd": 'she would',
  "she'll": 'she will',
  "she's": 'she is',
  "shouldn't": 'should not',
  "that's": 'that is',
  "there's": 'there is',
  "they'd": 'they would',
  "they'll": 'they will',
  "they're": 'they are',
  "they've": 'they have',
  "we'd": 'we would',
  "we're": 'we are',
  "weren't": 'were not',
  "we've": 'we have',
  "what'll": 'what will',
  "what're": 'what are',
  "what's": 'what is',
  "what've": 'what have',
  "where's": 'where is',
  "who'd": 'who would',
  "

In [12]:
def remove_emoji(sentence):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', sentence)

def remove_stopwords(sentence):
    words = sentence.split()
    words = [word for word in words if word not in stopwords.words('english')]
    
    return ' '.join(words)

In [13]:
stemmer = SnowballStemmer('english')

def stem_words(sentence):
    words = sentence.split()
    words = [stemmer.stem(word) for word in words ]
    
    return ' '.join(words)

In [14]:
# Clean the text
train["text"] = train["text"].apply(lambda x: clean_text(x.lower()))
test["text"] = test["text"].apply(lambda x: clean_text(x.lower()))

# Clean numbers
train["text"] = train["text"].apply(lambda x: clean_numbers(x))
test["text"] = test["text"].apply(lambda x: clean_numbers(x))

# Clean spellings
train["text"] = train["text"].apply(lambda x: replace_typical_misspell(x))
test["text"] = test["text"].apply(lambda x: replace_typical_misspell(x))

# Clear emojis
train["text"] = train["text"].apply(lambda x: remove_emoji(x))
test["text"] = test["text"].apply(lambda x: remove_emoji(x))

# Stopwords
train["text"] = train["text"].apply(lambda x: remove_stopwords(x))
test["text"] = test["text"].apply(lambda x: remove_stopwords(x))

# Stemming
train["text"] = train["text"].apply(lambda x: stem_words(x))
test["text"] = test["text"].apply(lambda x: stem_words(x))

In [15]:
for i in range(50):
    print(train['text'][i])
    print("------------")

deed reason # earthquak may allah forgiv us
------------
forest fire near la rong sask . canada
------------
resid ask ' shelter place ' notifi offic . evacu shelter place order expect
------------
## , ### peopl receiv # wildfir evacu order california
------------
got sent photo rubi # alaska smoke # wildfir pour school
------------
# rockyfir updat = > california hwi . ## close direct due lake counti fire - # cafir # wildfir
------------
# flood # disast heavi rain caus flash flood street manitou , colorado spring area
------------
' top hill see fire wood . . .
------------
' emerg evacu happen build across street
------------
' afraid tornado come area . . .
------------
three peopl die heat wave far
------------
haha south tampa get flood hah - wait second live south tampa gonna gonna fvck # flood
------------
# rain # flood # florida # tampabay # tampa ## ## day . ' lost count
------------
# flood bago myanmar # arriv bago
------------
damag school bus ## multi car crash # break


In [16]:
val_data = train.tail(1500)
train_data = train.head(6113)