In [1]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re

In [2]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

## Preprocessing

In [3]:
# remove urls, html tags, punctuations, stopwords, emojis
my_stopwords = stopwords.words('english')

In [4]:
def data_preprocess(sentence):
    '''
    lowercase the sentence
    replace special cahracters with spacebar
    remove urls
    remove special characters
    '''
    
    sentence = sentence.lower()
    sentence = re.sub(r"[^a-zA-Z?.!,¿]+", " ",sentence)
    sentence = re.sub(r"http\S+", "",sentence)    
    
    
    html=re.compile(r'<.*?>') 
    sentence = html.sub(r'',sentence)
    
    characters = "/><|{}^@#!?+();$=&*[]-%.:_`''" 
    for c in characters:
        sentence = sentence.replace(c,'')
        
    sentence = [word.lower() for word in sentence.split() if word.lower() not in my_stopwords]
    cleaned_sentence = " ".join(sentence)
    
    ## This section is taken from another project to refer emojis on twitter tweets
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    #print(sentence)
    sentence = emoji_pattern.sub(r'', cleaned_sentence)
    return sentence

In [5]:
train_data['text'] = train_data['text'].apply(lambda x: data_preprocess(x))
test_data['text'] = test_data['text'].apply(lambda x:data_preprocess(x))

In [6]:
# Let's look at our cleaned sentence now
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,deeds reason earthquake may allah forgive us,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,residents asked shelter place notified officer...,1
3,6,,,", people receive wildfires evacuation orders c...",1
4,7,,,got sent photo ruby alaska smoke wildfires pou...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding bridge collapse nearb...,1
7609,10870,,,aria ahrary thetawniest control wild fires cal...,1
7610,10871,,,utc km volcano hawaii http tco zdtoyd ebj,1
7611,10872,,,police investigating e bike collided car littl...,1


In [7]:
test_data

Unnamed: 0,id,keyword,location,text
0,0,,,happened terrible car crash
1,2,,,"heard earthquake different cities, stay safe e..."
2,3,,,"forest fire spot pond, geese fleeing across st..."
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills china taiwan
...,...,...,...,...
3258,10861,,,earthquake safety los angeles safety fasteners...
3259,10865,,,storm ri worse last hurricane city amp others ...
3260,10868,,,green line derailment chicago http tco utbxlcbiuy
3261,10874,,,meg issues hazardous weather outlook hwo http ...


In [8]:
train_data.to_csv("data/preprocessed_train.csv")
test_data.to_csv("data/preprocessed_test.csv")