In [1]:
import pandas as pd
import numpy as np
import re
import string
import spacy

In [None]:
!pip install spacy

In [2]:
# loading document/csv - make sure csv is in same folder as this .ipynb file

df = pd.read_csv('train.csv')

# Exploratory Code

In [3]:
# exploratory function

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df.shape

(7613, 5)

In [5]:
# check for missing values

df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
# check for class imbalance

df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [7]:
# example of a non disaster tweet

print(f'This is a non-disaster tweet: {df[df["target"] == 0]["text"].values[1]}')

# example of a disaster tweet

print(f'This is a disaster tweet: {df[df["target"] == 1]["text"].values[1]}')

This is a non-disaster tweet: I love fruits
This is a disaster tweet: Forest fire near La Ronge Sask. Canada


# Data cleaning and pre-processing

In [8]:
# drop the instances where the keywords are missing

df = df[df.keyword.notnull()]

In [9]:
# check if above function worked

df.isnull().sum()

id             0
keyword        0
location    2472
text           0
target         0
dtype: int64

In [10]:
# recheck the head

# TODO: restructure id for better dataframe manipulation

df.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [11]:
df.shape

(7552, 5)

## Delete links from text

Important to do link replacement first since text is(not) a duplicate based on the link, i.e. some text are duplicates only when the links are taken away (re-tweet)

In [12]:
# Before link replacement

print('text example with url(https):')
print(df.iloc[31]['text'], '\n')

print('text example with url(http):')
print(df.iloc[0]['text'],'\n')

print('text example starting/ending with link:')
print(df[df['id']==1153]['text'], '\n')

text example with url(https):
Rene Ablaze &amp; Jacinta - Secret 2k13 (Fallen Skies Edit) - Mar 30 2013  https://t.co/7MLMsUzV1Z 

text example with url(http):
@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C 

text example starting/ending with link:
795    http://t.co/ETkd58Un8n - Cleveland Heights Sha...
Name: text, dtype: object 



In [13]:
# Link deletion replace variant (doesn't delete full text (however 2x .replace))

df['text'] = df['text'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)

In [14]:
# After link replacement

print('text example with url(https):')
print(df.iloc[31]['text'], '\n')

print('text example with url(http):')
print(df.iloc[0]['text'],'\n')

print('text example starting/ending with link:')
print(df[df['id']==1153], '\n')

text example with url(https):
Rene Ablaze &amp; Jacinta - Secret 2k13 (Fallen Skies Edit) - Mar 30 2013   

text example with url(http):
@bbcmtd Wholesale Markets ablaze  

text example starting/ending with link:
       id keyword location                                               text  \
795  1153  blight      NaN   - Cleveland Heights Shaker Heights fight blig...   

     target  
795       0   



## Further data cleaning

In the following code blocks we do some priority data cleaning. This includes handling:
* Duplicates
* Lower casing
* Punctuation
* Remove words with numbers
* Remove additional punctuation and non-sensical text

In [15]:
# Delete duplicates
df = df.drop_duplicates(subset=['text'], keep='first')

# Lower case, and start/end white spaces
df['text'] = df['text'].apply(lambda x: x.strip(' ').lower())


In [16]:
# Delete punctuation (what to do with periods/full stops) V

# df_without_full_stop = df['text'].replace(r'[%s]' % re.escape(string.punctuation), '', regex=True) # this contains all punct removed

df['text'] = df['text'].replace(r'[%s]' % re.escape("!\"$%&'()*+,-/:;<=>?@[\]^_`{|}~"), '', regex=True)# this contains full stop


In [17]:
all_hashtags = []
for index, row in df.iterrows():
    list_of_hashtags = []
    for word in row['text'].split(' '):
        if '#' in word:
            list_of_hashtags.append(word)
    all_hashtags.append(list_of_hashtags)
df['hashtags'] = all_hashtags

In [23]:
# Remove numbers

# df['text'] = df['text'].replace(r'^[-+]?([1-9]\d*|0)$', '', regex=True)
df['text'] = df['text'].replace(r'\d+', '', regex=True)

In [24]:
# Remove additional puntctuation and non-sensical text

df['text'] = df['text'].replace(r'[‘’“”…]', '', regex=True)

df['text'] = df['text'].replace(r'\n', '', regex=True)

## NLP pre-processing

Here we include processing that is more common to Natural Language Processing. We handle the most conventional ones:
* Delete stopwords
* Tokenization
* Lemmatization
<!-- * Stemming -->

In [27]:
nlp = spacy.load("en_core_web_sm") # other languages: de, es, pt, fr, it, nl
def normalize(sentence):
    tokenized_text = []
    doc = nlp(sentence)
    for token in doc:
        if not(token.is_stop):
            tokenized_text.append(token.lemma_.strip())
    return " ".join(tokenized_text)

In [28]:
df['cleaned_text'] = df['text'].apply(normalize)

In [None]:
# TODO: Remove non unicode characters
# df['text'] = df['text'].replace(r'[%s]' % re.escape("!\"#$%&'()*+,-/:;<=>?@[\]^_`{|}~"), '', regex=True)# this contains full stop


## Sentiment analysis using VADER:

In [33]:
import nltk
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
# make sure to run the beneath lines if you haven't done that above
# import spacy
# nlp = spacy.load('en') # en_core_web_sm

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hasan\AppData\Roaming\nltk_data...


True

In [34]:
vader_model = SentimentIntensityAnalyzer()

In [38]:
def run_vader(textual_unit, 
              lemmatize=False, 
              parts_of_speech_to_consider=set(),
              verbose=0):
    """
    Run VADER on a sentence from spacy
    
    :param str textual unit: a textual unit, e.g., sentence, sentences (one string)
    (by looping over doc.sents)
    :param bool lemmatize: If True, provide lemmas to VADER instead of words
    :param set parts_of_speech_to_consider:
    -empty set -> all parts of speech are provided
    -non-empty set: only these parts of speech are considered
    :param int verbose: if set to 1, information is printed
    about input and output
    
    :rtype: dict
    :return: vader output dict
    """
    doc = nlp(textual_unit)
        
    input_to_vader = []

    for sent in doc.sents:
        for token in sent:

            to_add = token.text

            if lemmatize:
                to_add = token.lemma_

                if to_add == '-PRON-': 
                    to_add = token.text

            if parts_of_speech_to_consider:
                if token.pos_ in parts_of_speech_to_consider:
                    input_to_vader.append(to_add) 
            else:
                input_to_vader.append(to_add)
    print(input_to_vader)
    scores = vader_model.polarity_scores(' '.join(input_to_vader))
    
    if verbose >= 1:
        print()
        print('INPUT SENTENCE', sent)
        print('INPUT TO VADER', input_to_vader)
        print('VADER OUTPUT', scores)

    return scores

In [104]:
print(df.iloc[15]['cleaned_text'])
run_vader(df.iloc[15]['text'])#['compound']
# vader_model.polarity_scores(df.iloc[15]['text'])

KeyError: 'cleaned_text'

In [135]:
# print(df.iloc[10]['text'])
# df.head()
# run_vader("Crying out for more! Set me ablaze")#['compound']
vader_model.polarity_scores('afraid tornado come area ...')
# normalize("I'm afraid that the tornado is coming to our area...")

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [136]:
df['score'] = df['text'].apply(lambda x:vader_model.polarity_scores(x))
df['c_score'] = df['score'].apply(lambda score: 'pos' if score['compound']>=0 else 'neg')

df.head(10)

Unnamed: 0,id,keyword,location,text,target,score,comp_score,c_score
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"{'neg': 0.0, 'neu': 0.851, 'pos': 0.149, 'comp...",pos,pos
1,4,,,Forest fire near La Ronge Sask. Canada,1,"{'neg': 0.286, 'neu': 0.714, 'pos': 0.0, 'comp...",neg,neg
2,5,,,All residents asked to 'shelter in place' are ...,1,"{'neg': 0.095, 'neu': 0.905, 'pos': 0.0, 'comp...",neg,neg
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",pos,pos
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",pos,pos
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1,"{'neg': 0.13, 'neu': 0.87, 'pos': 0.0, 'compou...",neg,neg
6,10,,,#flood #disaster Heavy rain causes flash flood...,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",pos,pos
7,13,,,I'm on top of the hill and I can see a fire in...,1,"{'neg': 0.158, 'neu': 0.724, 'pos': 0.118, 'co...",neg,neg
8,14,,,There's an emergency evacuation happening now ...,1,"{'neg': 0.191, 'neu': 0.809, 'pos': 0.0, 'comp...",neg,neg
9,15,,,I'm afraid that the tornado is coming to our a...,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",pos,pos


In [52]:
df['compound_score'] = df['text'].apply(compound_score)

In [140]:
from textblob import TextBlob

In [206]:
sentiments = [TextBlob(t) for t in df['text']]
sentiments_scores = [t.sentiment for t in sentiments]
print(sentiments_scores[0], sentiments[0])

Sentiment(polarity=0.0, subjectivity=0.0) Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all


In [200]:
sample = df['text'].sample(n=10, random_state=1)
before = sample.apply(lambda x:vader_model.polarity_scores(x)['compound'])
after = sample.apply(lambda x:vader_model.polarity_scores(normalize(x))['compound'])
print(f'\nThe difference between them: {after-before}\n')
print(f'\nThe values before:\n {before} \n\nAND AFTER:\n\n {after}')


The difference between them: 3228    0.0659
3706    0.0107
6957    0.0000
2887   -0.1982
7464    0.0000
2539    0.3400
6837    0.0875
7386    0.0000
1506    0.0000
1875    0.0000
Name: text, dtype: float64


The values before:
 3228   -0.6908
3706   -0.5046
6957    0.0000
2887    0.0451
7464    0.0000
2539    0.0000
6837   -0.7783
7386    0.4019
1506   -0.4939
1875   -0.1531
Name: text, dtype: float64 

AND AFTER:

 3228   -0.6249
3706   -0.4939
6957    0.0000
2887   -0.1531
7464    0.0000
2539    0.3400
6837   -0.6908
7386    0.4019
1506   -0.4939
1875   -0.1531
Name: text, dtype: float64


In [202]:
df[df['id']==2887]

Unnamed: 0,id,keyword,location,text,target
2011,2887,damage,,@WonderousAllure crosses her arms to cover her...,0


In [50]:
def compound_score(text):
    x = run_vader(text)
    return x['compound']

In [54]:
df.head(10)

Unnamed: 0,id,keyword,location,text,target,hashtags,cleaned_text,compound_score
31,48,ablaze,Birmingham,bbcmtd wholesale markets ablaze,1,[],bbcmtd wholesale market ablaze,0.0
32,49,ablaze,Est. September 2012 - Bristol,we always try to bring the heavy. #metal #rt,0,"[#metal, #rt]",try bring heavy . # metal # rt,0.0
33,50,ablaze,AFRICA,#africanbaze breaking newsnigeria flag set abl...,1,[#africanbaze],# africanbaze break newsnigeria flag set ablaz...,0.0
34,52,ablaze,"Philadelphia, PA",crying out for more set me ablaze,0,[],cry set ablaze,-0.4767
35,53,ablaze,"London, UK",on plus side look at the sky last night it was...,0,[],plus look sky night ablaze,0.0
36,54,ablaze,Pretoria,phdsquares #mufc theyve built so much hype aro...,0,[#mufc],phdsquare # mufc ve build hype new acquisition...,-0.5023
37,55,ablaze,World Wide!!,inec office in abia set ablaze,1,[],inec office abia set ablaze,0.0
38,56,ablaze,,barbados #bridgetown jamaica ûò two cars set ...,1,[#bridgetown],barbado # bridgetown jamaica ûò car set ablaz...,0.0
39,57,ablaze,Paranaque City,ablaze for you lord d,0,[],ablaze lord d,0.0
40,59,ablaze,Live On Webcam,check these out #nsfw,0,[#nsfw],check # nsfw,0.0


In [None]:
# def average_adj_count(blog):
#     adj_counter = 0
#     stripped_blog = blog.translate(str.maketrans('', '', "!\"#$%&'()*+,-/:;<=>?@[\]^_`{|}~")) # all punct expect period are removed
#     splitted_blog = sent_tokenize(stripped_blog)
#     for sentence in splitted_blog:
#         sent = nlp(sentence)
#         for word in sent:
#             if word.pos_ == 'ADJ':
#                 adj_counter += 1
#     avg = adj_counter/len(splitted_blog)
#     return avg

In [20]:
# source function

# def clean_text_round1(text):
#     '''Remove text in square brackets, remove punctuation and remove words containing numbers.'''
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
#     text = re.sub('\w\d\w*', '', text)
#     return text

# #round1 = lambda x: clean_text_round1(x)
# def clean_text_round2(text):
#     '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
#     text = re.sub('[‘’“”…]', '', text)
#     text = re.sub('\n', '', text)
#     return text

#round2 = lambda x: clean_text_round2(x)

## Double checking cleaning/pre-processing

Here we check the content of our code after the data cleaning and pre-processing. 

Important is to check for class imbalance.

# Methods and Models

Here we describe which models we want to be applying.

## Splitting the data train/test

Mind that the current notebook only takes the training data into consideration. Therefore, applying sklearn train/test functions on the dataframe, kind of makes us capable of tweaking the model as if using a validation set.

# Evaluation

Here we test, compare, and discuss the models, and see if we need to return to previous points to improve our models.