In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

sns.set_style("darkgrid")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


# 1. Understand data 

In [3]:
combine = [train, test] # just comfortable python list for data cleaning in both df's. 

In [4]:
train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [5]:
test.columns

Index(['id', 'keyword', 'location', 'text'], dtype='object')

In [6]:
train.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [15]:
ex_sent = train.iloc[5]['text']
print(ex_sent)

#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires


In [70]:
# extract hash tags 
def is_alphanumerical(letter: str) -> bool:
    ans = False
    is_capital_letter = (65 <= ord(letter) and ord(letter) <= 90)
    is_minor_letter   = (97 <= ord(letter) and ord(letter) <= 122)
    is_digit          = (48 <= ord(letter) and ord(letter) <= 57)
    
    if is_capital_letter or is_minor_letter or is_digit:
        ans = True 
    
    return ans

def extract_hash_tags(data) -> set:
    hashtags = {}
    for index, row in data.iterrows(): 
        sentence = row['text']
        word = ''
        flag = False 
        for c in sentence:
            if flag and is_alphanumerical(c): 
                word = word + c
                
            if flag and not(is_alphanumerical(c)):
                word = word.lower()
                if word not in hashtags:
                    hashtags[word] = 1
                else:
                    hashtags[word] += 1
                
                word = ''
                flag = False
            
            if c == '#':
                flag = True
                continue  
        if word != '':
            if word not in hashtags:
                hashtags[word] = 1
            else:
                hashtags[word] += 1
    return hashtags

hashtags = extract_hash_tags(train)
print("the number of hashtags is : ", len(hashtags))
print(hashtags)

the number of hashtags is :  2007
{'earthquake': 14, 'wildfires': 5, 'alaska': 2, 'rockyfire': 4, 'cafire': 1, 'flood': 4, 'disaster': 8, 'flooding': 4, 'raining': 1, 'florida': 2, 'tampabay': 1, 'tampa': 2, 'we': 2, 'breaking': 9, 'metal': 1, 'rt': 4, 'africanbaze': 1, 'mufc': 2, 'bridgetown': 1, 'nsfw': 4, 'kurds': 1, 'diyala': 1, 'california': 5, 'climate': 2, 'energy': 1, 'nowplaying': 22, 'EDM': 1, 'nashvilletraffic': 1, 'santaclara': 1, 'bayarea': 1, 'traffic': 3, 'personalinjury': 1, 'solicitor': 1, 'OtleyHour': 1, 'stlouis': 1, 'caraccidentlawyer': 1, 'truckcrash': 1, 'fortworth': 1, 'ashville': 1, 'manchester': 4, 'hagerstown': 1, 'WHAG': 1, 'bahrain': 1, 'ArrestPastorNganga': 1, 'dubstep': 7, 'trapmusic': 7, 'dnb': 8, 'edm': 7, 'dance': 7, 'ices': 7, 'growingupspoiled': 1, 'wisdomwed': 1, 'lifehacks': 1, 'silverwood': 1, 'aftershock': 1, '': 71, 'book': 1, 'now': 1, 'wdyouth': 1, 'biblestudy': 1, 'news': 66, 'horrible': 1, 'accident': 4, 'watchthevideo': 1, 'kca': 4, 'votejkt

In [45]:
# analyse if digits are important for our analysis or not 
keys_with_digits = []
for key, value in hashtags.items():
    have_digit = False; 
    for c in key: 
        have_digit = have_digit or (48 <= ord(c) and ord(c) <= 57)
    
    if have_digit:
        keys_with_digits.append(key)
        
keys_with_digits[:100] # as we can observe, digits azre used for model numbering, date/time of event or even as typo - information that is not relevant for our classification

['votejkt48id',
 'g90',
 'fant4stic',
 'gilbert23',
 '1008pla',
 '1008planet',
 '2a',
 '5sosfam',
 '7newsadl',
 '171',
 '124',
 'du19',
 'ww1',
 'coast2coastdjs',
 'elxn42',
 'tweet4taiji',
 '9973',
 'moving2k15',
 'setting4success',
 'tweetlikeitsseptember11th2001',
 '2015',
 'homealone2',
 'usar2015',
 'usar15',
 '6',
 'roh3',
 'alrasyid448iturasya',
 'wattys2015',
 'ihave44episodesofgg',
 'hwy401',
 'windows10',
 '0215',
 'ny35',
 'vra50',
 'mh370',
 'b2b',
 'b2bagency',
 'asae15',
 'media420',
 'trump2016',
 '4',
 '1',
 'mumbairiot92',
 'ashes2015',
 'msgdoing111welfareworks',
 'allthekidneybeansandsorbet4misha',
 '3novices',
 'bb17',
 'borderlands2',
 '37592',
 '911',
 'nbc15',
 '039',
 'fifa16',
 'koin6news',
 'sr14',
 'liveonk2',
 '17',
 '38745',
 '16',
 'tw4rw',
 '9',
 '3',
 '999day',
 'floored4',
 '452',
 'round2',
 'wholeteam3',
 'offers2go',
 'pp15000266858',
 '263chat',
 'tedcruz2016',
 '8392',
 'nj36',
 '365disasters',
 '034',
 '629',
 '615',
 '7',
 '4playthursdays',
 'hir

In [48]:
# delete digits from hashtags 
important_hashtags = {}

for key, value in hashtags.items():
    word = ''
    for c in key: 
        is_digit = (48 <= ord(c) and ord(c) <= 57)
        if is_digit:
            continue
        word = word + c 
    
    if word == '': 
        continue 
    
    if word not in important_hashtags:
        important_hashtags[word] = 1
    else:
        important_hashtags[word] += 1
print("the size of our new hashtags is : ", len(important_hashtags))

the size of our new hashtags is :  1509


In [61]:
hashtags = important_hashtags 
print(important_hashtags)

{'earthquake': 1, 'wildfires': 1, 'alaska': 1, 'rockyfire': 1, 'cafire': 1, 'flood': 1, 'disaster': 1, 'raining': 1, 'flooding': 1, 'florida': 1, 'tampabay': 1, 'tampa': 1, 'we': 1, 'breaking': 3, 'metal': 1, 'rt': 1, 'africanbaze': 1, 'mufc': 1, 'bridgetown': 1, 'kurds': 1, 'diyala': 1, 'california': 1, 'climate': 1, 'energy': 1, 'nowplaying': 1, 'nashvilletraffic': 1, 'santaclara': 1, 'bayarea': 1, 'traffic': 1, 'personalinjury': 1, 'solicitor': 1, 'stlouis': 1, 'caraccidentlawyer': 1, 'truckcrash': 1, 'fortworth': 1, 'ashville': 1, 'manchester': 1, 'hagerstown': 1, 'bahrain': 1, 'dubstep': 1, 'trapmusic': 1, 'dnb': 1, 'edm': 1, 'dance': 1, 'ices': 1, 'growingupspoiled': 1, 'wisdomwed': 1, 'lifehacks': 1, 'silverwood': 1, 'book': 1, 'now': 1, 'wdyouth': 1, 'biblestudy': 1, 'horrible': 1, 'accident': 1, 'watchthevideo': 1, 'kca': 1, 'votejktid': 1, 'rip': 1, 'binladen': 1, 'man': 1, 'airport': 1, 'airplane': 1, 'aircraft': 1, 'aeroplane': 1, 'runway': 1, 'freaky': 1, 'crash': 1, 'pilo

In [59]:
sorted_hashtags = sorted(hashtags.items(), key = lambda x : x[1], reverse = True)
temp            = dict(sorted_hashtags)
hashtags        = temp
del temp 
print("this is the list of keys present in our dictionary: \n", hashtags)
print("\n\n\n this is the size of our hashtags: ", len(hashtags))


this is the list of keys present in our dictionary: 
 {'breaking': 3, 'g': 2, 'ww': 2, 'hiroshima': 2, 'ashes': 2, 'ny': 2, 'gbbo': 2, 'usar': 2, 'bb': 2, 'trump': 2, 'borderlands': 2, 'pp': 2, 'nj': 2, 'earthquake': 1, 'wildfires': 1, 'alaska': 1, 'rockyfire': 1, 'cafire': 1, 'flood': 1, 'disaster': 1, 'raining': 1, 'flooding': 1, 'florida': 1, 'tampabay': 1, 'tampa': 1, 'we': 1, 'metal': 1, 'rt': 1, 'africanbaze': 1, 'mufc': 1, 'bridgetown': 1, 'kurds': 1, 'diyala': 1, 'california': 1, 'climate': 1, 'energy': 1, 'nowplaying': 1, 'nashvilletraffic': 1, 'santaclara': 1, 'bayarea': 1, 'traffic': 1, 'personalinjury': 1, 'solicitor': 1, 'stlouis': 1, 'caraccidentlawyer': 1, 'truckcrash': 1, 'fortworth': 1, 'ashville': 1, 'manchester': 1, 'hagerstown': 1, 'bahrain': 1, 'dubstep': 1, 'trapmusic': 1, 'dnb': 1, 'edm': 1, 'dance': 1, 'ices': 1, 'growingupspoiled': 1, 'wisdomwed': 1, 'lifehacks': 1, 'silverwood': 1, 'book': 1, 'now': 1, 'wdyouth': 1, 'biblestudy': 1, 'horrible': 1, 'accident': 