In [391]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

sns.set_style("darkgrid")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


# 1. Understand data 

In [392]:
train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [393]:
test.columns

Index(['id', 'keyword', 'location', 'text'], dtype='object')

In [394]:
train.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [395]:
ex_sent3    = train['text'].iloc[3]
ex_sent5    = train['text'].iloc[5]
ex_sent50   = train['text'].iloc[50]
ex_sent5226 = train['text'].iloc[5226] 
print(ex_sent3)
print(ex_sent5)
print(ex_sent50)
print(ex_sent5226)

13,000 people receive #wildfires evacuation orders in California 
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
Deputies: Man shot before Brighton home set ablaze http://t.co/gWNRhMSO8k
@Eganator2000 There aren't many Obliteration servers but I always like to play when there are :D


In [396]:
# extract hash tags 
def is_alphanumerical(letter: str) -> bool:
    ans = False
    is_capital_letter = (65 <= ord(letter) and ord(letter) <= 90)
    is_minor_letter   = (97 <= ord(letter) and ord(letter) <= 122)
    is_digit          = (48 <= ord(letter) and ord(letter) <= 57)
    
    if is_capital_letter or is_minor_letter or is_digit:
        ans = True 
    
    return ans

def is_alphabetical(letter: str) -> bool: 
    return ('a' <= letter and letter <= 'z')

def extract_hash_tags(data) -> dict:
    hashtags = {}
    for index, row in data.iterrows(): 
        sentence = row['text']
        word = ''
        flag = False 
        for c in sentence:
            if flag and is_alphanumerical(c): 
                word = word + c
                
            if flag and not(is_alphanumerical(c)):
                word = word.lower()
                if word not in hashtags:
                    hashtags[word] = 1
                else:
                    hashtags[word] += 1
                
                word = ''
                flag = False
            
            if c == '#':
                flag = True
                continue  
        
        if word != '':
            word = word.lower()
            if word not in hashtags:
                hashtags[word] = 1
            else:
                hashtags[word] += 1
    return hashtags

hashtags = extract_hash_tags(train)
print("the number of hashtags is : ", len(hashtags))
print(hashtags)

the number of hashtags is :  1924
{'earthquake': 19, 'wildfires': 5, 'alaska': 2, 'rockyfire': 4, 'cafire': 2, 'flood': 4, 'disaster': 8, 'flooding': 4, 'raining': 1, 'florida': 2, 'tampabay': 1, 'tampa': 2, 'we': 2, 'breaking': 9, 'metal': 1, 'rt': 10, 'africanbaze': 1, 'mufc': 2, 'bridgetown': 1, 'nsfw': 4, 'kurds': 1, 'diyala': 1, 'california': 5, 'climate': 2, 'energy': 1, 'nowplaying': 23, 'edm': 8, 'nashvilletraffic': 1, 'santaclara': 1, 'bayarea': 1, 'traffic': 3, 'personalinjury': 1, 'solicitor': 1, 'otleyhour': 1, 'stlouis': 1, 'caraccidentlawyer': 1, 'truckcrash': 1, 'fortworth': 1, 'ashville': 1, 'manchester': 4, 'hagerstown': 1, 'whag': 1, 'bahrain': 1, 'arrestpastornganga': 2, 'dubstep': 7, 'trapmusic': 7, 'dnb': 8, 'dance': 7, 'ices': 7, 'growingupspoiled': 1, 'wisdomwed': 1, 'lifehacks': 1, 'silverwood': 1, 'aftershock': 1, '': 71, 'book': 1, 'now': 1, 'wdyouth': 1, 'biblestudy': 1, 'news': 76, 'horrible': 1, 'accident': 4, 'watchthevideo': 1, 'kca': 4, 'votejkt48id': 4,

In [397]:
# analyse if digits are important for our analysis or not 
keys_with_digits = []
for key, value in hashtags.items():
    have_digit = False; 
    for c in key: 
        have_digit = have_digit or (48 <= ord(c) and ord(c) <= 57)
    
    if have_digit:
        keys_with_digits.append(key)
        
keys_with_digits[:100] # as we can observe, digits azre used for model numbering, date/time of event or even as typo - information that is not relevant for our classification

['votejkt48id',
 'g90',
 'fant4stic',
 'gilbert23',
 '1008pla',
 '1008planet',
 '2a',
 '5sosfam',
 '2minutemix',
 '7newsadl',
 '171',
 '124',
 'du19',
 'ww1',
 'coast2coastdjs',
 '2fast2furious',
 'wbc2015',
 'elxn42',
 'tweet4taiji',
 'bb17',
 '9973',
 'moving2k15',
 'norge2040',
 'setting4success',
 '9newsmornings',
 'tweetlikeitsseptember11th2001',
 'nh1news',
 '2015',
 'homealone2',
 'film4',
 'usar2015',
 'usar15',
 '6',
 'roh3',
 'alrasyid448iturasya',
 'roh3smantibatam',
 'isea2015',
 'wattys2015',
 'ihave44episodesofgg',
 'hwy401',
 'windows10',
 'bc19',
 '0215',
 'ny35',
 'icd10',
 'vra50',
 'mh370',
 'b2b',
 'b2bagency',
 'asae15',
 'media420',
 'trump2016',
 '4',
 'ashes2ashes',
 '1',
 'mumbairiot92',
 'ashes2015',
 'msgdoing111welfareworks',
 'allthekidneybeansandsorbet4misha',
 'ufo4ublogeurope',
 'summer2k15',
 '3novices',
 'borderlands2',
 '37592',
 '911',
 'nbc15',
 '039',
 'fifa16',
 'liveonk2',
 'koin6news',
 'sr14',
 '17',
 '38745',
 '16',
 'tw4rw',
 '9',
 'hiroshima

In [398]:
# delete digits from hashtags 
important_hashtags = {}
counter = 0 
for key, value in hashtags.items():
    word = ''
    for c in key: 
        is_digit = (48 <= ord(c) and ord(c) <= 57)
        if is_digit:
            continue
        word = word + c 
    if (counter % 25 == 0): print(f"the word = {word} has freq of {value}")
    if word == '': 
        continue 
    
    if word not in important_hashtags:
        important_hashtags[word] = value
    else:
        important_hashtags[word] += value
    counter += 1
print("the size of our new hashtags is : ", len(important_hashtags))

the word = earthquake has freq of 19
the word = nowplaying has freq of 23
the word = wisdomwed has freq of 1
the word = pilot has freq of 1
the word = az has freq of 1
the word = startrek has freq of 1
the word = beyhive has freq of 8
the word = newsadl has freq of 1
the word = hockey has freq of 1
the word = mobile has freq of 1
the word = artisteoftheweekfact has freq of 1
the word = stopharper has freq of 1
the word = thisdayinhistory has freq of 1
the word = dagens has freq of 1
the word = ashes has freq of 2
the word = history has freq of 1
the word = pbs has freq of 1
the word = borrowers has freq of 1
the word = catastrophic has freq of 1
the word = nieuws has freq of 1
the word = teenfiction has freq of 1
the word = gadget has freq of 1
the word = wine has freq of 1
the word = fitness has freq of 2
the word = st has freq of 1
the word = nghlth has freq of 1
the word = dumuzid has freq of 1
the word = telangana has freq of 1
the word = sms has freq of 1
the word = chicagoscanner

In [399]:
hashtags = important_hashtags 
print(important_hashtags)

{'earthquake': 19, 'wildfires': 5, 'alaska': 2, 'rockyfire': 4, 'cafire': 2, 'flood': 4, 'disaster': 8, 'flooding': 4, 'raining': 1, 'florida': 2, 'tampabay': 1, 'tampa': 2, 'we': 2, 'breaking': 11, 'metal': 1, 'rt': 10, 'africanbaze': 1, 'mufc': 2, 'bridgetown': 1, 'nsfw': 4, 'kurds': 1, 'diyala': 1, 'california': 5, 'climate': 2, 'energy': 1, 'nowplaying': 23, 'edm': 8, 'nashvilletraffic': 1, 'santaclara': 1, 'bayarea': 1, 'traffic': 3, 'personalinjury': 1, 'solicitor': 1, 'otleyhour': 1, 'stlouis': 1, 'caraccidentlawyer': 1, 'truckcrash': 1, 'fortworth': 1, 'ashville': 1, 'manchester': 4, 'hagerstown': 1, 'whag': 1, 'bahrain': 1, 'arrestpastornganga': 2, 'dubstep': 7, 'trapmusic': 7, 'dnb': 8, 'dance': 7, 'ices': 7, 'growingupspoiled': 1, 'wisdomwed': 1, 'lifehacks': 1, 'silverwood': 1, 'aftershock': 1, 'book': 1, 'now': 1, 'wdyouth': 1, 'biblestudy': 1, 'news': 76, 'horrible': 1, 'accident': 4, 'watchthevideo': 1, 'kca': 4, 'votejktid': 4, 'rip': 2, 'binladen': 1, 'mlb': 3, 'man': 

In [400]:
sorted_hashtags = sorted(hashtags.items(), key = lambda x : x[1], reverse = True)
temp            = dict(sorted_hashtags)
hashtags        = temp
del temp 
print("this is the list of keys present in our dictionary: \n", hashtags)
print("\n\n\n this is the size of our hashtags: ", len(hashtags))


this is the list of keys present in our dictionary: 
 {'news': 76, 'hot': 31, 'prebreak': 30, 'best': 30, 'hiroshima': 24, 'nowplaying': 23, 'islam': 23, 'earthquake': 19, 'gbbo': 19, 'jobs': 14, 'job': 12, 'isis': 12, 'breaking': 11, 'world': 11, 'japan': 11, 'hiring': 11, 'terrorism': 11, 'rt': 10, 'india': 10, 'bbc': 10, 'sismo': 10, 'yyc': 10, 'worldnews': 9, 'directioners': 9, 'irandeal': 9, 'fashion': 9, 'emmerdale': 9, 'cnn': 9, 'abstorm': 9, 'fukushima': 9, 'nuclear': 9, 'disaster': 8, 'edm': 8, 'dnb': 8, 'beyhive': 8, 'tcot': 8, 'handbag': 8, 'wildfire': 8, 'seattle': 8, 'genocide': 8, 'nursing': 8, 'dubstep': 7, 'trapmusic': 7, 'dance': 7, 'ices': 7, 'us': 7, 'mtvhottest': 7, 'business': 7, 'mh': 7, 'wx': 7, 'okwx': 7, 'hailstorm': 7, 'animalrescue': 7, 'truth': 7, 'god': 7, 'quran': 7, 'lies': 7, 'armageddon': 6, 'kindle': 6, 'bb': 6, 'womens': 6, 'usa': 6, 'tech': 6, 'course': 6, 'art': 6, 'photography': 6, 'rohingya': 6, 'wmata': 6, 'tubestrike': 6, 'soundcloud': 6, 'milit

I thought if I would have my hashtags - I could fed top 10% by freq in order to warn my neural net to keep an eye on tweets that contain them in order to ameliorate the prediciton quality.

This is time consuming and I'm not sure if this is a good idea -- so I will just try to write a random NN and then will look what can be improved an how/

In [401]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [402]:
train['keyword'].head(100)

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
        ...   
95    accident
96    accident
97    accident
98    accident
99    accident
Name: keyword, Length: 100, dtype: object

In [403]:
train = train.drop(columns = ['location'])
test  = test.drop(columns = ['location'])

In [404]:
train.head()

Unnamed: 0,id,keyword,text,target
0,1,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,Forest fire near La Ronge Sask. Canada,1
2,5,,All residents asked to 'shelter in place' are ...,1
3,6,,"13,000 people receive #wildfires evacuation or...",1
4,7,,Just got sent this photo from Ruby #Alaska as ...,1


In [405]:
test.head()

Unnamed: 0,id,keyword,text
0,0,,Just happened a terrible car crash
1,2,,"Heard about #earthquake is different cities, s..."
2,3,,"there is a forest fire at spot pond, geese are..."
3,9,,Apocalypse lighting. #Spokane #wildfires
4,11,,Typhoon Soudelor kills 28 in China and Taiwan


In [406]:
# 

## 2. Clean data and prepare it for modelling 

list of things to do:
- [x] remove capitalisation 
- [x] remove punctuation and other symbols (except for '#') 
- [x] remove digits from our hashtags 
- [ ] apply stemming 
- [ ] replace NaN in keywords (maybe using hashtags present in tweets?)
- [x] remove URLs 
- [ ] expand contractions 

In [407]:
# remove capitalisation 
for dataset in [train, test]:
    dataset['text'] = dataset['text'].apply(lambda x : x.lower())


In [408]:
# generated by chatgpt because I'm lazy, really and this tasks is just pain in the ass
import re

def remove_urls(text):
    # Regex pattern to match URLs starting with www. or https://
    pattern = r"(www\.|https?://)[^\s/$.?#].[^\s]*"
    
    # Removing URLs from the text
    cleaned_text = re.sub(pattern, "", text)
    return cleaned_text

# Example usage
text  = "Check out my Facebook profile: www.facebook.com/johndoe and don't forget to entire on https://website.com"
text2 = "my wife has site called http://lasta.com"
cleaned_text  = remove_urls(text)
cleaned_text2 = remove_urls(text2)
print(cleaned_text)
print(cleaned_text2)

Check out my Facebook profile:  and don't forget to entire on 
my wife has site called 


In [409]:
# remove actual links 
for dataset in [train, test]:
    dataset['text'] = dataset['text'].apply(remove_urls)

In [410]:
# remove special symbols (this function takes pandas.Series() and returns another pandas.Series())
def remove_special_symbols(data): 
    for index, text in data.items():
        sentence = ''
        for c in text:
            idx = ord(c) # ASCII index of our character 
            is_symbol = (33 <= idx and idx <= 64) or (91 <= idx and idx <= 96) or (123 <= idx and idx <= 126)
            if c != '#' and is_symbol: 
                continue
            if c == '#' or is_alphabetical(c) or c == ' ':
                sentence = sentence + c
        
        data.iloc[index] = sentence 
    
    return data
for dataset in [train, test]:
    dataset['text'] = remove_special_symbols(dataset['text'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.iloc[index] = sentence


In [412]:
print("sentence ex = ", train.iloc[3]['text'])
print("sentence ex = ", train.iloc[5]['text'])
print("sentence ex = ", train.iloc[50]['text'])
print("sentence ex = ", train.iloc[5226]['text'])

sentence ex =   people receive #wildfires evacuation orders in california 
sentence ex =  #rockyfire update  california hwy  closed in both directions due to lake county fire  #cafire #wildfires
sentence ex =  deputies man shot before brighton home set ablaze 
sentence ex =  eganator there arent many obliteration servers but i always like to play when there are d
