##Install Pre req from Kaggle

Download the required datasets from the Kaggle repository.

In [1]:
!pip install kaggle



In [2]:
!mkdir ~/.kaggle

In [3]:
cd /root/.kaggle

/root/.kaggle


In [4]:
!pwd
!ls

/root/.kaggle


Upload the json file for Kaggle authorization

In [None]:
from google.colab import files
files.upload()

In [6]:
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
cd /content

/content


In [8]:
!kaggle competitions download -c nlp-getting-started

Downloading sample_submission.csv to /content
  0% 0.00/22.2k [00:00<?, ?B/s]
100% 22.2k/22.2k [00:00<00:00, 8.53MB/s]
Downloading train.csv to /content
  0% 0.00/965k [00:00<?, ?B/s]
100% 965k/965k [00:00<00:00, 62.9MB/s]
Downloading test.csv to /content
  0% 0.00/411k [00:00<?, ?B/s]
100% 411k/411k [00:00<00:00, 133MB/s]


#Import required libs

In [9]:
import numpy as np
import pandas as pd


import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer


import json
import os


pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

#Loading the data

In [10]:
Tweets = pd.read_csv("/content/train.csv", encoding="utf-8")

#Helper functions

Some helper functions for preprocessing the tweets are defined here

Load the dictionaries from the GitHub repositiory to process contracted words and words with repeated characters.

**Contracted words**




Eg  I've  = I have

**Repeated characters words**




Eg  goooaaalll  = goal

coooool     = cool
    

In [11]:
with open('contractions_dict.json', 'r') as fp:
    contractions_dict = json.load(fp)
with open('repetitions_dict.json', 'r') as fp:
    repetitions_dict = json.load(fp)
print('Contracted Words = ', len(contractions_dict), 'Repetition Letter Words = ',len(repetitions_dict))

Contracted Words =  107 Repetition Letter Words =  13


Helper functions for basic text processing, tweet processing, extracting features and tokenization

In [20]:
nltk.download('wordnet')

# To extract URLs as a seperate feature
def ExtractURL(text):
  match = re.findall('https?://\S+|www\.\S+', text)
  if match == []:
    url = np.nan
  else:
    url = match
  return url

# To extract Hashtags as a seperate feature
def Extracthashtag(text):
  match = re.findall('#\S+', text)
  if match == []:
    hashtag = np.nan
  else:
    hashtag = match
    for idx, i in enumerate(hashtag):
      hashtag[idx] = re.sub('[%s]' % re.escape(string.punctuation), '', i)
  return hashtag

# To extract Mentioned Names as a seperate feature
def ExtractMentioned(text):
  match = re.findall('@\S+', text)
  if match == []:
    mentioned = np.nan
  else:
    mentioned = match
    for idx, i in enumerate(mentioned):
      mentioned[idx] = re.sub('[@]' , '', i)
  return mentioned

# To perfrom tweets specific text processing
def FixCommonTweetErrors(text, remove_hashtag, remove_mentioned):
  # To expand contracted words
  def expand_contractions(text):
    for keys,vals in contractions_dict.items():
      text = re.sub(keys, vals, text)
    return text
  # To contract words with repeated letters
  def contract_repetitions(text):
    for keys,vals in repetitions_dict.items():
      text = re.sub(keys, vals, text)
    return text
  # To remove hashtaged words from text if necessary
  if remove_hashtag:
    text = re.sub('#\S+', '', text)
  if remove_mentioned:
    text = re.sub('@\S+', '', text)


  text = contract_repetitions(text)
  text = expand_contractions(text)
  return text   
    
     
def BasicTextProcessing(text, remove_url, lower_text_case,
                        remove_nos, remove_punctutation,
                        ):
  # Removing line breaks, html tags, certain unicode symbols
  # To remove url, nos and punctutation and convert to lower case if necessary
  if remove_url:
    text = re.sub('https?://\S+|www\.\S+', '', text)

  if lower_text_case:
    text = text.lower()
  text = re.sub('\\n', '', text)
  text = re.sub('ûª', "'", text)
  text = re.sub('&amp;', 'and', text)
  text = re.sub('&lt;', '', text)
  text = re.sub('&gt;', '', text)
  if remove_nos:
    text = re.sub('\d+', '', text)
  
    
  return text

def RemovePunctutation(text):
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
  return text

#   text = re.sub('\[.*?\]', '', text)
#   text = re.sub('<.*?>+', '', text)
#   text = re.sub('\w*\d\w*', '', text)
  
tokenizer=nltk.tokenize.RegexpTokenizer(r'\w+')

# Downloading Stopwords

def LoadStopwords():
  nltk.download('stopwords')
  stopwords.words('english')
  # len(stopwords.words('english'))

# To remove stopwords if necessary
def RemoveStopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words 

# To combine tokenized words if necessary
def CombineText(list_of_text):
    combined_text = ' '.join(list_of_text)
    return combined_text

def StemWords(text, stemmer):
  text = " ".join([stemmer.stem(word) for word in text.split()])
  return text

def LemmatizeWords(text,lemmatizer):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


#Pipeline

Pipleline to perfrom all preprocessing functions.

Non - required for preprocessing can be made false

**Default values**

remove_url=True, extract_url=False, remove_punctutation=True, extract_hashtag=False, 
                  lower_text_case=True, remove_nos=True, fix_common_tweet_errors=False,
                  remove_hashtag=False, remove_stopwords=False, combine_text=False,
                  extract_mentioned = True, remove_mentioned=False, stem_words=False,
                  lemmatize_words=False

In [21]:
def CleanTextData(Data, remove_url=True, extract_url=False, remove_punctutation=True, extract_hashtag=False, 
                  lower_text_case=True, remove_nos=True, fix_common_tweet_errors=False,
                  remove_hashtag=False, remove_stopwords=False, combine_text=False,
                  extract_mentioned = True, remove_mentioned=False, stem_words=False,
                  lemmatize_words=False
                  ):
  Data = Data.assign(text_pre_pro = lambda x: (x['text'].apply(lambda x:BasicTextProcessing(x, remove_url, lower_text_case,
                        remove_nos, remove_punctutation))))

  if fix_common_tweet_errors:
    Data['text_pre_pro'] = Data['text_pre_pro'].apply(lambda x:FixCommonTweetErrors(x, remove_hashtag, remove_mentioned)) 

  if extract_hashtag:
    Data = Data.assign(hashtag=lambda x: (x['text'].apply(lambda x:Extracthashtag(x))))

  if extract_url:
    Data = Data.assign(url=lambda x: (x['text'].apply(lambda x:ExtractURL(x))))

  if extract_mentioned:
    Data = Data.assign(mentioned=lambda x: (x['text'].apply(lambda x:ExtractMentioned(x))))

  if remove_punctutation:
    Data['text_pre_pro'] = Data['text_pre_pro'].apply(lambda x:RemovePunctutation(x))
 
  if remove_stopwords:
    LoadStopwords()
    Data['text_pre_pro'] = Data['text_pre_pro'].apply(lambda x:RemoveStopwords(x))

  if combine_text:
    Data['tokenized_processed'] = Data['text_pre_pro'].apply(lambda x:CombineText(x))

  if stem_words:
    stemmer = PorterStemmer()
    Data['text_pre_pro'] = Data['text_pre_pro'].apply(lambda x:StemWords(x, stemmer))

  if lemmatize_words:    
    lemmatizer = WordNetLemmatizer()  
    Data['text_pre_pro'] = Data['text_pre_pro'].apply(lambda x:LemmatizeWords(x, lemmatizer))
  Data = Data.assign(tokenized_processed=lambda x: (x['text_pre_pro'].apply(lambda x:tokenizer.tokenize(x))))
 

  return Data

#Processing the data

In [22]:
Tweets_PreProcessed = CleanTextData(Tweets, extract_hashtag=True, 
                                        fix_common_tweet_errors=True,
                                        extract_url = True,
                                        remove_mentioned=True,
                                        stem_words=False,
                                        lemmatize_words=False
                                        )

# Some ids of tweets which can be used to check if the prrocssing had the inteded
# effect on the tweets
# listofid = [28, 109, 158, 220, 232, 1722]

# for i in listofid:
#   print(Tweets_PreProcessed.loc[Tweets_PreProcessed['id'] == i, ['text', 'text_pre_pro', 'mentioned']])

Tweets_PreProcessed.head(4)

Unnamed: 0,id,keyword,location,text,target,text_pre_pro,hashtag,url,mentioned,tokenized_processed
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,our deed are the reason of this earthquake may allah forgive u all,[earthquake],,,"[our, deed, are, the, reason, of, this, earthquake, may, allah, forgive, u, all]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,,,,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,all resident asked to ishelter in place are being notified by officer no other evacuation or shelter in place order are expected,,,,"[all, resident, asked, to, ishelter, in, place, are, being, notified, by, officer, no, other, evacuation, or, shelter, in, place, order, are, expected]"
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,people receive wildfire evacuation order in california,[wildfires],,,"[people, receive, wildfire, evacuation, order, in, california]"


In [28]:
Tweets_PreProcessed['mentioned'][Tweets_PreProcessed['mentioned'].notna()]

31                                                                                               [bbcmtd]
36                                                                                           [PhDSquares]
43                                                                                       [southridgelife]
54                                                                                      [Alexis_Sanchez:]
57                                                                                                  [Û_]
63                                                                                  [Navista7, News24680]
65                                                                                       [nxwestmidlands]
66                                                                                               [ablaze]
75                                                                                        [SleepJunkies:]
92                                            

In [None]:
Tweets_PreProcessed.to_csv('train_processed.csv')

Reset the variables if necessary

In [None]:
# %reset

#Sub Program to write new values to dicts

In [None]:
contractions_dict.items()

dict_items([("ain't", 'are not'), ("'s", ' is'), ("aren't", 'are not'), ("can't", 'cannot'), ("can't've", 'cannot have'), ("'cause", 'because'), ("could've", 'could have'), ("couldn't", 'could not'), ("couldn't've", 'could not have'), ("didn't", 'did not'), ("doesn't", 'does not'), ("don't", 'do not'), ("hadn't", 'had not'), ("hadn't've", 'had not have'), ("hasn't", 'has not'), ("haven't", 'have not'), ("he'd", 'he would'), ("he'd've", 'he would have'), ("he'll", 'he will'), ("he'll've", 'he will have'), ("how'd", 'how did'), ("how'd'y", 'how do you'), ("how'll", 'how will'), ("i'd", 'i would'), ("i'd've", 'i would have'), ("i'll", 'i will'), ("i'll've", 'i will have'), ("i'm", 'i am'), ("i've", 'i have'), ("isn't", 'is not'), ("it'd", 'it would'), ("it'd've", 'it would have'), ("it'll", 'it will'), ("it'll've", 'it will have'), ("let's", 'let us'), ("ma'am", 'madam'), ("mayn't", 'may not'), ("might've", 'might have'), ("mightn't", 'might not'), ("mightn't've", 'might not have'), ("mus

Add any new values to the dicts

In [None]:
contractions_dict['CONTRACTED WORD'] = 'EXPANDED WORD'
repetitions_dict['REPEATED LETTER WORD'] = 'CONTRACTED WORD'

Save the dict and use it for further preprocessing

In [None]:
with open('contractions_dict.json', 'w') as fp:
    json.dump(contractions_dict, fp)
with open('repetitions_dict.json', 'w') as fp:
    json.dump(repetitions_dict, fp)

#Sub Program to find list of words with repeated leterrs (Eg: gooooaaalll)

Check the number of words with repeated letters in your dataset and add any new words to the dict if necessary 

In [None]:
word_corpus = []
j = 0
for tweet in Tweets_PreProcessed['tokenized_processed']:
  for word in tweet:
    if word in word_corpus:
      j = j +1
    else:
      word_corpus.append(word)
print(len(word_corpus))

17118


In [None]:
Consec_2_letter_words = []
for word in word_corpus:
  for idx, char in enumerate(word):
    if idx + 1 < len(word):
      if word[idx] == word[idx + 1]:
        Consec_2_letter_words.append(word)
print(len(Consec_2_letter_words))
print(Consec_2_letter_words)

4160


In [None]:
Consec_3_letter_words = []
for word in word_corpus:
  for idx, char in enumerate(word):
    if idx + 2 < len(word):
      if word[idx] == word[idx + 1] == word[idx + 2]:
        Consec_3_letter_words.append(word)
print(len(Consec_3_letter_words))
print(Consec_3_letter_words)

196
['soooo', 'soooo', 'awwww', 'awwww', 'mhmmm', 'alexshipppp', 'alexshipppp', 'avysss', 'aiii', 'baaaack', 'baaaack', 'iii', 'xdojjjj', 'xdojjjj', 'omgbethersss', 'xxx', 'grrrr', 'grrrr', 'wwwbigbaldhead', 'zzzz', 'zzzz', 'lmfaoooo', 'lmfaoooo', 'deeeznvtzzz', 'deeeznvtzzz', 'oooureli', 'mmm', 'sniiiiiiff', 'sniiiiiiff', 'sniiiiiiff', 'sniiiiiiff', 'uhhhhh', 'uhhhhh', 'uhhhhh', 'ieee', 'rokiieee', 'loveyouuuu', 'loveyouuuu', 'sooo', 'michelleellle', 'emaaalay', 'crosssectarian', 'naaa', 'kwaaaaadead', 'kwaaaaadead', 'kwaaaaadead', 'ssshhheeesshh', 'ssshhheeesshh', 'ssshhheeesshh', 'pusssssssssy', 'pusssssssssy', 'pusssssssssy', 'pusssssssssy', 'pusssssssssy', 'pusssssssssy', 'pusssssssssy', 'shoook', 'frackfreeeu', 'stiiilo', 'ahhhhh', 'ahhhhh', 'ahhhhh', 'alllivesmatter', 'ohyayyyyay', 'ohyayyyyay', 'ayyy', 'aaaaaaallll', 'aaaaaaallll', 'aaaaaaallll', 'aaaaaaallll', 'aaaaaaallll', 'aaaaaaallll', 'aaaaaaallll', 'ssssnell', 'ssssnell', 'www', 'iiii', 'iiii', 'riveeeeeer', 'riveeeeeer'

In [None]:
Consec_4_letter_words = []
for word in word_corpus:
  for idx, char in enumerate(word):
    if idx + 3 < len(word):
      if word[idx] == word[idx + 1] == word[idx + 2] == word[idx + 3]:
        Consec_4_letter_words.append(word)
print(len(Consec_4_letter_words))
print(Consec_4_letter_words)

84
['soooo', 'awwww', 'alexshipppp', 'baaaack', 'xdojjjj', 'grrrr', 'zzzz', 'lmfaoooo', 'sniiiiiiff', 'sniiiiiiff', 'sniiiiiiff', 'uhhhhh', 'uhhhhh', 'loveyouuuu', 'kwaaaaadead', 'kwaaaaadead', 'pusssssssssy', 'pusssssssssy', 'pusssssssssy', 'pusssssssssy', 'pusssssssssy', 'pusssssssssy', 'ahhhhh', 'ahhhhh', 'ohyayyyyay', 'aaaaaaallll', 'aaaaaaallll', 'aaaaaaallll', 'aaaaaaallll', 'aaaaaaallll', 'ssssnell', 'iiii', 'riveeeeeer', 'riveeeeeer', 'riveeeeeer', 'milioooo', 'roomsgrrrr', 'alllll', 'alllll', 'nowwwwww', 'nowwwwww', 'nowwwwww', 'wompppp', 'selmoooooo', 'selmoooooo', 'selmoooooo', 'errrr', 'damnnnn', 'aannnnd', 'shidddd', 'ahhhh', 'omgggg', 'aaaa', 'onnnn', 'comeeeee', 'comeeeee', 'mxaaaa', 'llll', 'blaaaaaaa', 'blaaaaaaa', 'blaaaaaaa', 'blaaaaaaa', 'ayhhhhhdjjfjrjjrdjjeks', 'ayhhhhhdjjfjrjjrdjjeks', 'mochichiiiii', 'mochichiiiii', 'sheetingaaaaaand', 'sheetingaaaaaand', 'sheetingaaaaaand', 'maaaaan', 'maaaaan', 'ruddyyyyyy', 'ruddyyyyyy', 'ruddyyyyyy', 'caaaaaall', 'caaaaaall'

In [None]:
Consec_4_wr_letter_words = []
for word in Consec_4_letter_words:
  new_word = ''
  for idx, char in enumerate(word):
    if idx + 1 < len(word):
      
      if word[idx] != word[idx + 1]:
        # print(word[idx], word[idx + 1])
        new_word = new_word + char
    else:
      # if word[idx - 1] == word[idx]:
      new_word = new_word + char
  Consec_4_wr_letter_words.append(new_word)
        
print(len(Consec_4_wr_letter_words))
print(Consec_4_wr_letter_words)

84
['so', 'aw', 'alexship', 'back', 'xdoj', 'gr', 'z', 'lmfao', 'snif', 'snif', 'snif', 'uh', 'uh', 'loveyou', 'kwadead', 'kwadead', 'pusy', 'pusy', 'pusy', 'pusy', 'pusy', 'pusy', 'ah', 'ah', 'ohyayay', 'al', 'al', 'al', 'al', 'al', 'snel', 'i', 'river', 'river', 'river', 'milio', 'romsgr', 'al', 'al', 'now', 'now', 'now', 'womp', 'selmo', 'selmo', 'selmo', 'er', 'damn', 'and', 'shid', 'ah', 'omg', 'a', 'on', 'come', 'come', 'mxa', 'l', 'bla', 'bla', 'bla', 'bla', 'ayhdjfjrjrdjeks', 'ayhdjfjrjrdjeks', 'mochichi', 'mochichi', 'shetingand', 'shetingand', 'shetingand', 'man', 'man', 'rudy', 'rudy', 'rudy', 'cal', 'cal', 'cal', 'wo', 'wo', 'wo', 'wo', 'rios', 'rios', 'nice']


In [None]:
from collections import Counter

new_words = Counter(Consec_4_wr_letter_words)

for j, i in new_words.items():
  print(j, i)

so 1
aw 1
alexship 1
back 1
xdoj 1
gr 1
z 1
lmfao 1
snif 3
uh 2
loveyou 1
kwadead 2
pusy 6
ah 3
ohyayay 1
al 7
snel 1
i 1
river 3
milio 1
romsgr 1
now 3
womp 1
selmo 3
er 1
damn 1
and 1
shid 1
omg 1
a 1
on 1
come 2
mxa 1
l 1
bla 4
ayhdjfjrjrdjeks 2
mochichi 2
shetingand 3
man 2
rudy 3
cal 3
wo 4
rios 2
nice 1
