In [17]:
import pandas as pd 

# Load the training tweets into a DataFrame.
# NOTE(review): 'latin1' is presumably needed because the CSV is not valid UTF-8 — confirm.
data = pd.read_csv(
    "Corona_NLP_train.csv",
    encoding='latin1',
)
# Preview the first rows.
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [18]:
# Drop the metadata columns; only the remaining columns are used from here on.
data = data.drop(columns=['UserName', 'ScreenName', 'Location', 'TweetAt'])
# Preview the reduced frame.
data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
import re
import preprocessor.api as p
from contractions import contractions_dict
from nltk.corpus import wordnet
from pattern.en import suggest
import wordninja
from numpy import nan
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import words

def remove_nonASCII(text):
    """Return ``text`` with every character outside the ASCII range removed.

    Any character whose code point is above 0x7f is deleted (accented letters,
    emoji, typographic quotes, non-Latin scripts, ...). Nothing is inserted in
    its place, and no check is made that the remaining text is English.
    """
    non_ascii = re.compile(r'[^\x00-\x7f]')
    return non_ascii.sub("", text)

def remove_repeated_characters(word):
    """Collapse doubled characters in ``word`` until WordNet recognises it.

    On each pass the word is first looked up with ``wordnet.synsets``; as soon
    as the lookup returns something truthy the current form is returned.
    Otherwise one character of a doubled pair is removed and the loop repeats.
    If a pass changes nothing, the word is returned as it stands.
    """
    doubled_char = re.compile(r"(\w*)(\w)\2(\w*)")
    keep_single = r"\1\2\3"
    current = word
    while not wordnet.synsets(current):
        collapsed = doubled_char.sub(keep_single, current)
        if collapsed == current:
            # No doubled character left to remove.
            break
        current = collapsed
    return current

def expand_contractions(text, mapping=None):
    """Replace contractions in ``text`` with their expanded forms.

    Matching is case-insensitive and only applies to whole tokens: a key is
    not matched when it is directly preceded or followed by a word character.
    When several keys could match at the same position the longest one wins.

    Args:
        text: The string to process.
        mapping: Optional dict of ``contraction -> expansion``. Defaults to
            the imported ``contractions_dict``.

    Returns:
        ``text`` with each recognised contraction replaced by the expansion
        stored in the mapping (the expansion's own casing is kept). Text that
        matches no key is returned unchanged.
    """
    if mapping is None:
        mapping = contractions_dict

    # Index by lower-cased key: the search is case-insensitive, so a key stored
    # with capitals must still be found when the matched text is lower-cased.
    # Without this the callback returned None for such keys and re.sub treated
    # that as an empty replacement, silently deleting the contraction.
    lookup = {key.lower(): expansion for key, expansion in mapping.items() if key}
    if not lookup:
        return text

    # Longest keys first so a short key cannot pre-empt a longer one that
    # starts with it; re.escape keeps regex metacharacters in keys literal.
    alternatives = "|".join(
        re.escape(key) for key in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(
        r"(?<!\w)(?:{})(?!\w)".format(alternatives), flags=re.IGNORECASE
    )

    def replace_text(t):
        txt = t.group(0)
        # Fall back to the matched text so nothing is ever dropped.
        return lookup.get(txt.lower(), txt)

    return pattern.sub(replace_text, text)

def spelling_checker(word):
    """Return the first spelling suggestion that ``suggest`` gives for ``word``.

    The first element of the first entry returned by ``suggest`` is used.
    NOTE(review): presumably entries are (candidate, score) pairs ordered
    best-first — confirm against the ``pattern.en`` documentation.
    """
    suggestions = suggest(word)
    first_entry = suggestions[0]
    return first_entry[0]

def build_stopwords():
    """Return NLTK's English stop-word list with the negation words taken out.

    'not', 'none', 'nor' and 'no' are removed from the list when present, so
    that filtering on the returned list leaves those words in the text.
    """
    stop_words = nltk.corpus.stopwords.words('english')
    for negation in ('not', 'none', 'nor', 'no'):
        if negation not in stop_words:
            continue
        stop_words.remove(negation)
    return stop_words

def split_words(words):
    """Break up over-long tokens that look like several words run together.

    Tokens of at most 12 characters are kept unchanged; longer ones are
    replaced, in place in the sequence, by the pieces ``wordninja.split``
    returns for them. Order is preserved.

    Note: inside this function the parameter ``words`` shadows the
    ``nltk.corpus.words`` import of the same name.
    """
    pieces = []
    for token in words:
        # 12 is decided based on distribution of word length in English
        if len(token) <= 12:
            pieces.append(token)
            continue
        pieces.extend(wordninja.split(token))
    return pieces

# Filled on first use so the NLTK corpora are read once, not once per tweet
# (stop words) or once per token (vocabulary).
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return ``(stop_words, vocabulary)`` as frozensets, loading them once."""
    if not _PREPROCESS_RESOURCES:
        # Build both before storing either, so a failed load leaves the cache empty.
        stop_words = frozenset(build_stopwords())
        vocabulary = frozenset(words.words())
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, vocabulary=vocabulary)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['vocabulary']


def preprocess_unit(text):
    """Clean one tweet and return its tokens.

    Steps, in order: strip non-ASCII characters, remove e-mail-like tokens,
    run ``p.clean``, expand contractions, replace punctuation with spaces,
    remove whitespace-prefixed digit runs, lower-case, tokenise, drop stop
    words, collapse repeated characters in tokens missing from the NLTK
    ``words`` vocabulary, and finally spell-correct every token.

    Contractions are expanded *before* punctuation is stripped: once the
    apostrophes are gone no contraction can match any more.

    Args:
        text: The raw tweet text.

    Returns:
        A list of cleaned tokens, or ``nan`` when ``text`` is not a string
        (e.g. a missing value) or when no token survives stop-word removal.
    """
    if not isinstance(text, str):
        # Missing tweets arrive as NaN/None; treat them like an empty result.
        return nan

    text = remove_nonASCII(text)
    text = re.sub(r'\S+@\S*\s?', ' ', text)  # remove email
    text = p.clean(text)  # remove URL, hashtag, @-mention, emojis
    text = expand_contractions(text)  # expand contraction (needs the apostrophes)
    text = re.sub(r'[^\w\s]', ' ', text)  # remove punctuations
    text = re.sub(r"(\s\d+)", "", text)  # remove digits
    text = text.lower()  # to lower case

    tokens = word_tokenize(text)  # tokenization
    stop_words, vocabulary = _preprocess_resources()
    filtered_words = [t for t in tokens if t not in stop_words]
    if not filtered_words:
        return nan

    # Known words are kept as-is; unknown ones get repeated characters collapsed.
    output = [
        word if word in vocabulary else remove_repeated_characters(word)
        for word in filtered_words
    ]
    # The split_words step is intentionally left disabled, as in the original pipeline.
    return [spelling_checker(s) for s in output]

# Run the cleaning pipeline on every tweet and store the result in a new 'tokens' column.
# Each entry is whatever preprocess_unit returns for that tweet: a list of tokens, or NaN
# when nothing is left after stop-word removal.
data['tokens'] = data['OriginalTweet'].apply(preprocess_unit)
# Preview the first rows to check the new column.
data.head()