In [1]:
# !pip install -U spaCy
# !python -m spacy download en
# !python -m spacy download en_core_web_sm
# !pip install contractions
# !pip install inflect

In [2]:
import pandas as pd
import spacy
import string
import re
import unicodedata
import inflect
import contractions

from nltk.corpus import stopwords as sw_nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction import stop_words as sw_sklearn
from spacy.lang.en.stop_words import STOP_WORDS as sw_spacy
from gensim.parsing.preprocessing import remove_stopwords as sw_gensim

In [3]:
# Alternative (shortcut model name): nlp = spacy.load('en')
# spaCy pipeline; its vocabulary is consulted for stop-word flags further down.
nlp = spacy.load('en_core_web_sm')

# Word tokenizer used to split the cleaned string into tokens.
token = ToktokTokenizer()

In [4]:
# Directory that the input subset is read from and the cleaned output is written to.
DATA_DIR = '../../data/raw/'
# Parquet file names, concatenated onto DATA_DIR when reading / writing.
INPUT_FILE_NAME = "subset_raw.parquet"
OUTPUT_FILE_NAME = "cleaned.parquet"

In [5]:
# Load the raw subset into a DataFrame.
df = pd.read_parquet(f"{DATA_DIR}{INPUT_FILE_NAME}")

## Here is what we need to clean
1. Lower case [DONE]
2. Abbreviations (unnecessary?)
3. Numbers [DONE]
4. Contractions (unnecessary?) [DONE]
5. All punctuation [DONE]
6. Weird symbols (accounted for w/ punctuations)
7. Stop words (gensim) [DONE]

TOKENISATION


8. Stop words (nltk, sklearn, spacy) [DONE]
9. Stemming (unnecessary due to Lemmatization)
10. Lemmatization

In [6]:
def remove_whitespaces(input_string):
    """Return ``input_string`` without its leading and trailing whitespace."""
    return input_string.strip()

In [7]:
def convert_lower(input_string):
    """Return a lower-cased copy of ``input_string``."""
    lowered = input_string.lower()
    return lowered

In [8]:
def replace_contractions(input_string):
    """Return ``input_string`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(input_string)
    return expanded
#only replaces common contractions
#is it truly necessary considering removal of stopwords?

In [9]:
def remove_non_ascii_token(tokens):
    """Strip non-ASCII characters from every token.

    Each token is NFKD-normalised and then encoded to ASCII with
    ``errors='ignore'``, which drops every code point that has no ASCII
    representation.

    Args:
        tokens: iterable of ``str`` tokens.

    Returns:
        list of ``str``, one entry per input token, in the same order.
        A token made up solely of non-ASCII characters becomes ``''``.
    """
    result = []
    for word in tokens:
        ascii_bytes = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
        # encode() yields bytes; decode so the output stays a list of str
        # tokens rather than bytes objects.
        result.append(ascii_bytes.decode('ascii'))
    return result

In [10]:
def remove_numbers(input_string):
    """Return ``input_string`` with every run of digits deleted."""
    return re.sub(r'\d+', '', input_string)

def replace_numbers_token(tokens): #unnecessary for tagging, better to just remove?
    """Replace all-digit tokens with the output of ``inflect``'s ``number_to_words``.

    Args:
        tokens: iterable of ``str`` tokens.

    Returns:
        list with one entry per input token, in the same order. Tokens for
        which ``str.isdigit()`` is true are replaced; all others are kept
        unchanged.
    """
    p = inflect.engine()
    # The original appended to an undefined name (``results``) and raised
    # NameError; build the output list directly instead.
    return [p.number_to_words(word) if word.isdigit() else word for word in tokens]

In [11]:
punctuations = string.punctuation
def remove_punctuations(input_string):
    """Return ``input_string`` with every character in ``punctuations`` deleted."""
    # A translation table mapping each punctuation character to None removes
    # them all in a single pass.
    drop_table = str.maketrans('', '', punctuations)
    return input_string.translate(drop_table)

def remove_punctuations_token(tokens):
    """Delete punctuation characters from every token.

    A character is removed when it is neither a word character (``\\w``) nor
    whitespace (``\\s``); underscores therefore survive.

    Args:
        tokens: iterable of ``str`` tokens.

    Returns:
        list of ``str``, one entry per input token, in the same order.
        A token consisting only of punctuation becomes ``''``.
    """
    # The original pattern had a stray ')' after the character class, which
    # made re.sub raise re.error ("unbalanced parenthesis") on every call.
    return [re.sub(r'[^\w\s]', '', word) for word in tokens]

In [12]:
def remove_sw_string(input_string):
    """Return ``input_string`` after gensim's ``remove_stopwords`` has been applied."""
    return sw_gensim(input_string)

def remove_sw_token(tokens):
    """Drop tokens that NLTK, scikit-learn or spaCy consider stop words.

    A token is removed when it appears in NLTK's English stop-word list or in
    scikit-learn's ``ENGLISH_STOP_WORDS``; the remaining tokens are removed
    when their entry in ``nlp.vocab`` has ``is_stop`` set.

    Args:
        tokens: iterable of ``str`` tokens.

    Returns:
        list of the surviving tokens, in their original order.
    """
    # Combine NLTK and sklearn stop words into a set: duplicates collapse
    # automatically and membership tests are O(1) instead of a list scan
    # per token.
    stop_words = set(sw_nltk.words('english')) | set(sw_sklearn.ENGLISH_STOP_WORDS)

    # The spaCy lexeme is only looked up for tokens that passed the first
    # filter, matching the original two-stage order.
    return [
        word for word in tokens
        if word not in stop_words and not nlp.vocab[word].is_stop
    ]
    

In [13]:
#do after removal of stopwords
def lemmatize_token(tokens):
    """Lemmatize every token with WordNet, chaining three part-of-speech passes.

    Each token is lemmatized with ``pos='n'``, the result with ``pos='v'``,
    and that result with ``pos='a'``.

    Returns:
        list with one lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in tokens:
        # Feed the output of each pass into the next one.
        for pos_tag in ('n', 'v', 'a'):
            word = lemmatizer.lemmatize(word, pos=pos_tag)
        lemmas.append(word)
    return lemmas
#spacy can also lemmatize on tokens in doc

In [14]:
def data_specific_cleaning_token(tokens):
    """Return ``tokens`` without the entries 'applause' and 'laughter'.

    Matching is exact and case-sensitive; order of the remaining tokens is kept.
    """
    return [w for w in tokens if w != 'applause' and w != 'laughter']

In [15]:
def combined_cleaning(input_string):
    """Run the full cleaning pipeline on one transcript and return its tokens.

    String stage (in order): strip whitespace, lower-case, expand
    contractions, remove digits, remove punctuation, remove gensim stop words.
    The result is tokenized with ``token.tokenize`` and the token stage then
    removes NLTK/sklearn/spaCy stop words, lemmatizes, and drops the
    data-specific tokens.

    Returns:
        list of cleaned tokens.
    """
    string_steps = (
        remove_whitespaces,
        convert_lower,
        replace_contractions,
        remove_numbers,
        remove_punctuations,
        remove_sw_string,
    )
    token_steps = (
        remove_sw_token,
        lemmatize_token,
        data_specific_cleaning_token,
    )

    cleaned = input_string
    for step in string_steps:
        cleaned = step(cleaned)

    words = token.tokenize(cleaned)
    for step in token_steps:
        words = step(words)

    return words

In [16]:
# drop rows with empty transcripts
# Discard rows whose 'transcript' is missing, then renumber the index from 0.
df = (
    df
    .dropna(subset=['transcript'])
    .reset_index(drop=True)
)

## Use spaCy for sentence splitting? Its `lemma_` attribute may also handle lower-casing and contractions in the same pass

In [17]:
#sample_transcript = "Oh no he didn't. I can't and I won't. I'll know what I'm gonna do."
# Take the transcript stored under index label 0 and run it through spaCy.
sample_transcript = df.loc[0, 'transcript']
parsed_transcript = nlp(sample_transcript)

# Preview the first 100 tokens of the parsed document.
parsed_transcript[:100]

0:14


Thank you so much, Chris.
And it's truly a great honor
to have the opportunity
to come to this stage twice;
I'm extremely grateful.
I have been blown away by this conference,
and I want to thank all of you
for the many nice comments
about what I had to say the other night.
And I say that sincerely,
partly because (Mock sob)
I need that.




 0:40


(Laughter)




 0:45



In [18]:
# for num, sentence in enumerate(parsed_transcript.sents):
#     print(f"Sentence: {num + 1}")
#     print(sentence)
#     print("")

# for num, entity in enumerate(parsed_transcript.ents):
#     print(f"Entity {num + 1}:", entity, "-", entity.label_)
#     print("")

# One tuple of spaCy attributes per token of the parsed transcript. The loop
# variable is named `tok` so it does not read like the module-level `token`
# tokenizer.
token_attributes = [
    (
        tok.text, tok.lower_, tok.lemma_, tok.pos_, tok.ent_type_,
        tok.sentiment, tok.is_stop, tok.is_punct, tok.is_space,
        tok.is_digit, tok.like_num, tok.is_bracket, tok.is_ascii,
    )
    for tok in parsed_transcript
]

sample_attributes = pd.DataFrame(
    token_attributes,
    columns=[
        'text', 'lowercase', 'lemma', 'part_of_speech', 'entity_type',
        'sentiment', 'stopword?', 'punctuation?', 'whitespace?',
        'digit?', 'number?', 'bracket?', 'ascii?',
    ],
)

# To see 'True' values more clearly: truthy flags become 'Yes', falsy become ''.
sample_attributes.loc[:, 'stopword?':'ascii?'] = (
    sample_attributes
    .loc[:, 'stopword?':'ascii?']
    .applymap(lambda flag: 'Yes' if flag else '')
)

sample_attributes.head(30)

Unnamed: 0,text,lowercase,lemma,part_of_speech,entity_type,sentiment,stopword?,punctuation?,whitespace?,digit?,number?,bracket?,ascii?
0,0:14,0:14,0:14,PUNCT,,0.0,,,,,,,Yes
1,\r\r\r,\r\r\r,\r\r\r,SPACE,,0.0,,,Yes,,,,Yes
2,Thank,thank,thank,VERB,,0.0,,,,,,,Yes
3,you,you,-PRON-,PRON,,0.0,Yes,,,,,,Yes
4,so,so,so,ADV,,0.0,Yes,,,,,,Yes
5,much,much,much,ADV,,0.0,Yes,,,,,,Yes
6,",",",",",",PUNCT,,0.0,,Yes,,,,,Yes
7,Chris,chris,Chris,PROPN,PERSON,0.0,,,,,,,Yes
8,.,.,.,PUNCT,,0.0,,Yes,,,,,Yes
9,\r,\r,\r,SPACE,,0.0,,,Yes,,,,Yes


In [None]:
# Replace every transcript with the token list produced by combined_cleaning.
df['transcript'] = df['transcript'].map(combined_cleaning)

In [None]:
# Write the cleaned frame as parquet into DATA_DIR.
df.to_parquet(f"{DATA_DIR}{OUTPUT_FILE_NAME}")