In [1]:
# !pip install -U spaCy
# !python -m spacy download en
# !python -m spacy download en_core_web_sm
# !pip install contractions
# !pip install inflect

In [2]:
import pandas as pd
import spacy
import string
import re
import unicodedata
import inflect
import contractions

from nltk.corpus import stopwords as sw_nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction import stop_words as sw_sklearn
from spacy.lang.en.stop_words import STOP_WORDS as sw_spacy
from gensim.parsing.preprocessing import remove_stopwords as sw_gensim

In [3]:
# Load spaCy's small English pipeline by its full package name. The bare 'en'
# shortcut link only exists on spaCy v2 and is rejected by spaCy v3, whereas
# 'en_core_web_sm' resolves on both once the model package is installed.
nlp = spacy.load('en_core_web_sm')

# Tokenizer instance used by combined_cleaning to split cleaned text into tokens.
token = ToktokTokenizer()

In [4]:
# Directory and file names of the parquet files this notebook reads and writes.
DATA_DIR = '../../data/raw/'
INPUT_FILE_NAME = "subset_raw.parquet"
OUTPUT_FILE_NAME = "cleaned.parquet"

In [5]:
# Read the input parquet file into a DataFrame; later cells use its 'transcript' column.
df = pd.read_parquet(DATA_DIR + INPUT_FILE_NAME)

## Here is what we need to clean
1. Lower case [DONE]
2. Abbreviations (unnecessary?)
3. Numbers [DONE]
4. Contractions (unnecessary?) [DONE]
5. All punctuation [DONE]
6. Weird symbols (accounted for w/ punctuations)
7. Stop words (gensim) [DONE]

TOKENISATION


8. Stop words (nltk, sklearn, spacy) [DONE]
9. Stemming (unnecessary due to lemmatization)
10. Lemmatization

In [6]:
def remove_whitespaces(input_string):
    """Return ``input_string`` with leading and trailing whitespace stripped."""
    return input_string.strip()

In [7]:
def convert_lower(input_string):
    """Return a lower-cased copy of ``input_string``."""
    lowered = input_string.lower()
    return lowered

In [8]:
def replace_contractions(input_string):
    """Expand contractions in ``input_string`` via ``contractions.fix``.

    Notes carried over from the original author:
      * only common contractions are replaced;
      * open question: is this step truly necessary, given that stop words
        are removed later on?
    """
    expanded = contractions.fix(input_string)
    return expanded

In [9]:
def remove_non_ascii_token(tokens):
    """Strip non-ASCII characters from every token.

    Each token is NFKD-normalised, so an accented letter decomposes into a
    base letter plus combining marks, and every character that cannot be
    represented in ASCII is then dropped.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list of str with one entry per input token, in the same order. An
        entry is the empty string when the token had no ASCII content.
    """
    result = []
    for word in tokens:
        ascii_bytes = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
        # encode() returns bytes; decode so the output tokens are str again
        # rather than bytes objects.
        result.append(ascii_bytes.decode('ascii'))
    return result

In [10]:
def remove_numbers(input_string):
    """Return ``input_string`` with every run of digits deleted."""
    return re.sub(r'\d+', '', input_string)

def replace_numbers_token(tokens):
    """Spell out all-digit tokens as words; leave every other token unchanged.

    Author's open question: this may be unnecessary for tagging, and simply
    removing the numbers could be better.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list with one entry per input token, in the same order. Tokens for
        which ``str.isdigit()`` is true are replaced by the result of
        ``inflect``'s ``number_to_words``.
    """
    engine = None  # built on first use, so digit-free input never needs it
    result = []
    for word in tokens:
        if word.isdigit():
            if engine is None:
                engine = inflect.engine()
            result.append(engine.number_to_words(word))
        else:
            result.append(word)
    return result

In [11]:
# Characters that remove_punctuations deletes from its input.
punctuations = string.punctuation


def remove_punctuations(input_string):
    """Return ``input_string`` with every character in ``punctuations`` removed."""
    deletion_table = str.maketrans('', '', punctuations)
    return input_string.translate(deletion_table)

def remove_punctuations_token(tokens):
    """Remove punctuation characters from every token.

    Any character that is neither a word character (``\\w``) nor whitespace
    (``\\s``) is deleted from each token.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list of str with one entry per input token, in the same order. A
        token made up only of punctuation becomes the empty string.
    """
    # The previous pattern had a stray ')' after the character class, which
    # made re.sub raise re.error on every call.
    return [re.sub(r'[^\w\s]', '', word) for word in tokens]

In [12]:
def remove_sw_string(input_string):
    """Return ``input_string`` after passing it through gensim's ``remove_stopwords``."""
    return sw_gensim(input_string)

def remove_sw_token(tokens):
    """Drop stop words from a token list using NLTK, scikit-learn and spaCy.

    A token is kept only when it is absent from the NLTK English stop-word
    list, absent from scikit-learn's ``ENGLISH_STOP_WORDS``, and not flagged
    ``is_stop`` by the spaCy vocabulary of the module-level ``nlp`` pipeline.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list of the surviving tokens, in their original order.
    """
    # Merge both sources into one set: duplicates collapse automatically and
    # each membership test is a hash lookup instead of a scan over a list.
    stop_words = set(sw_nltk.words('english')) | set(sw_sklearn.ENGLISH_STOP_WORDS)

    # The spaCy check only runs for tokens that survive the NLTK/sklearn
    # filter, matching the original two-pass order.
    return [
        word
        for word in tokens
        if word not in stop_words and not nlp.vocab[word].is_stop
    ]
    

In [13]:
#do after removal of stopwords
def lemmatize_token(tokens):
    """Lemmatize each token with WordNet as a noun, then a verb, then an adjective.

    The result of each pass is the input of the next one. (spaCy can also
    lemmatize the tokens of a doc; that alternative is not used here.)
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in tokens:
        for pos_tag in ('n', 'v', 'a'):
            word = lemmatizer.lemmatize(word, pos=pos_tag)
        lemmas.append(word)
    return lemmas

In [14]:
def data_specific_cleaning_token(tokens):
    """Return ``tokens`` without the exact tokens 'applause' and 'laughter'."""
    unwanted = ('applause', 'laughter')
    return [word for word in tokens if word not in unwanted]

In [15]:
def combined_cleaning(input_string):
    """Run the full cleaning pipeline on one string and return its tokens.

    The string-level steps run first, the text is then split with the
    module-level ``token`` tokenizer, and the token-level steps run last.
    """
    string_steps = (
        remove_whitespaces,
        convert_lower,
        replace_contractions,
        remove_numbers,
        remove_punctuations,
        remove_sw_string,
    )
    token_steps = (
        remove_sw_token,
        lemmatize_token,
        data_specific_cleaning_token,
    )

    cleaned = input_string
    for step in string_steps:
        cleaned = step(cleaned)

    words = token.tokenize(cleaned)
    for step in token_steps:
        words = step(words)
    return words

In [16]:
# Discard rows that have no transcript and renumber the remaining rows.
df = df.dropna(subset=['transcript']).reset_index(drop=True)

# Run the cleaning pipeline over every transcript.
df['clean_transcript'] = df['transcript'].map(combined_cleaning)

df.head()

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript
0,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0,"[thank, chris, truly, great, honor, opportunit..."
1,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0,"[term, invention, like, tell, tale, favorite, ..."
2,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0,"[public, dewey, long, ago, observe, constitute..."
3,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0,"[want, start, say, houston, problem, enter, se..."
4,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0,"[want, talk, background, idea, car, art, actua..."


In [23]:
# Space-join each cleaned token list into a single string per row.
df['clean_transcript_string'] = df['clean_transcript'].map(
    lambda token_list: ' '.join(map(str, token_list))
)
df.head()

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string
0,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0,"[thank, chris, truly, great, honor, opportunit...",thank chris truly great honor opportunity come...
1,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0,"[term, invention, like, tell, tale, favorite, ...",term invention like tell tale favorite project...
2,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0,"[public, dewey, long, ago, observe, constitute...",public dewey long ago observe constitute discu...
3,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0,"[want, start, say, houston, problem, enter, se...",want start say houston problem enter second ge...
4,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0,"[want, talk, background, idea, car, art, actua...",want talk background idea car art actually mea...


In [24]:
# Persist the DataFrame, including the cleaned columns added above, as parquet.
# NOTE(review): DATA_DIR is the raw-data directory, so the cleaned output is
# written next to the raw input — confirm this is intended.
df.to_parquet(DATA_DIR + OUTPUT_FILE_NAME)