# **Vodafone Text Cleaning and Normalisation**

In [1]:
# import libraries
import nltk
import os
import pandas as pd
import pickle
import regex as re
import spellchecker
import time

from gensim.utils import tokenize
from num2words import num2words
from spellchecker import SpellChecker

#### **Section 1: Data ingestion**

In [2]:
vodafone_reviews = pd.read_csv('vodafone_reviews.csv')

# define functions to classify each review into a NPS category
def create_nps_category(row):
    if row.score <= 3:
        category = 'Detractor'
    elif row.score == 4:
        category = 'Passive'
    else:
        category = 'Promoter'
    return category

def create_nps_class(row):
    if row.nps_category == 'Detractor':
        nps_class = -1
    elif row.nps_category == 'Passive':
        nps_class = 0
    else:
        nps_class = 1
    return nps_class


vodafone_reviews['nps_category'] = vodafone_reviews.apply(create_nps_category, axis=1)
vodafone_reviews['nps_class'] = vodafone_reviews.apply(create_nps_class, axis=1)

In [3]:
# combine the title and review columns together
vodafone_reviews['text'] = vodafone_reviews.apply(lambda x: x.title + '. ' + x.review, axis=1)
vodafone_reviews

Unnamed: 0,review_id,title,review,score,timestamp,nps_category,nps_class,text
0,1,Vodaphone Almost Ruined My Life - Stolen Ident...,My email and password were part of a company d...,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...
1,2,Misguided Sale,"Misguided SaleCouple of days ago, I was told t...",1,2021-01-29 08:59:16+00:00,Detractor,-1,Misguided Sale. Misguided SaleCouple of days a...
2,3,Worst network,Worst network for data in Australia. I used Vo...,1,2021-01-28 12:16:46+00:00,Detractor,-1,Worst network. Worst network for data in Austr...
3,4,Insurance contract with them is bulshit,They charged me for over a year a plan with in...,1,2021-01-27 06:21:26+00:00,Detractor,-1,Insurance contract with them is bulshit. They ...
4,5,RECORD all your phonecalls with them when you ...,My parents and I have been with Vodafone for a...,1,2021-01-27 04:18:19+00:00,Detractor,-1,RECORD all your phonecalls with them when you ...
...,...,...,...,...,...,...,...,...
1898,2023,Below average,"Vodafone's extremely cheap, now I know why. Te...",2,2016-11-26 16:51:21+00:00,Detractor,-1,"Below average. Vodafone's extremely cheap, now..."
1899,2024,Poor customer service and tend to charge more ...,I reduced my plan to$22 from $35 thinking it w...,1,2016-11-21 08:40:26+00:00,Detractor,-1,Poor customer service and tend to charge more ...
1900,2025,INSENSITIVE TO THE CUSTOMER SUFFERINGS,Please never ever make mistake of going with V...,1,2016-11-18 23:40:19+00:00,Detractor,-1,INSENSITIVE TO THE CUSTOMER SUFFERINGS. Please...
1901,2026,Extremely Terrible Customer Service at Westfie...,I had to get my address changed (same as on my...,1,2016-11-18 01:10:19+00:00,Detractor,-1,Extremely Terrible Customer Service at Westfie...


In [4]:
# call this new dataframe 'text_df'
text_df = vodafone_reviews.iloc[:, [0,3,4,5,6,7]]
text_df

Unnamed: 0,review_id,score,timestamp,nps_category,nps_class,text
0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...
1,2,1,2021-01-29 08:59:16+00:00,Detractor,-1,Misguided Sale. Misguided SaleCouple of days a...
2,3,1,2021-01-28 12:16:46+00:00,Detractor,-1,Worst network. Worst network for data in Austr...
3,4,1,2021-01-27 06:21:26+00:00,Detractor,-1,Insurance contract with them is bulshit. They ...
4,5,1,2021-01-27 04:18:19+00:00,Detractor,-1,RECORD all your phonecalls with them when you ...
...,...,...,...,...,...,...
1898,2023,2,2016-11-26 16:51:21+00:00,Detractor,-1,"Below average. Vodafone's extremely cheap, now..."
1899,2024,1,2016-11-21 08:40:26+00:00,Detractor,-1,Poor customer service and tend to charge more ...
1900,2025,1,2016-11-18 23:40:19+00:00,Detractor,-1,INSENSITIVE TO THE CUSTOMER SUFFERINGS. Please...
1901,2026,1,2016-11-18 01:10:19+00:00,Detractor,-1,Extremely Terrible Customer Service at Westfie...


In [5]:
# create a copy that splits each review into individual sentences
# the 'sentences' column can be dropped, but is kept just in case
sentence_df = text_df.copy(deep=True)
sentence_df['sentences'] = sentence_df.text.str.split('\.')
temp = sentence_df.sentences.explode()
sentence_df = (sentence_df.merge(temp, how='left', left_index=True, right_index=True).reset_index())
sentence_df = sentence_df[sentence_df.sentences_y != ""]
sentence_df.rename(columns = {'index':'line_num',
                              'sentences_x':'sentences',
                              'sentences_y':'sentence'}, inplace=True)

sentence_df

Unnamed: 0,line_num,review_id,score,timestamp,nps_category,nps_class,text,sentences,sentence
0,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,Vodaphone Almost Ruined My Life - Stolen Ident...
1,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,My email and password were part of a company ...
2,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,"(Vodaphone has 0 security in place, they don'..."
3,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,ber so I had no phone or any means of contacti...
4,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,\n\nWithin 15 minutes my life was upside down
...,...,...,...,...,...,...,...,...,...
16245,1901,2026,1,2016-11-18 01:10:19+00:00,Detractor,-1,Extremely Terrible Customer Service at Westfie...,[Extremely Terrible Customer Service at Westfi...,\nAbsolutely Pathetic Customer Service
16246,1901,2026,1,2016-11-18 01:10:19+00:00,Detractor,-1,Extremely Terrible Customer Service at Westfie...,[Extremely Terrible Customer Service at Westfi...,\n(such a big shame VODAFONE)
16247,1902,2027,5,2016-11-18 01:06:11+00:00,Promoter,1,Perfect For Me. I love this phone company been...,"[Perfect For Me, I love this phone company be...",Perfect For Me
16248,1902,2027,5,2016-11-18 01:06:11+00:00,Promoter,1,Perfect For Me. I love this phone company been...,"[Perfect For Me, I love this phone company be...",I love this phone company been with them for ...


#### **Section 2: Data cleaning**

In [6]:
# create function to clean up the text; nb, at the end of this function would be the spot to check for spelling errors etc
# this the output from this function is most suitable for sentiment analysis using a lexicon-based approach and spacy relies on grammatical structure and punctuation for its PoS tagging etc.
def clean_text(df, col):
    # correct curly apostrophes, long hyphens and encoding errors
    df['clean_text'] = (df[col].str.replace("’", "'", regex=False)
                       .str.replace('[”“]', '"', regex=True)
                       .str.replace("[–]", " - ", regex=True)
                       .str.replace("â€™", "'", regex=False)
                       .str.replace("â€“", " ", regex=False)
                       .str.replace("\r", " ", regex=False)
                       .str.replace("\n", " ", regex=False))

    # create a dictionary of common expansions in the english language
    contractions_dict = {"can't": "can not",
                         "won't": "will not",
                         "don't": "do not",
                         "n't": " not",
                         "'m": " am",
                         "'ll": " will",
                         "'d": " would",
                         "'ve": " have",
                         "'re": " are",
                         "'s": " is", # 's could also be a possessive
                         "s'": "s"}

    # expand the contractions and add to dataframe as new variable
    exp_text = []
    for review in df.clean_text:
        text = []
        for key, value in contractions_dict.items():
            if key in review:
                review = review.replace(key, value)
                text.append(review)
        exp_text.append(review)

    df.clean_text = exp_text

    # replace $ with AUD and % with percent
    df.clean_text = (df.clean_text.str.replace("$", " dollars ", regex=False)
                    .str.replace("%", " percent ", regex=False)
    # clean up the extra white space between words
                    .apply(lambda x: " ".join(x.split()))
    # replace numbers with their word-equivalents
                    .apply(lambda y: re.sub(r'(\d+)', lambda x: num2words(int(x.group(0))), y)))

    return df

In [7]:
# run the function on the sentence dataframe
sentence_df = clean_text(df=sentence_df, col='sentence')
sentence_df

Unnamed: 0,line_num,review_id,score,timestamp,nps_category,nps_class,text,sentences,sentence,clean_text
0,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,Vodaphone Almost Ruined My Life - Stolen Ident...,Vodaphone Almost Ruined My Life - Stolen Ident...
1,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,My email and password were part of a company ...,My email and password were part of a company d...
2,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,"(Vodaphone has 0 security in place, they don'...","(Vodaphone has zero security in place, they do..."
3,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,ber so I had no phone or any means of contacti...,ber so I had no phone or any means of contacti...
4,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,\n\nWithin 15 minutes my life was upside down,Within fifteen minutes my life was upside down
...,...,...,...,...,...,...,...,...,...,...
16245,1901,2026,1,2016-11-18 01:10:19+00:00,Detractor,-1,Extremely Terrible Customer Service at Westfie...,[Extremely Terrible Customer Service at Westfi...,\nAbsolutely Pathetic Customer Service,Absolutely Pathetic Customer Service
16246,1901,2026,1,2016-11-18 01:10:19+00:00,Detractor,-1,Extremely Terrible Customer Service at Westfie...,[Extremely Terrible Customer Service at Westfi...,\n(such a big shame VODAFONE),(such a big shame VODAFONE)
16247,1902,2027,5,2016-11-18 01:06:11+00:00,Promoter,1,Perfect For Me. I love this phone company been...,"[Perfect For Me, I love this phone company be...",Perfect For Me,Perfect For Me
16248,1902,2027,5,2016-11-18 01:06:11+00:00,Promoter,1,Perfect For Me. I love this phone company been...,"[Perfect For Me, I love this phone company be...",I love this phone company been with them for ...,I love this phone company been with them for a...


In [8]:
# run the function on the text dataframe
text_df = clean_text(df=text_df, col='text')
text_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,review_id,score,timestamp,nps_category,nps_class,text,clean_text
0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,Vodaphone Almost Ruined My Life - Stolen Ident...
1,2,1,2021-01-29 08:59:16+00:00,Detractor,-1,Misguided Sale. Misguided SaleCouple of days a...,Misguided Sale. Misguided SaleCouple of days a...
2,3,1,2021-01-28 12:16:46+00:00,Detractor,-1,Worst network. Worst network for data in Austr...,Worst network. Worst network for data in Austr...
3,4,1,2021-01-27 06:21:26+00:00,Detractor,-1,Insurance contract with them is bulshit. They ...,Insurance contract with them is bulshit. They ...
4,5,1,2021-01-27 04:18:19+00:00,Detractor,-1,RECORD all your phonecalls with them when you ...,RECORD all your phonecalls with them when you ...
...,...,...,...,...,...,...,...
1898,2023,2,2016-11-26 16:51:21+00:00,Detractor,-1,"Below average. Vodafone's extremely cheap, now...","Below average. Vodafone is extremely cheap, no..."
1899,2024,1,2016-11-21 08:40:26+00:00,Detractor,-1,Poor customer service and tend to charge more ...,Poor customer service and tend to charge more ...
1900,2025,1,2016-11-18 23:40:19+00:00,Detractor,-1,INSENSITIVE TO THE CUSTOMER SUFFERINGS. Please...,INSENSITIVE TO THE CUSTOMER SUFFERINGS. Please...
1901,2026,1,2016-11-18 01:10:19+00:00,Detractor,-1,Extremely Terrible Customer Service at Westfie...,Extremely Terrible Customer Service at Westfie...


In [9]:
# create a spell-checker function
def run_spell_checker(df, col):
    # convert text to lower-case, tokenise into words and remove accents
    words = sentence_df[col].apply(lambda x: list(tokenize(x, lowercase=True, deacc=True)))
    words_lol = list(words)
    words_list = [word for list in words_lol for word in list]
    
    # convert the vocabulary into a dataframe
    vocabulary = pd.DataFrame(words_list, columns=["words"])
    
    # initalise the spellchecker
    spell = SpellChecker()
    
    # obtain a list of the unknown words plus how many times each occurs in the corpus
    unknown_words = spell.unknown(vocabulary.words.to_list())
    unknown_word_counts = vocabulary[vocabulary.words.isin(unknown_words)].value_counts()
    
    return unknown_words, unknown_word_counts

In [10]:
# run the function and check the top-30 words
# consider removing any single letter words at the modelling stage
_, unknown_word_counts = run_spell_checker(sentence_df, 'clean_text')
unknown_word_counts[:30]

words    
vodafone     2881
telstra       228
vodaphone     201
optus         147
fourg         114
nbn            90
threeg         67
voda           63
vodafail       59
sms            56
samsung        54
gb             48
cbd            46
threerd        35
s              31
postpaid       29
l              26
htc            22
n              22
twond          21
t              21
tpg            20
onest          20
fivegb         20
nz             20
hrs            20
m              19
nsw            18
nn             17
kogan          15
dtype: int64

In [13]:
# replace selected mistakes in the respective dataframes
def correct_spellings(df, col):
    df[col] = (df[col].str.replace("threerd", "third", regex=False)
                      .str.replace("twond", "second", regex=False)
                      .str.replace("onest", "first", regex=False))
    
    return df

# run the function
text_df = correct_spellings(sentence_df, 'clean_text')
sentence_df = correct_spellings(text_df, 'clean_text')

Unnamed: 0,line_num,review_id,score,timestamp,nps_category,nps_class,text,sentences,sentence,clean_text
0,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,Vodaphone Almost Ruined My Life - Stolen Ident...,Vodaphone Almost Ruined My Life - Stolen Ident...
1,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,My email and password were part of a company ...,My email and password were part of a company d...
2,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,"(Vodaphone has 0 security in place, they don'...","(Vodaphone has zero security in place, they do..."
3,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,ber so I had no phone or any means of contacti...,ber so I had no phone or any means of contacti...
4,0,1,1,2021-01-31 01:03:34+00:00,Detractor,-1,Vodaphone Almost Ruined My Life - Stolen Ident...,[Vodaphone Almost Ruined My Life - Stolen Iden...,\n\nWithin 15 minutes my life was upside down,Within fifteen minutes my life was upside down
...,...,...,...,...,...,...,...,...,...,...
16245,1901,2026,1,2016-11-18 01:10:19+00:00,Detractor,-1,Extremely Terrible Customer Service at Westfie...,[Extremely Terrible Customer Service at Westfi...,\nAbsolutely Pathetic Customer Service,Absolutely Pathetic Customer Service
16246,1901,2026,1,2016-11-18 01:10:19+00:00,Detractor,-1,Extremely Terrible Customer Service at Westfie...,[Extremely Terrible Customer Service at Westfi...,\n(such a big shame VODAFONE),(such a big shame VODAFONE)
16247,1902,2027,5,2016-11-18 01:06:11+00:00,Promoter,1,Perfect For Me. I love this phone company been...,"[Perfect For Me, I love this phone company be...",Perfect For Me,Perfect For Me
16248,1902,2027,5,2016-11-18 01:06:11+00:00,Promoter,1,Perfect For Me. I love this phone company been...,"[Perfect For Me, I love this phone company be...",I love this phone company been with them for ...,I love this phone company been with them for a...


#### **Section 3: Text Normalisation**

In [14]:
# install spacy
# conda install -c conda-forge spacy
# python -m spacy download en_core_web_trf # (or whatever language model you want)

# the usual way of importing spacy after installation is to use spacy's built-in loader; the alternative is what is used in this notebook, which imports the language model in explicitly as a Python package
# import spacy
# nlp = spacy.load("en_core_web_trf")

# load the desired English language model
# in this notebook, the transformer pipeline model is used. This is a larger and slower pipeline compared to the "sm", "md" or "lg" models, which are optimised for use on a CPU, but is more accurate.
# see https://spacy.io/models/en for more information
import en_core_web_trf
nlp = en_core_web_trf.load()

# import en_core_web_sm
# nlp = en_core_web_sm.load()

In [15]:
# use spacy's stopword list as default
# note that this is quite an aggressive stopword list, see the stopword analysis inside "https://github.com/NikkiSarah/Misc" for more detail
spacy_stopwords = nlp.Defaults.stop_words
stopwords_series = pd.Series(list(spacy_stopwords))

In [16]:
# create a set of normalised features as a starting point for input into a supervised or unsupervised ML algorithm
def create_features(df, col, stopword_list=spacy_stopwords):
    # create tokenised words and noun phrases
    lemma_lists = []
    phrase_lists = []
    for text in df[col]:
        doc = nlp(text)
        lemmatised_words = [token.lemma_.lower().strip() for token in doc if not token.is_space and not token.is_punct and not token.is_digit]
        noun_chunks = [chunk.text.lower().strip() for chunk in doc.noun_chunks]

        lemma_lists.append(lemmatised_words)
        phrase_lists.append(noun_chunks)

    df['lemmatised_words'] = lemma_lists
    df['lemmatised_words_no_sw'] = df.lemmatised_words.apply(lambda x: [tok for tok in x if tok not in stopword_list])
    df['noun_phrases'] = phrase_lists

    # extract any named entities
    ent_text_lists = []
    ent_label_lists = []
    for text in nlp.pipe(df[col]):
        entities = [ent.text for ent in text.ents]
        labels = [ent.label_ for ent in text.ents]

        ent_text_lists.append(entities)
        ent_label_lists.append(labels)

    df['NEs'] = ent_text_lists
    df['NE_labels'] = ent_label_lists

    # clean up the noun phrases
    df['noun_strings'] = (df.noun_phrases.apply(lambda x: '.'.join(x) if x != '' else x)
                          .str.replace(" ", "_", regex=False)
                          .str.replace(".", " ", regex=False)
                          .str.lower())

    lemma_lists = []
    for text in df.noun_strings:
        doc = nlp(text)
        lemmatised_words = [token.lemma_.lower().strip() for token in doc if not token.is_space and not token.is_punct and not token.is_stop and not token.is_digit]

        lemma_lists.append(lemmatised_words)

    df['lemmatised_phrases_no_sw'] = lemma_lists

    # create bi- and trigrams from 'lemmatised words' and remove any tuples containing at least one stopword
    df['lemmatised_bigrams'] = df.lemmatised_words.apply(lambda x: list(nltk.ngrams(x, 2)))
    df['lemmatised_trigrams'] = df.lemmatised_words.apply(lambda x: list(nltk.ngrams(x, 3)))

    df['lemmatised_bigrams_no_sw'] = df.lemmatised_bigrams.apply(lambda x: [gram for gram in x if not any(sw in gram for sw in stopword_list)])
    df['lemmatised_trigrams_no_sw'] = df.lemmatised_trigrams.apply(lambda x: [gram for gram in x if not any(sw in gram for sw in stopword_list)])

    df['lemmatised_bigrams_no_sw_alt'] = df.lemmatised_words_no_sw.apply(lambda x: list(nltk.ngrams(x, 2)))
    df['lemmatised_trigrams_no_sw_alt'] = df.lemmatised_words_no_sw.apply(lambda x: list(nltk.ngrams(x, 3)))

    return df

In [17]:
# run the function over both the sentence and text dataframes
# CAUTION: This can take a little while, especially if using the trf model. If accuracy is not critical, then substitute spacy's "sm" or "md" language model
start_time = time.time()
sentence_feature_df = create_features(df=sentence_df, col="clean_text")
run_time = time.time() - start_time

print("This took " + str(round(run_time/60, 1)) + " minutes.")

This took 2.4 minutes.


In [19]:
start_time = time.time()
text_feature_df = create_features(df=text_df, col="clean_text")
run_time = time.time() - start_time

print("This took " + str(round(run_time/60, 1)) + " minutes.")

In [20]:
# save both dataframes as pickle files
# note that if these are large files, they can be compressed; see 'https://www.datacamp.com/community/tutorials/pickle-python-tutorial#pickling' for more information
with open('NF_sentence_feature_df', 'wb') as outfile:
    pickle.dump(sentence_feature_df, outfile)
outfile.close()

with open('NF_text_feature_df', 'wb') as outfile:
    pickle.dump(text_feature_df, outfile)
outfile.close()