In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Data processing

## Load

In [9]:
# Read the raw provider spreadsheet (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_excel("raw-data.xlsx")
df.head()

Unnamed: 0,provider,text_url,text,names_url,names
0,Artificial Grass GB,https://www.artificialgrassgb.co.uk/,\nWhy choose artificial grass over real grass?...,https://www.artificialgrassgb.co.uk/,"Velvet, English Garden, Cadiz, Gold, Forest, L..."
1,Grass Direct,,,https://www.grass-direct.co.uk/artificial-grass,"Oasis, Sydney, Bordeaux, Antigua, Geneva, Melb..."
2,Express grass,,,https://expressgrass.com/artificial-grass.html,"Woodstock, Classic, Hartfield, Chartwell, Oakh..."
3,Easigrass,https://www.easigrass.com/areas-we-cover/londo...,"No Mud, No Mess, No Mowing.\n\nArtificial Gras...",https://www.easigrass.com/easi-grass-products/...,"Mayfair, Belgravia, Chelsea, Kensington, Holla..."
4,Nustone,https://nustone.co.uk/product-category/artific...,Artificial Grass will transform your garden an...,https://nustone.co.uk/product-category/artific...,"Tahoe, Cleveland, Santa Fe, Nebraska, Ozark"


## Text
We need to clean the text data so that we can do some proper text mining. This includes:

- Standardising
- Tokenising and removing stop words
- Lemmatising


### Standardising

We will remove punctuation and clean any other symbols/words as needed.


In [72]:
# make text lower case so that the replacements and the (lower-case) stop-word
# list used further down match regardless of capitalisation;
# rows without text stay NaN
text = df["text"].str.lower()

# replace values within titles

# helper that applies a series of regex substitutions to a string column
def replace_values(text, dic):
    """Apply every pattern -> replacement pair of ``dic`` to ``text``.

    Parameters
    ----------
    text : pandas.Series
        Column of strings to clean (accessed through ``.str``).
    dic : dict
        Maps a regular-expression pattern to its replacement string. Pairs
        are applied in the dict's iteration order, each one on the result
        of the previous substitution.

    Returns
    -------
    pandas.Series
        The series after all substitutions; the input series is not modified.
    """
    cleaned = text
    for pattern in dic:
        replacement = dic[pattern]
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)
    return cleaned

# Regex pattern -> replacement pairs, including punctuation.
# Order matters: the pairs are applied top to bottom, each on the result of
# the previous one (e.g. punctuation is blanked before whitespace is collapsed,
# and whitespace is collapsed before "low maintenance" is merged).
replace_dict = {"&amp;": " ", # HTML/XML entity for &
                "\n": " ",
                # raw string so the escaped brackets reach the regex engine
                # unchanged (a plain "\]" is an invalid string escape)
                r'[!"#$%&()*+,./:;<=>?@\[\]^_`{|}~“”-]': " ",
                # collapse ANY run of whitespace to one space; replacing only
                # "  " left double spaces behind wherever three or more
                # spaces occurred in a row
                r"\s+": " ",
                "’s": "",
                "low maintenance": "lowmaintenance", # keep the phrase as one token
                "’m": "",
                "y'all": "you all",
                "i'm": "i am",
                "i've": "i have",
                "it'll": "it will",
                "we're": "we are",
                "i'd": "i would",
               }
                
# run all substitutions, then trim leading/trailing white space in one chain
text = replace_values(text, replace_dict).str.strip()

# keep the standardised text next to the raw columns
df["text_standardised"] = text

print(text)

0     why choose artificial grass over real grass un...
1                                                   NaN
2                                                   NaN
3     no mud no mess no mowing  artificial grass in ...
4     artificial grass will transform your garden an...
5     professional artificial turf installation in l...
6     no more muddy feet get the perfect artificial ...
7                                                   NaN
8     artificial grass is great for people in london...
9                                                   NaN
10    looks great it looks luscious and full of life...
11    lush green grass all year round fake grass loo...
12    if you love grass but don’t want the mess mud ...
13    make the most of your garden with trulawn lowm...
14    all year round play area no more mud  artifici...
15    pet and child friendly artificial grass ur art...
16    say hello to our green grass the artificial gr...
17    struggling to maintain your lawn looking p

### Remove stop words

We now remove stop words that don't really help to identify the sentiment or topic of a sentence.

In [12]:
import nltk
# fetch NLTK's stop-word corpus into the local nltk_data folder
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lisa.hornung\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
from nltk.corpus import stopwords

# English stop-word list from NLTK; printed so the reader can see exactly
# which words will be filtered out of the texts
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [73]:
# Remove stop words from every standardised text.

# a set gives constant-time membership checks instead of scanning the list
# of stop words for every single word
stop_word_set = set(stop_words)

# one cleaned string per row of `text`
text_no_stop_words = []

# str.split() without a separator splits on runs of white space, so doubled
# spaces in the standardised text no longer produce empty "words" that would
# be kept and re-joined as double spaces
for words in text.str.split():
    if isinstance(words, list):
        kept = [word for word in words if word not in stop_word_set]
        text_no_stop_words.append(" ".join(kept))
    else:
        # missing text (NaN) is not split into a list; such rows keep the
        # single-space placeholder so the column stays a column of strings
        text_no_stop_words.append(" ")

# add the cleaned text to the dataframe
df["text_no_stop_words"] = text_no_stop_words

print(text_no_stop_words[:3])

["choose artificial grass real grass unlike real grass artificial grass give perfect looking lawn year round usable lawn year round there's mowing mud bald patches weeds waterlogged soggy lawns giving even time enjoy garden artificial grass installed artificial grass installed literally anywhere standard garden installations including decking flagstones block paving concrete tarmac areas balconies terraces schools nurseries artificial grass great commercial use creating big statement business said using artificial grass events whether public private safe children yes perfect choice create safe play area children year round supply softness grass available high level bounce back help keep children safe matter play activities suitable pets yes grass pet friendly fact many kennels use artificial grass colour grass affected pet mess mess easily removed residue easily rinsed away tough stains soap detergent used followed hosing artificial grass require maintenance lowmaintenance product clea

### Lemmatise

Now we want to reduce the inflectional forms of each word into a common base or root. We do this by using a lemmatiser that detects the lemma for each word.

Read [this article](https://blog.bitext.com/what-is-the-difference-between-stemming-and-lemmatization/) for more information on the difference between stemming and lemmatisation.

You will need to install spaCy (`pip install spacy`) and download the `en_core_web_sm` model (`python -m spacy download en_core_web_sm`). Explanation [here](https://spacy.io/usage).


In [74]:
import en_core_web_sm

# spaCy's small English pipeline provides the lemma of each token
nlp = en_core_web_sm.load()

# for every stop-word-free text: run it through the pipeline, take the lemma
# of each token and join the lemmas back into one space-separated string
text_lemmatised = [
    " ".join(token.lemma_ for token in nlp(cleaned_text))
    for cleaned_text in text_no_stop_words
]

# store the lemmatised text next to the other text columns
df["text_lemmatised"] = text_lemmatised

# Analysis

- [How To Perform Sentiment Analysis in Python 3 Using the Natural Language Toolkit (NLTK)](https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk)
- [Sentiment Analysis on Reddit News Headlines with Python’s Natural Language Toolkit (NLTK)](https://www.learndatasci.com/tutorials/sentiment-analysis-reddit-headlines-pythons-nltk/)

## Word frequency

In [75]:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
# tokenizer built from the pattern \w+ (runs of word characters)
tokenizer = RegexpTokenizer(r'\w+')

# English stop words read by process_text below; this re-creates the same
# list that was already loaded in the stop-word section of the notebook
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def process_text(text, word_tokenizer=None, stop_word_list=None):
    """Tokenise each line, lower-case the tokens and drop stop words.

    Parameters
    ----------
    text : iterable of str
        Lines to process (e.g. a dataframe column). Each item is handed to
        the tokenizer's ``tokenize`` method.
    word_tokenizer : object with a ``tokenize(str)`` method, optional
        Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
    stop_word_list : iterable of str, optional
        Lower-case words to discard. Defaults to the notebook-level
        ``stop_words``.

    Returns
    -------
    tokens : list of str
        All kept tokens of all lines, in order of appearance.
    lines : list of list of str
        The kept tokens of each line, one inner list per input line.
    """
    # fall back to the notebook globals only when nothing was passed in, so
    # existing calls `process_text(column)` behave exactly as before
    if word_tokenizer is None:
        word_tokenizer = tokenizer
    if stop_word_list is None:
        stop_word_list = stop_words

    # built once: constant-time lookups instead of a list scan per token
    stop_word_set = set(stop_word_list)

    tokens = []
    lines = []
    for line in text:
        # lower-case each token a single time, then filter
        lowered = (tok.lower() for tok in word_tokenizer.tokenize(line))
        toks = [tok for tok in lowered if tok not in stop_word_set]
        tokens.extend(toks)
        lines.append(toks)

    return tokens, lines

In [76]:
# tokenise the lemmatised texts: one flat token list for the whole corpus
# plus one token list per row, which is stored on the dataframe
text_tokens, line_tokens = process_text(df["text_lemmatised"])
df["tokens"] = line_tokens

# count every token and tabulate the counts, most frequent word first
word_freq = pd.DataFrame(
    nltk.FreqDist(text_tokens).most_common(),
    columns=["Word", "Count"],
)

word_freq

Unnamed: 0,Word,Count
0,grass,207
1,artificial,123
2,lawn,66
3,garden,50
4,pet,50
...,...,...
968,lawncare,1
969,landscaper,1
970,developer,1
971,dense,1


In [77]:
## add provider count

# distinct tokens of each dataframe row, built once so that every membership
# check below is a constant-time set lookup instead of a scan of the row's
# token list (and without re-indexing the dataframe for every word/row pair)
row_token_sets = [set(row_tokens) for row_tokens in df["tokens"]]

# for every word: in how many rows it occurs at least once
# (presumably one row per provider -- verify against the raw data)
provider = [
    sum(word in row_tokens for row_tokens in row_token_sets)
    for word in word_freq["Word"]
]

word_freq["Provider_count"] = provider

#export output
word_freq.to_csv("word_frequency.csv", index=False)

### Histogram

In [78]:
# keep words that occur more than twice overall and in more than two rows
frequent_overall = word_freq["Count"].gt(2)
shared_across_rows = word_freq["Provider_count"].gt(2)
word_freq.loc[frequent_overall & shared_across_rows]

Unnamed: 0,Word,Count,Provider_count
0,grass,207,24
1,artificial,123,24
2,lawn,66,19
3,garden,50,19
4,pet,50,19
...,...,...,...
260,pleasing,3,3
261,stunning,3,3
262,hay,3,3
263,fever,3,3


In [81]:
# save the dataframe, including the derived text columns added above,
# without the index column
# NOTE(review): the "tokens" column holds Python lists; presumably these end
# up in the CSV as their string representation -- confirm before re-loading
df.to_csv("data-clean.csv", index=False)