In [1]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
import spacy
# !python3 -m spacy download en_core_web_sm
import sklearn

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/qiting/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Load the raw review data from a CSV file located relative to the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the
# CSV was written with its index; consider index_col=0 if that column is not needed - confirm.
reviews = pd.read_csv('raw_reviews.csv')

In [3]:
# Preview the first rows of the freshly loaded frame to check its columns.
reviews.head()

Unnamed: 0,Sentiment,Time,Text
0,positive,18/6/21,This is a very healthy dog food. Good for thei...
1,positive,7/7/21,I've been very pleased with the Natural Balanc...
2,positive,18/6/21,"Before I was educated about feline nutrition, ..."
3,positive,7/7/21,"My holistic vet recommended this, along with a..."
4,positive,1/7/21,I bought this coffee because its much cheaper ...


In [4]:
def get_basic_tokens(text):
    """Normalise a raw review string and split it into word tokens.

    Steps, in order: lower-case the text, strip HTML line-break tags,
    drop apostrophes (so "I've" becomes "ive"), drop digits, and finally
    collect every maximal run of word characters as a token.

    Parameters
    ----------
    text : str
        Raw review text; may contain ``<br>``, ``</br>``, ``<br/>`` or
        ``<br />`` tags.

    Returns
    -------
    list of str
        Lower-cased tokens with no empty strings. An empty or
        punctuation-only input yields an empty list.
    """
    text = text.lower()

    # Strip only real line-break tags. The previous pattern r"br" deleted the
    # letters "br" from every word ("brand" -> "and", "brown" -> "own").
    # A space is substituted so the words on either side of a tag stay separate.
    text = re.sub(r"<\s*/?\s*br\s*/?\s*>", ' ', text)

    # Remove ' from 's, 've, n't so contractions stay a single token.
    text = re.sub(r"'", '', text)

    # Remove numbers.
    text = re.sub(r"\d+", '', text)

    # Maximal runs of word characters; equivalent to splitting on every
    # non-word character and discarding the empty pieces.
    return re.findall(r"\w+", text)

In [5]:
# Tokenise every review, then record how many tokens each one produced.
reviews['tokens'] = reviews['Text'].apply(get_basic_tokens)
reviews['num_tokens_original'] = reviews['tokens'].apply(len)

In [6]:
# Check the new 'tokens' and 'num_tokens_original' columns.
reviews.head()

Unnamed: 0,Sentiment,Time,Text,tokens,num_tokens_original
0,positive,18/6/21,This is a very healthy dog food. Good for thei...,"[this, is, a, very, healthy, dog, food, good, ...",25
1,positive,7/7/21,I've been very pleased with the Natural Balanc...,"[ive, been, very, pleased, with, the, natural,...",62
2,positive,18/6/21,"Before I was educated about feline nutrition, ...","[before, i, was, educated, about, feline, nutr...",127
3,positive,7/7/21,"My holistic vet recommended this, along with a...","[my, holistic, vet, recommended, this, along, ...",93
4,positive,1/7/21,I bought this coffee because its much cheaper ...,"[i, bought, this, coffee, because, its, much, ...",75


In [7]:
# Probe the tagger's case sensitivity: as the output shows, capitalised 'Bigger'
# is tagged NNP while lower-case 'bigger' is tagged JJR.
nltk.pos_tag(['biG', 'Bigger','bigger'])

[('biG', 'JJ'), ('Bigger', 'NNP'), ('bigger', 'JJR')]

In [8]:
# Part-of-speech tags (Penn Treebank tagset) counted below:
# JJ  adjective – ‘big’
# JJR adjective, comparative – ‘bigger’
# JJS adjective, superlative – ‘biggest’
# RB  adverb – ‘very’, ‘silently’
# RBR adverb, comparative – ‘better’
# RBS adverb, superlative – ‘best’

def _count_tags(tokens, wanted_tags):
    """Return how many tokens receive a POS tag contained in ``wanted_tags``.

    ``tokens`` is passed as-is to ``nltk.pos_tag``, which yields
    ``(token, tag)`` pairs; only the tag is inspected.
    """
    return sum(1 for _, tag in nltk.pos_tag(tokens) if tag in wanted_tags)


def get_adj_pos_count(tokens):
    """Count the tokens tagged as adjectives (JJ, JJR or JJS).

    Parameters
    ----------
    tokens : list of str
        Tokens of a single review.

    Returns
    -------
    int
        Number of tokens whose tag is JJ, JJR or JJS.
    """
    return _count_tags(tokens, ('JJ', 'JJR', 'JJS'))


def get_adv_pos_count(tokens):
    """Count the tokens tagged as adverbs (RB, RBR or RBS).

    Parameters
    ----------
    tokens : list of str
        Tokens of a single review.

    Returns
    -------
    int
        Number of tokens whose tag is RB, RBR or RBS.
    """
    return _count_tags(tokens, ('RB', 'RBR', 'RBS'))

In [9]:
# Per-review adjective and adverb counts derived from the token lists.
# NOTE: each helper runs the POS tagger itself, so every review is tagged twice here.
token_lists = reviews['tokens']
reviews['adj_cnt'] = token_lists.apply(get_adj_pos_count)
reviews['adv_cnt'] = token_lists.apply(get_adv_pos_count)

In [10]:
# Check the new 'adj_cnt' and 'adv_cnt' columns.
reviews.head()

Unnamed: 0,Sentiment,Time,Text,tokens,num_tokens_original,adj_cnt,adv_cnt
0,positive,18/6/21,This is a very healthy dog food. Good for thei...,"[this, is, a, very, healthy, dog, food, good, ...",25,4,2
1,positive,7/7/21,I've been very pleased with the Natural Balanc...,"[ive, been, very, pleased, with, the, natural,...",62,17,2
2,positive,18/6/21,"Before I was educated about feline nutrition, ...","[before, i, was, educated, about, feline, nutr...",127,22,6
3,positive,7/7/21,"My holistic vet recommended this, along with a...","[my, holistic, vet, recommended, this, along, ...",93,15,10
4,positive,1/7/21,I bought this coffee because its much cheaper ...,"[i, bought, this, coffee, because, its, much, ...",75,12,7
