## Install and import libraries

In [1]:
pip install google-play-scraper nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import nltk
import re

from google_play_scraper import Sort, reviews_all
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
nltk.download('punkt')
nltk.download("vader_lexicon")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\WT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\WT\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

## Scrape google play user reviews

In [4]:
app_reviews = reviews_all(
        "my.com.tngdigital.ewallet",
        sleep_milliseconds=0, # defaults to 0
        lang='en', # defaults to 'en'
        country='us', # defaults to 'us'
        sort=Sort.NEWEST, # defaults to Sort.MOST_RELEVANT
    )

In [5]:
df = pd.DataFrame(np.array(app_reviews),columns=['review'])
df2 = df.join(pd.DataFrame(df.pop('review').tolist()))

In [6]:
reviews_df = df2[['content', 'score']]

## Text Preprocessing

In [7]:
reviews_df['content'] = reviews_df['content'].astype(str)
reviews_df['content']= reviews_df['content'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df['content'] = reviews_df['content'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_df['content']= reviews_df['content'].apply(lambda x: x.lower())


In [8]:
extracted_text=[]

def extract_reviews(s):
    expe = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)
    return expe.sub(r'',s)

for y in reviews_df['content']:
    extracted_text.append(str(extract_reviews(y)))

In [9]:
filtered_text = []
for s in extracted_text:
  filtered_text.append(re.sub('[^a-z *]', "", s.lower()))

In [10]:
reviews_df = reviews_df.join(pd.DataFrame(filtered_text))
reviews_df = reviews_df.drop('content', axis = 1)
reviews_df.columns = ['score', 'content']

In [11]:
sw = []
with open('stopwords.txt', 'r') as f:
  sw.append(f.read().splitlines())

In [12]:
stop_words = stopwords.words('english')
stop_words.extend(sw[0])
stop_words.extend(['pls', 'want', 'n', 'nt', 'tak', 'cant', 'nak', 'x', 'ic'])

In [13]:
filtered_sentence = []
for s in reviews_df['content']:
  words = s.split()
  l = []
  for word in words:
    if(not word in stop_words):
      l.append(word)
  filtered_sentence.append(' '.join(l))

In [14]:
reviews_df = reviews_df.join(pd.DataFrame(filtered_sentence))
reviews_df = reviews_df.drop('content', axis = 1)
reviews_df.columns = ['score', 'content']

In [15]:
pos_reviews = reviews_df[reviews_df['score'] > 3]['content']
neg_reviews = reviews_df[reviews_df['score'] < 3]['content']

In [16]:
with open('pos_corpus.txt', 'w') as f:
  for w in pos_reviews:
    f.write("%s " % w)

In [17]:
with open('neg_corpus.txt', 'w') as f:
  for w in neg_reviews:
    f.write("%s " % w)

## Reference
1. Additional stopwords list: https://www.ranks.nl/stopwords