### 1.0 Chelsea App Review Scraping from the Google Play Store

   ###### 1.1. Package installation  
   ###### 1.2. Library imports
   ###### 1.3. Review scraping
   ###### 1.4. Sentiment analysis 
   ###### 1.5. Exploratory data analysis

In [1]:
# all necessary packages were installed using the pip command 
!pip install -q google_play_scraper 

!pip install -q transformers

!pip install -q plotly.express

!pip install tensorflow

!pip3 install torch torchvision torchaudio

In [26]:
!pip install pyyaml

In [6]:
# important libraries were imported 

import numpy as np # for working with arrays

import pandas as pd # for data manipulation and analysis 

import json # for working with JSON files

import plotly.express as px #visualization library

from wordcloud import WordCloud

import matplotlib.pyplot as plt # visualization library

from google_play_scraper  import app, Sort, reviews_all # for scrapping reviews

In [10]:
# Scrape all reviews of the Chelsea FC app ('com.chelseafc.the5thstand'):
# English language, US storefront, newest reviews first.
chelsea_app = reviews_all(
    'com.chelseafc.the5thstand',
    lang='en',
    country='US',
    sort=Sort.NEWEST,
    sleep_milliseconds=0,
)

In [12]:
# Show how many reviews were scraped and only the first few records; echoing
# the whole list floods the notebook output and stores every reviewer's
# userName in the saved file.
print(len(chelsea_app))
chelsea_app[:3]

In [13]:
 # the scraped reviews are flattened into a pandas DataFrame (not a CSV) so they are easier to work with
    
df = pd.json_normalize(chelsea_app) 

In [None]:
# Show every column and every row when a DataFrame is displayed.
# pandas uses `None` for "no limit"; `False` is just the integer 0, which does
# not mean "unlimited".
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [14]:
# Drop the columns that are irrelevant to the analysis and rename the rest.
# The result is re-bound instead of mutated in place, and errors='ignore' lets
# the cell be re-run without a KeyError once the columns are already gone.
df = (
    df
    .drop(columns=['userImage', 'replyContent', 'repliedAt'], errors='ignore')
    .rename(columns={'score': 'rating', 'at': 'time', 'reviewCreatedVersion': 'version_reviewed'})
)

df.head() # preview dataframe

Unnamed: 0,reviewId,userName,content,rating,thumbsUpCount,version_reviewed,time
0,75f1e09c-8100-4ef0-86d0-60ab89280465,Dan Maluk,"Nice app, quality videos, great",5,0,1.61.0,2022-11-06 20:41:03
1,57b7941e-8cad-45e4-8803-619edbe33bb2,Afolabi Motunrayo,Good,5,0,,2022-11-06 12:55:23
2,86fb6300-6cb8-4d57-8404-5e414e74edcd,Ronald muwata,its an amazing app,5,0,,2022-11-06 11:14:39
3,7b06b804-6ef6-4a28-838e-e49a492fb709,Joseph Okparaejiego Lyon (Dreezy),What happened to play predict and fans chat? T...,1,0,2.0.4,2022-11-06 07:33:14
4,bd9ecf6b-8c55-4e5f-9e4b-053137098715,Uwimana Racine,chelsea3 arsenal1,5,0,2.0.3,2022-11-06 05:24:13


In [15]:
# Keep only the fields needed for the analysis.
# .copy() makes df_playstore an independent DataFrame rather than a slice of
# `df`, so the columns assigned to it later do not trigger pandas'
# SettingWithCopyWarning.
df_playstore = df[['reviewId', 'userName', 'content', 'rating','thumbsUpCount', 'version_reviewed', 'time']].copy()

In [16]:
# Preview the first rows of the selected columns
df_playstore.head()

Unnamed: 0,reviewId,userName,content,rating,thumbsUpCount,version_reviewed,time
0,75f1e09c-8100-4ef0-86d0-60ab89280465,Dan Maluk,"Nice app, quality videos, great",5,0,1.61.0,2022-11-06 20:41:03
1,57b7941e-8cad-45e4-8803-619edbe33bb2,Afolabi Motunrayo,Good,5,0,,2022-11-06 12:55:23
2,86fb6300-6cb8-4d57-8404-5e414e74edcd,Ronald muwata,its an amazing app,5,0,,2022-11-06 11:14:39
3,7b06b804-6ef6-4a28-838e-e49a492fb709,Joseph Okparaejiego Lyon (Dreezy),What happened to play predict and fans chat? T...,1,0,2.0.4,2022-11-06 07:33:14
4,bd9ecf6b-8c55-4e5f-9e4b-053137098715,Uwimana Racine,chelsea3 arsenal1,5,0,2.0.3,2022-11-06 05:24:13


In [None]:
# in order to install latest TensorsorFlow, pip version 19.0 or above is required.
pip install --upgrade pip

In [17]:
# imported deep learning libraries 

from transformers import pipeline
import tensorflow

In [20]:
# Build a sentiment-analysis pipeline backed by the pre-trained
# 'siebert/sentiment-roberta-large-english' model from Hugging Face.
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)

In [None]:
# Inspect the data type of every column
df_playstore.dtypes

In [18]:
# The sentiment analysis runs on text, so the 'content' column is cast to str.
# `assign` returns a new DataFrame instead of writing into what may be a slice
# of `df`, which avoids pandas' SettingWithCopyWarning.
df_playstore = df_playstore.assign(content=df_playstore['content'].astype('str'))

In [25]:
# Add a 'sentiments' column holding the raw pipeline result for each review
# (a list with one {'label': ..., 'score': ...} dict).
# truncation=True clips reviews longer than the model's maximum input length
# instead of letting the pipeline fail on them; `assign` avoids writing into a
# possible slice of `df`.
df_playstore = df_playstore.assign(
    sentiments=df_playstore['content'].apply(
        lambda review: sentiment_analysis(review, truncation=True)
    )
)

In [28]:
df_playstore.head()  # preview the DataFrame, now including the 'sentiments' column

Unnamed: 0,reviewId,userName,content,rating,thumbsUpCount,version_reviewed,time,sentiments
0,75f1e09c-8100-4ef0-86d0-60ab89280465,Dan Maluk,"Nice app, quality videos, great",5,0,1.61.0,2022-11-06 20:41:03,"[{'label': 'POSITIVE', 'score': 0.998813867568..."
1,57b7941e-8cad-45e4-8803-619edbe33bb2,Afolabi Motunrayo,Good,5,0,,2022-11-06 12:55:23,"[{'label': 'POSITIVE', 'score': 0.998415231704..."
2,86fb6300-6cb8-4d57-8404-5e414e74edcd,Ronald muwata,its an amazing app,5,0,,2022-11-06 11:14:39,"[{'label': 'POSITIVE', 'score': 0.998809099197..."
3,7b06b804-6ef6-4a28-838e-e49a492fb709,Joseph Okparaejiego Lyon (Dreezy),What happened to play predict and fans chat? T...,1,0,2.0.4,2022-11-06 07:33:14,"[{'label': 'NEGATIVE', 'score': 0.999450623989..."
4,bd9ecf6b-8c55-4e5f-9e4b-053137098715,Uwimana Racine,chelsea3 arsenal1,5,0,2.0.3,2022-11-06 05:24:13,"[{'label': 'POSITIVE', 'score': 0.964373111724..."


In [29]:
# Unpack the pipeline result stored in 'sentiments' into two flat columns:
# 'sentiment' (the predicted label) and 'score' (the score reported with it).
# `assign` builds a new DataFrame instead of writing into a possible slice of
# `df`, which avoids pandas' SettingWithCopyWarning.
df_playstore = df_playstore.assign(
    sentiment=df_playstore['sentiments'].apply(lambda result: result[0]['label']),
    score=df_playstore['sentiments'].apply(lambda result: result[0]['score']),
)

In [30]:
# The raw 'sentiments' column is redundant once its label and score have been
# extracted. Re-binding the result (no inplace=True) together with
# errors='ignore' keeps the cell safe to re-run after the column is gone.
df_playstore = df_playstore.drop(columns=['sentiments'], errors='ignore')

In [31]:
# Preview the DataFrame after the raw 'sentiments' column was removed
df_playstore.head()

Unnamed: 0,reviewId,userName,content,rating,thumbsUpCount,version_reviewed,time,sentiment,score
0,75f1e09c-8100-4ef0-86d0-60ab89280465,Dan Maluk,"Nice app, quality videos, great",5,0,1.61.0,2022-11-06 20:41:03,POSITIVE,0.998814
1,57b7941e-8cad-45e4-8803-619edbe33bb2,Afolabi Motunrayo,Good,5,0,,2022-11-06 12:55:23,POSITIVE,0.998415
2,86fb6300-6cb8-4d57-8404-5e414e74edcd,Ronald muwata,its an amazing app,5,0,,2022-11-06 11:14:39,POSITIVE,0.998809
3,7b06b804-6ef6-4a28-838e-e49a492fb709,Joseph Okparaejiego Lyon (Dreezy),What happened to play predict and fans chat? T...,1,0,2.0.4,2022-11-06 07:33:14,NEGATIVE,0.999451
4,bd9ecf6b-8c55-4e5f-9e4b-053137098715,Uwimana Racine,chelsea3 arsenal1,5,0,2.0.3,2022-11-06 05:24:13,POSITIVE,0.964373


In [40]:
# Save the DataFrame as CSV.
# Only forward slashes are used in the path: a backslash before 'D' is an
# invalid escape sequence in a normal string literal.
df_playstore.to_csv('C:/Users/User/Desktop/CSV and XSLX/sentiments.csv')

###### Create Checkpoint 

In [52]:
# Checkpoint: continue on an independent copy so the exploratory analysis
# cannot alter df_playstore. Only the first rows are displayed rather than the
# whole frame.
df_playstore2 = df_playstore.copy()
df_playstore2.head()

##### Exploratory Analysis of data from Playstore

In [56]:
# Percentage of reviews per sentiment label.
# The headline sentence is built from the computed values instead of being
# hard-coded, so it stays correct (and correctly rounded) when the data changes.
sentiment_share = df_playstore2['sentiment'].value_counts(normalize=True) * 100

# .get() falls back to 0 when a label does not occur in the data at all.
positive_pct = sentiment_share.get('POSITIVE', 0.0)
negative_pct = sentiment_share.get('NEGATIVE', 0.0)

print(f'{positive_pct:.0f}% of reviews dropped were Positive and just {negative_pct:.0f}% were Negative')

print(sentiment_share)


92% of reviews dropped were Positive and just 7% were Negative
POSITIVE    92.157432
NEGATIVE     7.842568
Name: sentiment, dtype: float64


In [57]:
# Histogram of review counts per sentiment label, coloured by label,
# with the counts printed on the bars.
fig = px.histogram(
    df_playstore2,
    x='sentiment',
    color='sentiment',
    text_auto=True,
    width=700,
    height=400,
)

fig.show()

In [58]:
# Keep only the reviews labelled NEGATIVE
is_negative = df_playstore2['sentiment'] == 'NEGATIVE'
negative = df_playstore2.loc[is_negative]

# Mean rating of the negative reviews, rounded to two decimals
round(negative['rating'].mean(), 2)

3.32

In [62]:
# Text of the negative reviews only (the 'content' column as a Series)
negative_revs = negative.loc[:, 'content']

negative_revs.head()  # preview

In [60]:
# Build a one-word-per-row table from the negative reviews so the most
# frequently repeated words can be analysed.

n = negative['content'].str.split().tolist() # each review as a list of its whitespace-separated words

ns = pd.Series(n) # one list of words per review

# explode() emits one row per word directly; it avoids building the padded wide
# DataFrame (one column per word position) that apply(pd.Series).stack() needs,
# and does not depend on stack() dropping the NaN padding.
# dropna() removes the placeholder row explode() produces for an empty review.
s = ns.explode().dropna().reset_index(drop=True)

d = s.to_frame(name='words') # single-column DataFrame of words



In [None]:
# Export the word table as a CSV file
wordcloud_csv = 'C:/Users/User/Desktop/CSV and XSLX/Chelsea App/wordcloud_playstore.csv'
d.to_csv(wordcloud_csv)