### TASK 1: GATHERING REVIEWS

In [8]:
# Installing and importing the pandas and google-play-scraper libraries

!pip install google-play-scraper pandas
import pandas as pd
from google_play_scraper import app, Sort, reviews_all



In [9]:
# Apps chosen from the Play Store, as a mapping of display name -> package id.
# The package id (the dict value) is what identifies the app when scraping its reviews.

apps = {
    "Google Assistant": "com.google.android.apps.googleassistant",
    "Amazon Alexa": "com.amazon.dee.app",
    "Vision - Smart Voice Assistant": "com.visionforhome",
    "SoundHound Chat AI App": "com.hound.android.app",
    "Microsoft Copilot": "com.microsoft.copilot"
}

In [10]:
# Scrape the reviews of every chosen app and collect them as one flat list of rows

all_reviews = []  # One dict per scraped review, across all apps
total_count_of_reviews = 0  # Running total of reviews scraped so far

for app_name, app_id in apps.items():
    # Request the English / US reviews for this package, sorted with Sort.NEWEST
    reviews = reviews_all(
        app_id,
        sleep_milliseconds=0,
        lang='en',
        country='US',
        sort=Sort.NEWEST,
    )

    # Keep only the fields used later, tagging each row with its package id
    all_reviews.extend(
        {
            "Package name": app_id,
            "Reviewer name": review['userName'],
            "Review": review['content'],
            "Rating": review['score'],
        }
        for review in reviews
    )

    # Report the number of reviews scraped from this app and add it to the total
    scraped_count = len(reviews)
    print(f"Scraped {scraped_count} reviews from {app_id}")
    total_count_of_reviews += scraped_count

print(f"\nTotal reviews scraped from all the apps: {total_count_of_reviews}")

Scraped 140565 reviews from com.google.android.apps.googleassistant
Scraped 251094 reviews from com.amazon.dee.app
Scraped 710 reviews from com.visionforhome
Scraped 4894 reviews from com.hound.android.app
Scraped 75182 reviews from com.microsoft.copilot

Total reviews scraped from all the apps: 472445


In [11]:
# Save all the scraped reviews to a CSV file (the DataFrame index is not written)

output_file = "all_reviews_of_apps_chosen.csv"
df = pd.DataFrame(all_reviews)
df.to_csv(output_file, index=False)
print(f"All reviews saved to {output_file}")

All reviews saved to all_reviews_of_apps_chosen.csv


### TASK 2: PREPROCESSING THE TEXT/REVIEWS

In [12]:
# Installing and importing libraries for text preprocessing

!pip install num2words
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from num2words import num2words



In [15]:
# Reload the scraped reviews from the CSV file so preprocessing starts from the saved data
raw_reviews_file = "all_reviews_of_apps_chosen.csv"
df = pd.read_csv(raw_reviews_file)

# Built on first use and then shared by every call: re-reading the stop-word
# corpus and re-creating the lemmatizer for each review is needlessly slow
# when the function is applied to hundreds of thousands of rows.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on the first call only.

    If the NLTK 'stopwords' or 'wordnet' data is missing (NLTK raises
    LookupError), it is downloaded once so the notebook also runs on a fresh
    environment.
    """
    if not _NLP_RESOURCES:
        try:
            english_stop_words = stopwords.words('english')
        except LookupError:
            nltk.download('stopwords', quiet=True)
            english_stop_words = stopwords.words('english')

        lemmatizer = WordNetLemmatizer()
        try:
            # Probe call so that missing WordNet data surfaces here, once,
            # instead of in the middle of processing the reviews.
            lemmatizer.lemmatize('reviews')
        except LookupError:
            nltk.download('wordnet', quiet=True)

        _NLP_RESOURCES['stop_words'] = frozenset(english_stop_words)
        _NLP_RESOURCES['lemmatizer'] = lemmatizer

    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def _number_to_words(match):
    """Spell out the run of digits in ``match`` as space-separated words.

    Falls back to the original digits when the number cannot be converted.
    """
    digits = match.group()
    try:
        spelled = num2words(int(digits))
    except Exception:
        # Best effort: keep the digits. ``Exception`` (not a bare ``except``)
        # so that KeyboardInterrupt / SystemExit still stop a long run.
        return digits

    # num2words output contains '-' and ',' (e.g. "twenty-one",
    # "one thousand, two hundred"), which would put punctuation back into text
    # that was already stripped. Replace it with spaces, and pad the result so
    # the words do not fuse with adjacent letters ("5stars" -> "five stars").
    return ' ' + re.sub(r'[^\w\s]+', ' ', spelled) + ' '


def preprocess_reviews(text):
    """Normalise one review for text analysis.

    Steps, in order: strip punctuation, spell out numbers, lower-case,
    collapse whitespace, remove English stop words, lemmatize each word.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing review) is treated as "no review".

    Returns:
        The cleaned review as a single space-separated string, or ``None``
        when ``text`` is not a string or nothing is left after cleaning.
    """
    # Handles non string values
    if not isinstance(text, str):
        return None

    stop_words, lemmatizer = _get_nlp_resources()

    # Remove every character that is not a word character or whitespace.
    # Note: \w also matches '_', so underscores are kept.
    text = re.sub(r'[^\w\s]+', '', text)

    # Convert numbers to words
    text = re.sub(r'\d+', _number_to_words, text)

    # Lower-case, split on any whitespace (which also drops extra whitespace),
    # remove stop words and lemmatize what remains
    cleaned_words = [
        lemmatizer.lemmatize(word)
        for word in text.lower().split()
        if word not in stop_words
    ]

    # An empty result means the review had no usable content
    preprocessed_text = " ".join(cleaned_words)
    return preprocessed_text or None

# Clean every review, then drop the rows for which preprocess_reviews returned
# None (the review was not text, or nothing was left after cleaning).
df['Review'] = df['Review'].apply(preprocess_reviews)
df = df.dropna(subset=['Review'])

# Show a random sample of the result. The seed makes the displayed sample
# reproducible across runs, and the size is capped because DataFrame.sample
# raises ValueError when asked for more rows than the frame contains.
print("Sample Preprocessed Reviews:")
df.sample(n=min(50, len(df)), random_state=42)



Sample Preprocessed Reviews:


Unnamed: 0,Package name,Reviewer name,Review,Rating
305778,com.amazon.dee.app,A Google user,app good app really work like app,5
461942,com.microsoft.copilot,A Google user,excellent,5
105660,com.google.android.apps.googleassistant,A Google user,rajukumar,5
139491,com.google.android.apps.googleassistant,A Google user,clever,4
167539,com.amazon.dee.app,A Google user,uploading private photo without permission,1
333537,com.amazon.dee.app,A Google user,horrible,1
365506,com.amazon.dee.app,A Google user,awsm,5
416011,com.microsoft.copilot,A Google user,good,1
9993,com.google.android.apps.googleassistant,A Google user,nice nice,5
274997,com.amazon.dee.app,A Google user,good time alexa couldnt understand word,5


In [16]:
# Saving the preprocessed reviews to a csv file (the DataFrame index is not written)
preprocessed_reviews_file = "Preprocessed_reviews.csv"
df.to_csv(preprocessed_reviews_file, index=False)