# **Scrape Apple App Store Reviews**

In [None]:
pip install requests pandas tqdm numpy

In [None]:
pip install requests

In [None]:
import random
import requests
print(f"requests=={requests.__version__}")
import re
import time
from tqdm import tqdm
import sys
import numpy as np
import pandas as pd


In [None]:
pip install app-store-web-scraper

In [None]:
from app_store_web_scraper import AppStoreSession, AppStoreEntry

# Two-letter storefront codes whose reviews are collected.
country_codes = ['us', 'in', 'gb', 'ca', 'au', 'ae', 'de', 'fr', 'it', 'sg']
all_reviews = []  # accumulates one plain dict per review, across all storefronts

for country_code in country_codes:
    try:
        # A new session is created for every storefront.
        session = AppStoreSession()
        entry = AppStoreEntry("726070762", country=country_code, session=session)  # update your app ID here
        reviews = list(entry.reviews())  # materialised so the count can be printed below

        for review_obj in reviews:
            # Copy the review's fields into a dict: instance attributes when the
            # object has a __dict__, otherwise whatever _asdict() returns.
            record = (
                review_obj.__dict__.copy()
                if hasattr(review_obj, "__dict__")
                else dict(review_obj._asdict())
            )
            record['country'] = country_code  # tag each review with its storefront
            all_reviews.append(record)

        print(f"Fetched {len(reviews)} from {country_code}")
    except Exception as err:
        # Best effort: report the failing storefront and continue with the next one.
        print(f"Failed for country: {country_code} => {str(err)}")

Remove customer-identifying fields (`user_name` and `id`) from the extracted reviews.

In [None]:
# Copy every review dict, leaving out the 'user_name' and 'id' entries.
filtered_reviews = [
    {
        field: value
        for field, value in raw_review.items()
        if field not in ('user_name', 'id')
    }
    for raw_review in all_reviews
]

In [None]:
# Show how many reviews are in the filtered list.
len(filtered_reviews)

Convert the filtered reviews into a pandas DataFrame.

In [None]:
from datetime import datetime
# Reshape each filtered review into the flat record layout used for the table.
# Missing fields fall back to '' (text fields), 0 (rating) or the current time
# (date); the two reply columns are always None.
reviews_data = [
    {
        'content': review.get('content', ''),
        'score': review.get('rating', 0),
        'at': review.get('date', datetime.now()),
        'title': review.get('title', ''),
        'reviewCreatedVersion': review.get('app_version', ''),
        'Country': review.get('country', ''),
        'replyContent': None,
        'repliedAt': None,
    }
    for review in filtered_reviews
]

reviews_df = pd.DataFrame(reviews_data)

In [None]:
# Display the reviews DataFrame as the cell output.
reviews_df

In [None]:
# Write the reviews table to 'Appstore_reviews.csv' (relative path), omitting the index column.
reviews_df.to_csv('Appstore_reviews.csv', index=False)

In [None]:
pip freeze > requirements.txt

In [None]:
pip install textblob 

In [None]:
pip uninstall numpy scipy -y


In [None]:
pip install numpy scipy 

In [None]:
pip install --upgrade nltk


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import re

def _categorize_rating(score):
    """Categorize rating into groups"""
    if score >= 4:
        return 'Positive'
    elif score >= 3:
        return 'Neutral'
    else:
        return 'Negative'

def _clean_text(text):
    """Clean text for analysis"""
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^a-zA-Z0-9\s.,!?-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def _ensure_sentence_tokenizer():
    """Download NLTK's sentence-tokenizer data if it is not available yet."""
    import nltk  # local import: the surrounding cell only imports names from nltk submodules

    try:
        sent_tokenize("Probe sentence.")
    except LookupError:
        # Recent NLTK releases look up 'punkt_tab', older ones 'punkt'; request both.
        # If the download fails, the later sent_tokenize call raises LookupError as before.
        for resource in ("punkt_tab", "punkt"):
            nltk.download(resource, quiet=True)


def _count_sentences(text):
    """Return the number of sentences in ``text``; non-string values (e.g. NaN) count as 0."""
    if not isinstance(text, str):
        return 0
    return len(sent_tokenize(text))


def preprocess_reviews(df=None):
    """Minimal preprocessing optimized for LLM analysis.

    Works on a copy of the input and adds these columns:
    ``content_clean``, ``review_length``, ``word_count``, ``sentence_count``,
    ``review_date``, ``year``, ``month``, ``day_of_week`` and ``rating_category``.

    Args:
        df: DataFrame with at least 'content', 'score' and 'at' columns.
            When omitted, the notebook-level ``reviews_df`` is used.

    Returns:
        The enriched copy, or ``None`` when there is nothing to preprocess.
    """
    print("Preprocessing reviews...")

    source_df = reviews_df if df is None else df
    if source_df is None or source_df.empty:
        print("No reviews to preprocess")
        return None

    # Make sure sent_tokenize can run on a fresh kernel before touching the data.
    _ensure_sentence_tokenizer()

    processed_df = source_df.copy()
    processed_df['content_clean'] = processed_df['content'].apply(_clean_text)

    # Length statistics are computed on the raw (uncleaned) content.
    processed_df['review_length'] = processed_df['content'].str.len()
    processed_df['word_count'] = processed_df['content'].str.split().str.len()
    # Missing content yields 0 sentences instead of raising, in line with
    # _clean_text, which also tolerates missing values.
    processed_df['sentence_count'] = processed_df['content'].apply(_count_sentences)

    # NOTE(review): if 'at' mixes different UTC offsets this may not produce a
    # datetime column and the .dt accessors below would fail - confirm upstream.
    processed_df['review_date'] = pd.to_datetime(processed_df['at'])
    processed_df['year'] = processed_df['review_date'].dt.year
    processed_df['month'] = processed_df['review_date'].dt.month
    processed_df['day_of_week'] = processed_df['review_date'].dt.day_name()

    processed_df['rating_category'] = processed_df['score'].apply(_categorize_rating)
    print(f"✓ Preprocessed {len(processed_df)} reviews")
    return processed_df


In [None]:
# Run the preprocessing step; the returned DataFrame is shown as the cell
# output but is not assigned to a variable.
preprocess_reviews()