# Top 100 AppIDs by review count

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col
import time

In [3]:
# Start (or reuse) a Spark session running in local mode.
spark = SparkSession.builder.appName("Top 100 Apps By Reviews").master('local[*]').getOrCreate()

# Every input line becomes one row with a single string column named "value".
line_df = spark.read.text('review_lengths.txt')

# Column expressions for the two numeric fields embedded in each line:
# the digits that follow "appId" and the digits that follow the colon.
app_id_expr = regexp_extract(col("value"), r"appId\s+(\d+)", 1).cast("int")
review_count_expr = regexp_extract(col("value"), r":\s+(\d+)", 1).cast("long")

parsed_df = line_df.select(
    app_id_expr.alias("appId"),
    review_count_expr.alias("review_count"),
)

# Rank by review count, largest first, and keep the first 100 rows.
top100_df = parsed_df.sort(col("review_count").desc()).limit(100)

# Print up to 100 rows without truncating cell contents, then stop the session.
top100_df.show(100, truncate=False)
spark.stop()

+------+------------+
|appId |review_count|
+------+------------+
|322330|453500      |
|291550|383839      |
|444090|346430      |
|304930|343298      |
|284160|314513      |
|322170|278092      |
|261550|261961      |
|107410|222400      |
|264710|210400      |
|238960|209999      |
|294100|207592      |
|393380|206625      |
|311210|190500      |
|435150|184200      |
|268910|182149      |
|48700 |165776      |
|49520 |152400      |
|304390|149355      |
|244210|148438      |
|262060|147687      |
|444200|147505      |
|70    |146672      |
|250900|139200      |
|292030|138995      |
|379430|138100      |
|319630|137600      |
|433340|136456      |
|391220|135114      |
|270880|133391      |
|306130|133100      |
|438100|131700      |
|244850|128887      |
|218620|127397      |
|204360|124729      |
|323190|121988      |
|620   |121900      |
|211820|121649      |
|412020|119798      |
|10    |117100      |
|239140|115598      |
|220200|114699      |
|219740|109354      |
|394360|10

In [4]:
import time

def parse_line(line):
    """
    Parse one summary line into its two numeric fields.

    The expected layout is six whitespace-separated tokens, for example
    "Total reviews for appId 10: 117100": the fifth token is the app id
    with a trailing colon and the sixth token is the review count.

    Returns:
        A tuple (app_id, review_count), both converted to int.

    Raises:
        IndexError: if the line has too few tokens.
        ValueError: if either field is not a valid integer.
    """
    # A no-argument split() already ignores leading/trailing whitespace.
    tokens = line.split()
    # Token 4 looks like "10:" -> drop the colon; token 5 is the count.
    return int(tokens[4].rstrip(':')), int(tokens[5])

def top_n_apps_from_file(file_path, n=100):
    """
    Read a review-count summary file and return its highest-count entries.

    Whitespace-only lines are skipped; every other line is handed to
    parse_line, so any exception it raises propagates to the caller.

    Args:
        file_path: path of the text file to read.
        n: maximum number of entries to return (default 100).

    Returns:
        A list of (app_id, review_count) tuples ordered by review_count,
        largest first, cut to the first n entries. Entries with equal
        counts keep the order in which they appear in the file.
    """
    with open(file_path, 'r') as handle:
        records = [parse_line(raw) for raw in handle if raw.strip()]
    # sorted() is stable, so ties retain file order even with reverse=True.
    ranked = sorted(records, key=lambda record: record[1], reverse=True)
    return ranked[:n]

# Driver: rank the apps from review_lengths.txt and report how long the ranking took.
if __name__ == "__main__":
    # Time only the read-parse-sort step; the clock stops before anything is printed.
    start_time = time.time()
    top100 = top_n_apps_from_file("review_lengths.txt", n=100)
    elapsed = time.time() - start_time

    # len(top100) is reported because the file may hold fewer than 100 entries.
    print(f"Top {len(top100)} apps by review count:")
    for app_id, count in top100:
        print(f"AppId {app_id}: {count}")

    print(f"\nElapsed time: {elapsed:.4f} seconds")


Top 100 apps by review count:
AppId 322330: 453500
AppId 291550: 383839
AppId 444090: 346430
AppId 304930: 343298
AppId 284160: 314513
AppId 322170: 278092
AppId 261550: 261961
AppId 107410: 222400
AppId 264710: 210400
AppId 238960: 209999
AppId 294100: 207592
AppId 393380: 206625
AppId 311210: 190500
AppId 435150: 184200
AppId 268910: 182149
AppId 48700: 165776
AppId 49520: 152400
AppId 304390: 149355
AppId 244210: 148438
AppId 262060: 147687
AppId 444200: 147505
AppId 70: 146672
AppId 250900: 139200
AppId 292030: 138995
AppId 379430: 138100
AppId 319630: 137600
AppId 433340: 136456
AppId 391220: 135114
AppId 270880: 133391
AppId 306130: 133100
AppId 438100: 131700
AppId 244850: 128887
AppId 218620: 127397
AppId 204360: 124729
AppId 323190: 121988
AppId 620: 121900
AppId 211820: 121649
AppId 412020: 119798
AppId 10: 117100
AppId 239140: 115598
AppId 220200: 114699
AppId 219740: 109354
AppId 394360: 106697
AppId 440900: 106436
AppId 356190: 106013
AppId 219150: 105182
AppId 220: 103800