In [1]:
import sys
import os
import random
from operator import add, mul
from pyspark import SparkContext, SparkConf
from pyspark import SparkFiles
from pyspark.sql import SparkSession, SQLContext

In [2]:
spark = SparkSession.builder.appName("").getOrCreate() #

In [3]:
sc = spark.sparkContext
spark.sparkContext.applicationId

u'application_1528714825862_139422'

### 1. Load csv into spark as a text file

In [4]:
location = "/AppleStore.csv"
apple = sc.textFile(location)

### 2. Parse the data

In [5]:
import re
pattern = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
apple_lines = apple.map(lambda line: re.split(pattern, line)).filter((lambda cols: cols[0]!='""'))
apple_lines.count()

7197

### 3. MB и GB в новую колонку

In [6]:
MB = apple_lines.map(lambda x: float(x[3]) / 1000.)
GB = MB.map(lambda x: x / 1000.)

### 4. Список из 10 самых топовых приложений

In [8]:
# id, track_name, rating_count_ver
apple_lines.sortBy(lambda line: float(line[7]), ascending=False).map(lambda line: (line[2], line[7])).take(10)

[(u'"Infinity Blade"', u'177050'),
 (u'"Geometry Dash Meltdown"', u'117470'),
 (u'"My Verizon"', u'107245'),
 (u'"Real Basketball"', u'94315'),
 (u'"Zillow Real Estate - Homes for Sale & for Rent"', u'88478'),
 (u'"WhatsApp Messenger"', u'73088'),
 (u'"Clear Vision (17+)"', u'69225'),
 (u'"Guess My Age \ue020 Math Magic"', u'68841'),
 (u'"Trigger Fist"', u'58269'),
 (u'"Zappos: shop shoes & clothes, fast free shipping"', u'39452')]

### 5. Разница в кол-ве скриншотов между самыми рейтинговыми и не популярными (по рейтингу) приложениями

In [9]:
# "ipadSc_urls.num"

# рейтинг больше 3х
highest = apple_lines.filter(lambda line: float(line[8]) > 3.0)
higest_sc = highest.map(lambda line: float(line[14])).mean()
lowest = apple_lines.filter(lambda line: float(line[8]) < 3.0)
lowest_sc = lowest.map(lambda line: float(line[14])).mean()

print(higest_sc - lowest_sc)

1.3675834041


### 6. Сколько топовых приложений поддерживает мультиязычность

In [10]:
# lang.num
total_higest = highest.count()
print(total_higest)
highest_multiple = highest.map(lambda line: float(line[15])).filter(lambda value: value > 1).count()
print(str(highest_multiple * 100 / total_higest) + " %")

5483
50 %


### 8. Сравните статистики по разным группам приложений (на пример по жанрам)


In [54]:
genre_prices = apple_lines.map(lambda line: (line[12], float(line[5])))
grp = genre_prices.groupByKey()
print(list((j[0], sum(list(j[1]))/len(list(j[1])) ) for j in grp.take(10)))

[(u'"Travel"', 1.1203703703703702), (u'"Food & Drink"', 1.552380952380952), (u'"Photo & Video"', 1.4732951289398313), (u'"Music"', 4.835434782608699), (u'"Reference"', 4.836875000000004), (u'"Business"', 5.116315789473686), (u'"Navigation"', 4.1247826086956545), (u'"Lifestyle"', 0.8854166666666657), (u'"Social Networking"', 0.3398802395209582), (u'"Productivity"', 4.330561797752813)]


In [None]:
# посмотрите ещё статистики