## 1. Установим соединение со SPARK

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, col
from pyspark.sql.types import *
from os.path import abspath

spark = SparkSession \
    .builder \
    .appName("pyspark-notebook") \
    .master("spark://spark-master:7077") \
    .config("spark.executor.memory", "512m") \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

24/10/17 08:23:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## 2. Load Data

In [2]:
!curl -sSLo data.csv 'https://davidmegginson.github.io/ourairports-data/airports.csv'

In [3]:
data = spark.read.csv(path="data.csv", sep=",", header=True)

                                                                                

In [5]:
data.count()

80864

In [6]:
data.head()

Row(id='6523', ident='00A', type='heliport', name='Total RF Heliport', latitude_deg='40.070985', longitude_deg='-74.933689', elevation_ft='11', continent='NA', iso_country='US', iso_region='US-PA', municipality='Bensalem', scheduled_service='no', gps_code='K00A', iata_code=None, local_code='00A', home_link='https://www.penndot.pa.gov/TravelInPA/airports-pa/Pages/Total-RF-Heliport.aspx', wikipedia_link=None, keywords=None)

## 3. Обработка данных с использованием pyspark.sql.dataframe

In [7]:
df1 = data.select(["id", "name", "municipality", "iso_country"])

In [8]:
df1.head()

Row(id='6523', name='Total RF Heliport', municipality='Bensalem', iso_country='US')

In [9]:
cols_description = df1.filter(df1['iso_country'] == 'US')

In [10]:
df1 = df1.join(other=cols_description, on=['iso_country'], how='left_anti')

In [11]:
df1 = df1.withColumnRenamed("id1", 'c1').withColumnRenamed("name", "c2").withColumnRenamed("municipality",
                                                                                           "c3").withColumnRenamed(
    "iso_country", "c4")

## 4. Сохраняем данные в таблицу

In [13]:
df1.write.saveAsTable('hive_table')

                                                                                

## 5. Получаем данные из БД sql-запросом

In [14]:
ndf = spark.sql("""
select * from hive_table
""")

In [15]:
ndf.show(10)

+---+------+--------------------+----------------+
| c4|    id|                  c2|              c3|
+---+------+--------------------+----------------+
| PR|  6680|     Cuylers Airport|       Vega Baja|
| MH|  4650|      Utirik Airport|   Utirik Island|
| PR|349103|Advanced Public H...|         Isabela|
| MP|  4777|    Dynasty Heliport|San Jose, Tinian|
| PR|  7793|Villamil-304 Ponc...|        San Juan|
| PR|330679|Emp. Coco Beach G...|      Rio Grande|
| PR|  8626|San Patricio Heli...|        Guaynabo|
| PR|  9222|La Concepcion Hos...|      San German|
| PR| 46121|Caribbean Constr ...|        Guaynabo|
| PR|346082|PR Police-Ponce A...|           Ponce|
+---+------+--------------------+----------------+
only showing top 10 rows

