In [73]:
# Load the package.json manifest into a DataFrame (schema is inferred).
# The multiLine option tells the JSON reader that one record may span several lines.
df = spark.read.option("multiLine", True).json("/home/mutwiri/hadoop/spark/data/package.json")

In [53]:
# List the top-level column names of the DataFrame.
df.columns

['description',
 'devDependencies',
 'homepage',
 'license',
 'maintainers',
 'name',
 'peerDependencies',
 'repository',
 'scripts',
 'version']

In [67]:
# Render the DataFrame as a text table; long cell values are shortened with "..." in the output.
df.show()

+--------------------+---------------+--------------------+-------+--------------------+---------+--------------------+--------------------+------------------+-------+
|         description|devDependencies|            homepage|license|         maintainers|     name|    peerDependencies|          repository|           scripts|version|
+--------------------+---------------+--------------------+-------+--------------------+---------+--------------------+--------------------+------------------+-------+
|Modern, fast Reac...|       {^8.0.1}|https://github.co...|    MIT|[{me@rreverser.co...|acorn-jsx|{^6.0.0 || ^7.0.0...|{git, https://git...|{node test/run.js}|  5.3.2|
+--------------------+---------------+--------------------+-------+--------------------+---------+--------------------+--------------------+------------------+-------+



In [55]:
# Print the schema as a tree, showing nested struct/array fields and their nullability.
df.printSchema()

root
 |-- description: string (nullable = true)
 |-- devDependencies: struct (nullable = true)
 |    |-- acorn: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- license: string (nullable = true)
 |-- maintainers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- web: string (nullable = true)
 |-- name: string (nullable = true)
 |-- peerDependencies: struct (nullable = true)
 |    |-- acorn: string (nullable = true)
 |-- repository: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- scripts: struct (nullable = true)
 |    |-- test: string (nullable = true)
 |-- version: string (nullable = true)



In [56]:
# describe() returns a new DataFrame of summary statistics, so evaluating it
# on its own only echoes the column names and types; chain .show() to see values.
df.describe()

DataFrame[summary: string, description: string, homepage: string, license: string, name: string, version: string]

In [57]:
# Show the summary statistics. Only the top-level string columns appear (the
# struct/array columns are left out), and mean/stddev come back as NULL for them.
df.describe().show()

+-------+--------------------+--------------------+-------+---------+-------+
|summary|         description|            homepage|license|     name|version|
+-------+--------------------+--------------------+-------+---------+-------+
|  count|                   1|                   1|      1|        1|      1|
|   mean|                NULL|                NULL|   NULL|     NULL|   NULL|
| stddev|                NULL|                NULL|   NULL|     NULL|   NULL|
|    min|Modern, fast Reac...|https://github.co...|    MIT|acorn-jsx|  5.3.2|
|    max|Modern, fast Reac...|https://github.co...|    MIT|acorn-jsx|  5.3.2|
+-------+--------------------+--------------------+-------+---------+-------+



## Defining Schema

In [60]:
from pyspark.sql.types import StructField, IntegerType, StringType, StructType, ArrayType

In [64]:
# Explicit schema for package.json. Every field is a nullable string, except
# `maintainers` (an array of structs) and the nested single-level structs.
# The order of the .add() calls fixes the column order of the resulting DataFrame.
main_struct = (
    StructType()
    .add("name", StringType(), True)
    .add("description", StringType(), True)
    .add("homepage", StringType(), True)
    .add("version", StringType(), True)
    .add(
        "maintainers",
        ArrayType(
            StructType()
            .add("name", StringType(), True)
            .add("email", StringType(), True)
            .add("web", StringType(), True)
        ),
    )
    .add(
        "repository",
        StructType()
        .add("type", StringType(), True)
        .add("url", StringType(), True),
    )
    .add("license", StringType(), True)
    .add("scripts", StructType().add("test", StringType(), True))
    .add("peerDependencies", StructType().add("acorn", StringType(), True))
    .add("devDependencies", StructType().add("acorn", StringType(), True))
)

In [77]:
# Read the same file again, this time applying the explicit schema instead of inferring one.
df2 = (
    spark.read
    .schema(main_struct)
    .option("multiLine", True)
    .json("/home/mutwiri/hadoop/spark/data/package.json")
)

In [78]:
# Preview the DataFrame read with the explicit schema; columns follow the order declared in main_struct.
df2.show()

+---------+--------------------+--------------------+-------+--------------------+--------------------+-------+------------------+--------------------+---------------+
|     name|         description|            homepage|version|         maintainers|          repository|license|           scripts|    peerDependencies|devDependencies|
+---------+--------------------+--------------------+-------+--------------------+--------------------+-------+------------------+--------------------+---------------+
|acorn-jsx|Modern, fast Reac...|https://github.co...|  5.3.2|[{Ingvar Stepanya...|{git, https://git...|    MIT|{node test/run.js}|{^6.0.0 || ^7.0.0...|       {^8.0.1}|
+---------+--------------------+--------------------+-------+--------------------+--------------------+-------+------------------+--------------------+---------------+



In [79]:
# Confirm that the applied schema has the field order and nesting declared in main_struct.
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- version: string (nullable = true)
 |-- maintainers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- web: string (nullable = true)
 |-- repository: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- license: string (nullable = true)
 |-- scripts: struct (nullable = true)
 |    |-- test: string (nullable = true)
 |-- peerDependencies: struct (nullable = true)
 |    |-- acorn: string (nullable = true)
 |-- devDependencies: struct (nullable = true)
 |    |-- acorn: string (nullable = true)



## Grabbing Data

In [80]:
# head() with no argument returns the first row as a single Row object;
# nested structs come back as nested Row objects and arrays as Python lists.
df2.head()

Row(name='acorn-jsx', description='Modern, fast React.js JSX parser', homepage='https://github.com/acornjs/acorn-jsx', version='5.3.2', maintainers=[Row(name='Ingvar Stepanyan', email='me@rreverser.com', web='http://rreverser.com/')], repository=Row(type='git', url='https://github.com/acornjs/acorn-jsx'), license='MIT', scripts=Row(test='node test/run.js'), peerDependencies=Row(acorn='^6.0.0 || ^7.0.0 || ^8.0.0'), devDependencies=Row(acorn='^8.0.1'))

In [81]:
# Indexing a DataFrame by column name yields a Column expression, not the column's values.
df["name"]

Column<'name'>

In [82]:
# Check the type of the object returned by column indexing.
type(df["name"])

pyspark.sql.column.Column

In [84]:
# select() returns a new DataFrame holding only the requested column; show() prints it.
df.select("name").show()


+---------+
|     name|
+---------+
|acorn-jsx|
+---------+



In [85]:
# Keep the single-column projection in a variable; select() builds a new DataFrame and leaves df unchanged.
# NOTE(review): upper-case names conventionally denote constants — consider a lower-case name if no later cell depends on it.
NAMES = df.select("name")

In [86]:
# Project several columns at once by passing multiple names to select().
df.select("name", "description").show()

+---------+--------------------+
|     name|         description|
+---------+--------------------+
|acorn-jsx|Modern, fast Reac...|
+---------+--------------------+



## Creating a New Column

In [89]:
# Add a column by copying an existing one: withColumn() returns a new DataFrame
# with "new_name" appended as the last column; df itself is not modified.

df.withColumn("new_name", df["name"]).show()

+--------------------+---------------+--------------------+-------+--------------------+---------+--------------------+--------------------+------------------+-------+---------+
|         description|devDependencies|            homepage|license|         maintainers|     name|    peerDependencies|          repository|           scripts|version| new_name|
+--------------------+---------------+--------------------+-------+--------------------+---------+--------------------+--------------------+------------------+-------+---------+
|Modern, fast Reac...|       {^8.0.1}|https://github.co...|    MIT|[{me@rreverser.co...|acorn-jsx|{^6.0.0 || ^7.0.0...|{git, https://git...|{node test/run.js}|  5.3.2|acorn-jsx|
+--------------------+---------------+--------------------+-------+--------------------+---------+--------------------+--------------------+------------------+-------+---------+



In [91]:
# Rename a column: withColumnRenamed(existing, new) returns a new DataFrame
# in which "license" appears as "licenses"; the column position is unchanged.
df.withColumnRenamed("license", "licenses").show()

+--------------------+---------------+--------------------+--------+--------------------+---------+--------------------+--------------------+------------------+-------+
|         description|devDependencies|            homepage|licenses|         maintainers|     name|    peerDependencies|          repository|           scripts|version|
+--------------------+---------------+--------------------+--------+--------------------+---------+--------------------+--------------------+------------------+-------+
|Modern, fast Reac...|       {^8.0.1}|https://github.co...|     MIT|[{me@rreverser.co...|acorn-jsx|{^6.0.0 || ^7.0.0...|{git, https://git...|{node test/run.js}|  5.3.2|
+--------------------+---------------+--------------------+--------+--------------------+---------+--------------------+--------------------+------------------+-------+



In [None]:
# Derive a new column from an expression over an existing column.
# The previous version of this cell used df["age"], but this DataFrame has no
# "age" column (see the df.columns output), so the expression could not be
# resolved and the cell would fail on a fresh Restart & Run All.
# Here a boolean column is computed from the existing "license" column instead.
df.withColumn("is_mit_licensed", df["license"] == "MIT").show()

In [103]:
# Register the DataFrame as a temporary view named "react_project" so it can be queried with spark.sql().
df.createOrReplaceTempView("react_project")

In [107]:
# Run a SQL query against the temporary view; the result is itself a DataFrame.
results = spark.sql("select * from react_project")

In [108]:
# Display the rows returned by the SQL query.
results.show()

+--------------------+---------------+--------------------+-------+--------------------+---------+--------------------+--------------------+------------------+-------+
|         description|devDependencies|            homepage|license|         maintainers|     name|    peerDependencies|          repository|           scripts|version|
+--------------------+---------------+--------------------+-------+--------------------+---------+--------------------+--------------------+------------------+-------+
|Modern, fast Reac...|       {^8.0.1}|https://github.co...|    MIT|[{me@rreverser.co...|acorn-jsx|{^6.0.0 || ^7.0.0...|{git, https://git...|{node test/run.js}|  5.3.2|
+--------------------+---------------+--------------------+-------+--------------------+---------+--------------------+--------------------+------------------+-------+



In [109]:
# Persist the DataFrame as a table named "react"; "overwrite" mode replaces
# an existing table of that name instead of raising an error.
df.write.mode("overwrite").saveAsTable("react")


24/01/04 06:56:51 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/01/04 06:56:51 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/01/04 06:57:16 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/01/04 06:57:16 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore mutwiri@127.0.1.1
24/01/04 06:57:16 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
24/01/04 06:57:22 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
24/01/04 06:57:22 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
24/01/04 06:57:22 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/01/04 06:57:22 WARN H

## Creating a DataFrame from an RDD

In [124]:
# Read the text file as an RDD of lines, asking for a minimum of 2 partitions.
text = sc.textFile(
    "/home/mutwiri/hadoop/spark/data/stack.txt",
    minPartitions=2,
)

In [125]:
# collect() returns every line of the RDD to the driver as a Python list;
# acceptable here because the file holds only three lines.
text.collect()

['ABSA BigDataDataEngineer Hadoop',
 'KOKO CLoudDataEngineer AWS',
 'DATASEAL DataArchitect Azure']

In [126]:
# Split every line on single spaces, producing one list of fields per record.
parts = text.map(lambda record: record.split(" "))

In [129]:
# top(2) returns the two LARGEST elements in descending order (lists compare
# element by element, so 'KOKO' > 'DATASEAL' > 'ABSA'), not the first two
# lines of the file. Use take(2) for a file-order preview.
parts.top(2)

[['KOKO', 'CLoudDataEngineer', 'AWS'], ['DATASEAL', 'DataArchitect', 'Azure']]

In [138]:
from pyspark.sql import Row

In [139]:
# Turn each list of fields into a Row with named fields: company, role, platform.
companies = parts.map(
    lambda fields: Row(
        company=fields[0],
        role=fields[1],
        platform=fields[2],
    )
)

In [140]:
# top(2) returns the two largest Row objects in descending order (here KOKO,
# then DATASEAL), not the first two records; use take(2) for a file-order preview.
companies.top(2)

[Row(company='KOKO', role='CLoudDataEngineer', platform='AWS'),
 Row(company='DATASEAL', role='DataArchitect', platform='Azure')]

In [141]:
# Build a DataFrame from the RDD of Row objects; the column names are taken from the Row fields.
df3 = spark.createDataFrame(companies)

In [143]:
# Display the DataFrame built from the RDD.
df3.show()

+--------+-------------------+--------+
| company|               role|platform|
+--------+-------------------+--------+
|    ABSA|BigDataDataEngineer|  Hadoop|
|    KOKO|  CLoudDataEngineer|     AWS|
|DATASEAL|      DataArchitect|   Azure|
+--------+-------------------+--------+



In [145]:
# Expose the DataFrame to Spark SQL as the temporary view "comps".
df3.createOrReplaceTempView("comps")

In [149]:
# Select only the company column through SQL; the result is a DataFrame.
all_companies = spark.sql("select company from comps")

In [150]:
# Display the single-column query result.
all_companies.show()

+--------+
| company|
+--------+
|    ABSA|
|    KOKO|
|DATASEAL|
+--------+



In [153]:
# Drop back to the RDD API: each element is a Row, so the company value is
# read as an attribute and prefixed with a fixed label.
all_comps = all_companies.rdd.map(lambda row: "name  " + row.company)

In [155]:
# Materialise the mapped RDD on the driver as a list of strings.
all_comps.collect()

                                                                                

['name  ABSA', 'name  KOKO', 'name  DATASEAL']

#### Using a StructType schema to build a DataFrame from an RDD

In [166]:
# Re-read the same text file, this time without specifying a partition count.
# NOTE(review): this rebinds `text`, which an earlier cell already assigned.
text = sc.textFile("/home/mutwiri/hadoop/spark/data/stack.txt")

In [167]:
# Split each line on runs of whitespace (split() with no argument) into a list of fields.
companies = text.map(lambda line: line.split())

In [168]:
# take(2) returns the first two elements in file order.
companies.take(2)

[['ABSA', 'BigDataDataEngineer', 'Hadoop'],
 ['KOKO', 'CLoudDataEngineer', 'AWS']]

In [172]:
# Convert each list of fields into a 3-tuple built from its first three entries.
companies_rdd = companies.map(
    lambda fields: (fields[0], fields[1], fields[2])
)

In [173]:
# Preview the first two tuples in file order.
companies_rdd.take(2)

[('ABSA', 'BigDataDataEngineer', 'Hadoop'),
 ('KOKO', 'CLoudDataEngineer', 'AWS')]

In [175]:
# Three nullable string columns, in the same order as the tuple elements.
schema = StructType(
    [StructField(column, StringType(), True) for column in ("name", "Role", "Stack")]
)

In [181]:
# Build the DataFrame from the tuple RDD, naming and typing the columns via the explicit schema.
companies_df = spark.createDataFrame(companies_rdd, schema=schema)

In [182]:
# Display the DataFrame; the column headers come from the explicit schema.
companies_df.show()

+--------+-------------------+------+
|    name|               Role| Stack|
+--------+-------------------+------+
|    ABSA|BigDataDataEngineer|Hadoop|
|    KOKO|  CLoudDataEngineer|   AWS|
|DATASEAL|      DataArchitect| Azure|
+--------+-------------------+------+

