### unzipping file

In [0]:
dbutils.fs.ls('/FileStore/tables/')

Out[16]: [FileInfo(path='dbfs:/FileStore/tables/accounts.zip', name='accounts.zip', size=5297592),
 FileInfo(path='dbfs:/FileStore/tables/devicestatus.zip', name='devicestatus.zip', size=23873574),
 FileInfo(path='dbfs:/FileStore/tables/json_activations.zip', name='json_activations.zip', size=8411369),
 FileInfo(path='dbfs:/FileStore/tables/letter_frequencies.txt', name='letter_frequencies.txt', size=2593894),
 FileInfo(path='dbfs:/FileStore/tables/logs.zip', name='logs.zip', size=18168065),
 FileInfo(path='dbfs:/FileStore/tables/webpage.zip', name='webpage.zip', size=1582)]

#### Copying uploaded file to temporary storage

In [0]:
dbutils.fs.cp("dbfs:/FileStore/tables/webpage.zip", "file:/tmp")

Out[17]: True

#### Unzipping devicestatus.zip file into local temp folder

In [0]:
%sh
unzip -d /tmp/ /tmp/webpage.zip

Archive:  /tmp/webpage.zip
   creating: /tmp/webpage/
  inflating: /tmp/webpage/part-m-00001  
  inflating: /tmp/webpage/part-m-00003  
  inflating: /tmp/webpage/part-m-00000  
  inflating: /tmp/webpage/part-m-00002  


In [0]:
dbutils.fs.mkdirs('dbfs:/FileStore/webpage')

Out[19]: True

#### Moving the txt file to the dbfs FileStore

In [0]:
dbutils.fs.mv("file:/tmp/webpage", "dbfs:/FileStore/webpage", True)

Out[20]: True

In [0]:
dbutils.fs.head('/FileStore/webpage/part-m-00000')

Out[22]: '1\tsorrento_f00l_sales.html\ttheme.css,code.js,sorrento_f00l.jpg\n2\ttitanic_2100_sales.html\ttheme.css,code.js,titanic_2100.jpg\n3\tmeetoo_3.0_sales.html\ttheme.css,code.js,meetoo_3.0.jpg\n4\tmeetoo_3.1_sales.html\ttheme.css,code.js,meetoo_3.1.jpg\n5\tifruit_1_sales.html\ttheme.css,code.js,ifruit_1.jpg\n6\tifruit_3_sales.html\ttheme.css,code.js,ifruit_3.jpg\n7\tifruit_2_sales.html\ttheme.css,code.js,ifruit_2.jpg\n8\tifruit_5_sales.html\ttheme.css,code.js,ifruit_5.jpg\n9\ttitanic_1000_sales.html\ttheme.css,code.js,titanic_1000.jpg\n10\tmeetoo_1.0_sales.html\ttheme.css,code.js,meetoo_1.0.jpg\n11\tsorrento_f21l_sales.html\ttheme.css,code.js,sorrento_f21l.jpg\n12\tifruit_4_sales.html\ttheme.css,code.js,ifruit_4.jpg\n13\tsorrento_f23l_sales.html\ttheme.css,code.js,sorrento_f23l.jpg\n'

In [0]:
for line in dbutils.fs.head('/FileStore/webpage/part-m-00000').splitlines():
    print(line)

1	sorrento_f00l_sales.html	theme.css,code.js,sorrento_f00l.jpg
2	titanic_2100_sales.html	theme.css,code.js,titanic_2100.jpg
3	meetoo_3.0_sales.html	theme.css,code.js,meetoo_3.0.jpg
4	meetoo_3.1_sales.html	theme.css,code.js,meetoo_3.1.jpg
5	ifruit_1_sales.html	theme.css,code.js,ifruit_1.jpg
6	ifruit_3_sales.html	theme.css,code.js,ifruit_3.jpg
7	ifruit_2_sales.html	theme.css,code.js,ifruit_2.jpg
8	ifruit_5_sales.html	theme.css,code.js,ifruit_5.jpg
9	titanic_1000_sales.html	theme.css,code.js,titanic_1000.jpg
10	meetoo_1.0_sales.html	theme.css,code.js,meetoo_1.0.jpg
11	sorrento_f21l_sales.html	theme.css,code.js,sorrento_f21l.jpg
12	ifruit_4_sales.html	theme.css,code.js,ifruit_4.jpg
13	sorrento_f23l_sales.html	theme.css,code.js,sorrento_f23l.jpg


### number 3 (a) createDataFrame()

In [0]:
from pyspark.sql.types import *

mySchema = StructType ([StructField ("index", IntegerType()),
                        StructField ("webpage", StringType()),
                        StructField ("associated_files", StringType())
                       ])

myRDD = sc.textFile("/FileStore/webpage/*"). \
        map(lambda line: line.split("\t")). \
        map(lambda values: [int(values[0]), values[1], values[2]])

webpages = spark.createDataFrame(myRDD, mySchema)

webpages.show(truncate = False)

+-----+-------------------------------+------------------------------------------+
|index|webpage                        |associated_files                          |
+-----+-------------------------------+------------------------------------------+
|1    |sorrento_f00l_sales.html       |theme.css,code.js,sorrento_f00l.jpg       |
|2    |titanic_2100_sales.html        |theme.css,code.js,titanic_2100.jpg        |
|3    |meetoo_3.0_sales.html          |theme.css,code.js,meetoo_3.0.jpg          |
|4    |meetoo_3.1_sales.html          |theme.css,code.js,meetoo_3.1.jpg          |
|5    |ifruit_1_sales.html            |theme.css,code.js,ifruit_1.jpg            |
|6    |ifruit_3_sales.html            |theme.css,code.js,ifruit_3.jpg            |
|7    |ifruit_2_sales.html            |theme.css,code.js,ifruit_2.jpg            |
|8    |ifruit_5_sales.html            |theme.css,code.js,ifruit_5.jpg            |
|9    |titanic_1000_sales.html        |theme.css,code.js,titanic_1000.jpg        |
|10 

### number 3 (b) spark.read.csv()

In [0]:
webpages2 = spark.read.options(delimiter ="\t").csv('/FileStore/webpage/*')
webpages2.show(truncate = False)

+---+-------------------------------+------------------------------------------+
|_c0|_c1                            |_c2                                       |
+---+-------------------------------+------------------------------------------+
|14 |titanic_2200_sales.html        |theme.css,code.js,titanic_2200.jpg        |
|15 |ronin_novelty_note_1_sales.html|theme.css,code.js,ronin_novelty_note_1.jpg|
|16 |titanic_2500_sales.html        |theme.css,code.js,titanic_2500.jpg        |
|17 |ronin_novelty_note_3_sales.html|theme.css,code.js,ronin_novelty_note_3.jpg|
|18 |ronin_novelty_note_2_sales.html|theme.css,code.js,ronin_novelty_note_2.jpg|
|19 |ronin_novelty_note_4_sales.html|theme.css,code.js,ronin_novelty_note_4.jpg|
|20 |ifruit_3a_sales.html           |theme.css,code.js,ifruit_3a.jpg           |
|21 |titanic_2300_sales.html        |theme.css,code.js,titanic_2300.jpg        |
|22 |sorrento_f24l_sales.html       |theme.css,code.js,sorrento_f24l.jpg       |
|23 |sorrento_f20l_sales.htm

### number 3 (c) .toDF()

In [0]:
from pyspark.sql.types import Row

def f(x):
    d = {}
    for i in range(len(x)):
        d[str(i)] = x[i]
    return d

webpages3My = myRDD.map(lambda x: Row(**f(x))).toDF()
webpages3My.show(truncate = False)

+---+-------------------------------+------------------------------------------+
|0  |1                              |2                                         |
+---+-------------------------------+------------------------------------------+
|1  |sorrento_f00l_sales.html       |theme.css,code.js,sorrento_f00l.jpg       |
|2  |titanic_2100_sales.html        |theme.css,code.js,titanic_2100.jpg        |
|3  |meetoo_3.0_sales.html          |theme.css,code.js,meetoo_3.0.jpg          |
|4  |meetoo_3.1_sales.html          |theme.css,code.js,meetoo_3.1.jpg          |
|5  |ifruit_1_sales.html            |theme.css,code.js,ifruit_1.jpg            |
|6  |ifruit_3_sales.html            |theme.css,code.js,ifruit_3.jpg            |
|7  |ifruit_2_sales.html            |theme.css,code.js,ifruit_2.jpg            |
|8  |ifruit_5_sales.html            |theme.css,code.js,ifruit_5.jpg            |
|9  |titanic_1000_sales.html        |theme.css,code.js,titanic_1000.jpg        |
|10 |meetoo_1.0_sales.html  

### class approach

In [0]:
myRDD = sc.textFile("/FileStore/webpage/*")
colRDD = myRDD.map(lambda line: line.split('\t'))
webpages3 = colRDD.toDF(["num", "webpage", "associated_files"])
webpages3.show(truncate = False)

+---+-------------------------------+------------------------------------------+
|num|webpage                        |associated_files                          |
+---+-------------------------------+------------------------------------------+
|1  |sorrento_f00l_sales.html       |theme.css,code.js,sorrento_f00l.jpg       |
|2  |titanic_2100_sales.html        |theme.css,code.js,titanic_2100.jpg        |
|3  |meetoo_3.0_sales.html          |theme.css,code.js,meetoo_3.0.jpg          |
|4  |meetoo_3.1_sales.html          |theme.css,code.js,meetoo_3.1.jpg          |
|5  |ifruit_1_sales.html            |theme.css,code.js,ifruit_1.jpg            |
|6  |ifruit_3_sales.html            |theme.css,code.js,ifruit_3.jpg            |
|7  |ifruit_2_sales.html            |theme.css,code.js,ifruit_2.jpg            |
|8  |ifruit_5_sales.html            |theme.css,code.js,ifruit_5.jpg            |
|9  |titanic_1000_sales.html        |theme.css,code.js,titanic_1000.jpg        |
|10 |meetoo_1.0_sales.html  

In [0]:
webpages.printSchema()
webpages2.printSchema()
webpages3.printSchema()
webpages3My.printSchema()

root
 |-- index: integer (nullable = true)
 |-- webpage: string (nullable = true)
 |-- associated_files: string (nullable = true)

root
 |-- index: integer (nullable = true)
 |-- webpage: string (nullable = true)
 |-- associated_files: string (nullable = true)

root
 |-- num: string (nullable = true)
 |-- webpage: string (nullable = true)
 |-- associated_files: string (nullable = true)

root
 |-- 0: long (nullable = true)
 |-- 1: string (nullable = true)
 |-- 2: string (nullable = true)



### column rename section

In [0]:
webpages2 = webpages.withColumnRenamed("_c0", "num") \
            .withColumnRenamed("_c1", "webpage") \
            .withColumnRenamed("_c2", "associated_files")
#namedW = webpages.withColumn

In [0]:
webpages2.printSchema()

root
 |-- index: integer (nullable = true)
 |-- webpage: string (nullable = true)
 |-- associated_files: string (nullable = true)



### cloumn rename julhas approach

In [0]:
webpages2 = webpages2.withColumn("num", webpages2.num.cast("int"))
webpages3 = webpages3.withColumn("num", webpages3.num.cast("int"))

In [0]:
from pyspark.sql.types import IntegerType,BooleanType,DateType
# Convert String to Integer Type
webpages2ChangedColumnCast = webpages2.withColumn("num",webpages2._c0.cast(IntegerType())) \
    
#df.withColumn("age",df.age.cast('int'))
#df.withColumn("age",df.age.cast('integer'))


In [0]:
webpages2ChangedColumnCast.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- num: integer (nullable = true)



In [0]:
#from pyspark.sql.functions import col
#newDF = webpages.select(col("webpage"),col("associated_files")).show(truncate = False)
newDF = webpages.select(webpages.webpage, webpages.associated_files)
newDF.show()

+--------------------+--------------------+
|             webpage|    associated_files|
+--------------------+--------------------+
|sorrento_f00l_sal...|theme.css,code.js...|
|titanic_2100_sale...|theme.css,code.js...|
|meetoo_3.0_sales....|theme.css,code.js...|
|meetoo_3.1_sales....|theme.css,code.js...|
| ifruit_1_sales.html|theme.css,code.js...|
| ifruit_3_sales.html|theme.css,code.js...|
| ifruit_2_sales.html|theme.css,code.js...|
| ifruit_5_sales.html|theme.css,code.js...|
|titanic_1000_sale...|theme.css,code.js...|
|meetoo_1.0_sales....|theme.css,code.js...|
|sorrento_f21l_sal...|theme.css,code.js...|
| ifruit_4_sales.html|theme.css,code.js...|
|sorrento_f23l_sal...|theme.css,code.js...|
|titanic_2200_sale...|theme.css,code.js...|
|ronin_novelty_not...|theme.css,code.js...|
|titanic_2500_sale...|theme.css,code.js...|
|ronin_novelty_not...|theme.css,code.js...|
|ronin_novelty_not...|theme.css,code.js...|
|ronin_novelty_not...|theme.css,code.js...|
|ifruit_3a_sales.html|theme.css,

In [0]:
pairRDD = newDF.rdd.map(lambda x: (x.webpage, x.associated_files))
pairRDD.take(2)


Out[73]: [('sorrento_f00l_sales.html', 'theme.css,code.js,sorrento_f00l.jpg'),
 ('titanic_2100_sales.html', 'theme.css,code.js,titanic_2100.jpg')]

In [0]:
finalRDD = pairRDD.flatMapValues(lambda value: value.split(','))

In [0]:
finalRDD.take(10)

Out[76]: [('sorrento_f00l_sales.html', 'theme.css'),
 ('sorrento_f00l_sales.html', 'code.js'),
 ('sorrento_f00l_sales.html', 'sorrento_f00l.jpg'),
 ('titanic_2100_sales.html', 'theme.css'),
 ('titanic_2100_sales.html', 'code.js'),
 ('titanic_2100_sales.html', 'titanic_2100.jpg'),
 ('meetoo_3.0_sales.html', 'theme.css'),
 ('meetoo_3.0_sales.html', 'code.js'),
 ('meetoo_3.0_sales.html', 'meetoo_3.0.jpg'),
 ('meetoo_3.1_sales.html', 'theme.css')]

In [0]:
newDF2 = spark.createDataFrame(finalRDD)

In [0]:
newDF2.show()

+--------------------+-----------------+
|                  _1|               _2|
+--------------------+-----------------+
|sorrento_f00l_sal...|        theme.css|
|sorrento_f00l_sal...|          code.js|
|sorrento_f00l_sal...|sorrento_f00l.jpg|
|titanic_2100_sale...|        theme.css|
|titanic_2100_sale...|          code.js|
|titanic_2100_sale...| titanic_2100.jpg|
|meetoo_3.0_sales....|        theme.css|
|meetoo_3.0_sales....|          code.js|
|meetoo_3.0_sales....|   meetoo_3.0.jpg|
|meetoo_3.1_sales....|        theme.css|
|meetoo_3.1_sales....|          code.js|
|meetoo_3.1_sales....|   meetoo_3.1.jpg|
| ifruit_1_sales.html|        theme.css|
| ifruit_1_sales.html|          code.js|
| ifruit_1_sales.html|     ifruit_1.jpg|
| ifruit_3_sales.html|        theme.css|
| ifruit_3_sales.html|          code.js|
| ifruit_3_sales.html|     ifruit_3.jpg|
| ifruit_2_sales.html|        theme.css|
| ifruit_2_sales.html|          code.js|
+--------------------+-----------------+
only showing top

In [0]:
newDF2.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)



In [0]:
newDF2 = newDF2.withColumnRenamed("_1", "webpage_number") \
                .withColumnRenamed("_2", "associated_files")

In [0]:
newDF2.show()

+--------------------+-----------------+
|      webpage_number| associated_files|
+--------------------+-----------------+
|sorrento_f00l_sal...|        theme.css|
|sorrento_f00l_sal...|          code.js|
|sorrento_f00l_sal...|sorrento_f00l.jpg|
|titanic_2100_sale...|        theme.css|
|titanic_2100_sale...|          code.js|
|titanic_2100_sale...| titanic_2100.jpg|
|meetoo_3.0_sales....|        theme.css|
|meetoo_3.0_sales....|          code.js|
|meetoo_3.0_sales....|   meetoo_3.0.jpg|
|meetoo_3.1_sales....|        theme.css|
|meetoo_3.1_sales....|          code.js|
|meetoo_3.1_sales....|   meetoo_3.1.jpg|
| ifruit_1_sales.html|        theme.css|
| ifruit_1_sales.html|          code.js|
| ifruit_1_sales.html|     ifruit_1.jpg|
| ifruit_3_sales.html|        theme.css|
| ifruit_3_sales.html|          code.js|
| ifruit_3_sales.html|     ifruit_3.jpg|
| ifruit_2_sales.html|        theme.css|
| ifruit_2_sales.html|          code.js|
+--------------------+-----------------+
only showing top

In [0]:
newDF2.printSchema()

root
 |-- webpage_number: string (nullable = true)
 |-- associated_files: string (nullable = true)



In [0]:
type(newDF2)

Out[88]: pyspark.sql.dataframe.DataFrame

In [0]:
newDF2.write.save("/FileStore/webpage_files")

In [0]:
dbutils.fs.ls("/FileStore/webpage_files")

Out[85]: [FileInfo(path='dbfs:/FileStore/webpage_files/_delta_log/', name='_delta_log/', size=0),
 FileInfo(path='dbfs:/FileStore/webpage_files/part-00000-5dd8adb0-1ed3-485b-8682-97b760a92e66-c000.snappy.parquet', name='part-00000-5dd8adb0-1ed3-485b-8682-97b760a92e66-c000.snappy.parquet', size=1253),
 FileInfo(path='dbfs:/FileStore/webpage_files/part-00001-72628ce1-4ce1-4139-b0e2-d2cbb92d41cc-c000.snappy.parquet', name='part-00001-72628ce1-4ce1-4139-b0e2-d2cbb92d41cc-c000.snappy.parquet', size=1269),
 FileInfo(path='dbfs:/FileStore/webpage_files/part-00002-ff28fc95-4e9b-4acd-aef5-bc322a6f953e-c000.snappy.parquet', name='part-00002-ff28fc95-4e9b-4acd-aef5-bc322a6f953e-c000.snappy.parquet', size=1307),
 FileInfo(path='dbfs:/FileStore/webpage_files/part-00003-2a21cfe7-0f5b-4089-974b-f52ba5347692-c000.snappy.parquet', name='part-00003-2a21cfe7-0f5b-4089-974b-f52ba5347692-c000.snappy.parquet', size=1278)]

### Lab session 2

In [0]:
newDF2.createOrReplaceTempView('webpages')

In [0]:
newDF2.show()

+--------------------+-----------------+
|      webpage_number| associated_files|
+--------------------+-----------------+
|sorrento_f00l_sal...|        theme.css|
|sorrento_f00l_sal...|          code.js|
|sorrento_f00l_sal...|sorrento_f00l.jpg|
|titanic_2100_sale...|        theme.css|
|titanic_2100_sale...|          code.js|
|titanic_2100_sale...| titanic_2100.jpg|
|meetoo_3.0_sales....|        theme.css|
|meetoo_3.0_sales....|          code.js|
|meetoo_3.0_sales....|   meetoo_3.0.jpg|
|meetoo_3.1_sales....|        theme.css|
|meetoo_3.1_sales....|          code.js|
|meetoo_3.1_sales....|   meetoo_3.1.jpg|
| ifruit_1_sales.html|        theme.css|
| ifruit_1_sales.html|          code.js|
| ifruit_1_sales.html|     ifruit_1.jpg|
| ifruit_3_sales.html|        theme.css|
| ifruit_3_sales.html|          code.js|
| ifruit_3_sales.html|     ifruit_3.jpg|
| ifruit_2_sales.html|        theme.css|
| ifruit_2_sales.html|          code.js|
+--------------------+-----------------+
only showing top

In [0]:
sparkDF = spark.sql("SELECT * FROM webpages WHERE webpage_number LIKE 's%'")


In [0]:
sparkDF.show(truncate=False)

+------------------------+-----------------+
|webpage_number          |associated_files |
+------------------------+-----------------+
|sorrento_f00l_sales.html|theme.css        |
|sorrento_f00l_sales.html|code.js          |
|sorrento_f00l_sales.html|sorrento_f00l.jpg|
|sorrento_f21l_sales.html|theme.css        |
|sorrento_f21l_sales.html|code.js          |
|sorrento_f21l_sales.html|sorrento_f21l.jpg|
|sorrento_f23l_sales.html|theme.css        |
|sorrento_f23l_sales.html|code.js          |
|sorrento_f23l_sales.html|sorrento_f23l.jpg|
|sorrento_f24l_sales.html|theme.css        |
|sorrento_f24l_sales.html|code.js          |
|sorrento_f24l_sales.html|sorrento_f24l.jpg|
|sorrento_f20l_sales.html|theme.css        |
|sorrento_f20l_sales.html|code.js          |
|sorrento_f20l_sales.html|sorrento_f20l.jpg|
|sorrento_f32l_sales.html|theme.css        |
|sorrento_f32l_sales.html|code.js          |
|sorrento_f32l_sales.html|sorrento_f32l.jpg|
|sorrento_f22l_sales.html|theme.css        |
|sorrento_

In [0]:
spark.sql("CREATE OR REPLACE TEMP VIEW s_webpages AS SELECT * FROM webpages WHERE webpage_number LIKE 's%'")

++
||
++
++



In [0]:
spark.sql("select * from s_webpages").show()

+--------------------+-----------------+
|      webpage_number| associated_files|
+--------------------+-----------------+
|sorrento_f00l_sal...|        theme.css|
|sorrento_f00l_sal...|          code.js|
|sorrento_f00l_sal...|sorrento_f00l.jpg|
|sorrento_f21l_sal...|        theme.css|
|sorrento_f21l_sal...|          code.js|
|sorrento_f21l_sal...|sorrento_f21l.jpg|
|sorrento_f23l_sal...|        theme.css|
|sorrento_f23l_sal...|          code.js|
|sorrento_f23l_sal...|sorrento_f23l.jpg|
|sorrento_f24l_sal...|        theme.css|
|sorrento_f24l_sal...|          code.js|
|sorrento_f24l_sal...|sorrento_f24l.jpg|
|sorrento_f20l_sal...|        theme.css|
|sorrento_f20l_sal...|          code.js|
|sorrento_f20l_sal...|sorrento_f20l.jpg|
|sorrento_f32l_sal...|        theme.css|
|sorrento_f32l_sal...|          code.js|
|sorrento_f32l_sal...|sorrento_f32l.jpg|
|sorrento_f22l_sal...|        theme.css|
|sorrento_f22l_sal...|          code.js|
+--------------------+-----------------+
only showing top

### number 5 lab-(2nd session)

In [0]:
spark.sql('show databases').show()

+------------+
|databaseName|
+------------+
|     default|
+------------+



In [0]:
spark.catalog.listTables()

Out[126]: [Table(name='explode_webpages', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='newdf2', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='s_webpages', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='spliteg', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='spliteg2', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='webpages', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [0]:
spark.sql("CREATE OR REPLACE TEMP VIEW spliteg AS SELECT split ( webpage_number , '_') as split_wp , associated_files FROM webpages ")

Out[115]: DataFrame[]

In [0]:
spark.sql("select * from spliteg").show(truncate=False)

+----------------------------+-----------------+
|split_wp                    |associated_files |
+----------------------------+-----------------+
|[sorrento, f00l, sales.html]|theme.css        |
|[sorrento, f00l, sales.html]|code.js          |
|[sorrento, f00l, sales.html]|sorrento_f00l.jpg|
|[titanic, 2100, sales.html] |theme.css        |
|[titanic, 2100, sales.html] |code.js          |
|[titanic, 2100, sales.html] |titanic_2100.jpg |
|[meetoo, 3.0, sales.html]   |theme.css        |
|[meetoo, 3.0, sales.html]   |code.js          |
|[meetoo, 3.0, sales.html]   |meetoo_3.0.jpg   |
|[meetoo, 3.1, sales.html]   |theme.css        |
|[meetoo, 3.1, sales.html]   |code.js          |
|[meetoo, 3.1, sales.html]   |meetoo_3.1.jpg   |
|[ifruit, 1, sales.html]     |theme.css        |
|[ifruit, 1, sales.html]     |code.js          |
|[ifruit, 1, sales.html]     |ifruit_1.jpg     |
|[ifruit, 3, sales.html]     |theme.css        |
|[ifruit, 3, sales.html]     |code.js          |
|[ifruit, 3, sales.h

In [0]:
spark.sql("CREATE OR REPLACE TEMP VIEW spliteg2 AS SELECT split ( associated_files, 'c') as split_af FROM spliteg ")

Out[135]: DataFrame[]

In [0]:
spark.sql("select * from spliteg2").show(truncate=False)

+-------------------+
|split_af           |
+-------------------+
|[theme., ss]       |
|[, ode.js]         |
|[sorrento_f00l.jpg]|
|[theme., ss]       |
|[, ode.js]         |
|[titani, _2100.jpg]|
|[theme., ss]       |
|[, ode.js]         |
|[meetoo_3.0.jpg]   |
|[theme., ss]       |
|[, ode.js]         |
|[meetoo_3.1.jpg]   |
|[theme., ss]       |
|[, ode.js]         |
|[ifruit_1.jpg]     |
|[theme., ss]       |
|[, ode.js]         |
|[ifruit_3.jpg]     |
|[theme., ss]       |
|[, ode.js]         |
+-------------------+
only showing top 20 rows



explode

In [0]:
spark.sql("CREATE OR REPLACE TEMP VIEW explode_webpages AS SELECT explode(split_af) FROM spliteg2")


Out[137]: DataFrame[]

In [0]:
spark.sql('select * from explode_webpages').show()

+-----------------+
|              col|
+-----------------+
|           theme.|
|               ss|
|                 |
|           ode.js|
|sorrento_f00l.jpg|
|           theme.|
|               ss|
|                 |
|           ode.js|
|           titani|
|        _2100.jpg|
|           theme.|
|               ss|
|                 |
|           ode.js|
|   meetoo_3.0.jpg|
|           theme.|
|               ss|
|                 |
|           ode.js|
+-----------------+
only showing top 20 rows

