### create dataframe from tuples

In [0]:
list = [('Ankit',25),('Jalfaizy',22),('saurabh',20),('Bala',26)]
list

In [0]:
schemaPeople = sqlContext.createDataFrame(list)
type(schemaPeople)

### read files

In [0]:
# Load the training CSV: the first line is treated as a header and the
# column types are inferred from the data.
train_reader = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
sql_train = train_reader.load("/FileStore/tables/u5_6_3/train_u5_6_3.csv")

In [0]:
# Load the test CSV with the same reader options as the training file
# (header row present, column types inferred).
test_reader = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
sql_test = test_reader.load("/FileStore/tables/u5_6_3/test_u5_6_3.csv")

### investigate

In [0]:
# Print the schema (column names and types) of the training DataFrame.
sql_train.printSchema()

In [0]:
# Print the schema (column names and types) of the test DataFrame.
sql_test.printSchema()

In [0]:
# Fetch the first 5 rows of the test DataFrame (returned as Row objects).
sql_test.head(5)

In [0]:
# Check the Python type of the object produced by the CSV reader.
type(sql_test)

In [0]:
# Print the first 2 training rows; truncate=True shortens long cell values.
sql_train.show(2, truncate=True)

In [0]:
# Row counts of the training and test DataFrames, displayed as a pair.
train_rows = sql_train.count()
test_rows = sql_test.count()
train_rows, test_rows

### get column info

In [0]:
# Number of training columns followed by their names.
train_cols = sql_train.columns
len(train_cols), train_cols

In [0]:
# Number of test columns followed by their names.
test_cols = sql_test.columns
len(test_cols), test_cols

### statistics

In [0]:
# Summary statistics for the training data; describe() is called without
# a column list here.
sql_train.describe().show()

In [0]:
# Restrict the summary statistics to the Purchase and Product_ID columns.
sql_train.describe(['Purchase' , 'Product_ID']).show()

### select individual columns

In [0]:
# Show the first 5 rows of only the User_ID and Age columns.
sql_train.select('User_ID' , 'Age').show(5)

### unique values

In [0]:
# Count the distinct Product_ID values in each dataset and display both.
train_product_count = sql_train.select('Product_ID').distinct().count()
test_product_count = sql_test.select('Product_ID').distinct().count()
train_product_count, test_product_count

###### `sql_test` has `Product_ID` categories that are not in `sql_train`

In [0]:
# Product_IDs that occur in the test data but not in the training data.
test_product_ids = sql_test.select('Product_ID')
train_product_ids = sql_train.select('Product_ID')
dif_product = test_product_ids.subtract(train_product_ids)
# How many distinct IDs are missing from the training data.
dif_product.distinct().count()

### pair wise frequency
###### `groupby` gives the same values, but the result has a different structure

In [0]:
# Pair-wise frequency table: one count per (Age, Gender) combination.
sql_train.crosstab('Age' , 'Gender').show()

### drop duplicates
###### demonstrate by picking only 2 columns, then dropping

In [0]:
# Project down to Age and Gender, then keep one row per distinct pair.
sql_train.select('Age' , 'Gender').dropDuplicates().show()

### handle `null` values
###### `dropna` has args `how` `thresh` `subset`
###### this example drops row if any column has `null`

In [0]:
# Compare the total row count with the count left after dropping rows
# that contain nulls.
total_rows = sql_train.count()
rows_without_nulls = sql_train.dropna().count()
total_rows, rows_without_nulls

###### `fillna`
###### this example replaces `null` with `-1` (a numeric fill value applies to numeric columns; string columns are left unchanged)

In [0]:
# Fill nulls with -1 and show the first 2 rows; sql_train itself is not
# reassigned, so the fill only affects this displayed result.
sql_train.fillna(-1).show(2)

### filter

In [0]:
# Keep only the rows whose Purchase exceeds 15000.
large_purchase = sql_train.filter(sql_train.Purchase > 15000)
# show() only prints and returns None, so it is called on its own instead
# of inside a tuple; the count is then the cell's single displayed value
# rather than `(count, None)`.
large_purchase.show(5)
large_purchase.count()

### `groupby` and `agg`
###### `agg` functions: `sum` , `max` , `min` , `mean`

In [0]:
# Mean Purchase for each Age group, using the dict form of agg
# ({column: aggregate function name}).
sql_train.groupby('Age').agg({'Purchase' : 'mean'}).show()

In [0]:
# Number of rows in each Age group.
sql_train.groupby('Age').count().show()

In [0]:
# Row counts per (Age, Gender) pair. These are the same numbers as the
# crosstab of Age and Gender, but laid out as one row per pair instead of
# one column per Gender value.
sql_train.groupby('Age' , 'Gender').count().show()

### `sample`
###### 1st example: sample size is 20%, random seed is 42, `withReplacement=False`

In [0]:
# Draw a 20% sample without replacement, seeded with 42.
t1 = sql_train.sample(withReplacement=False, fraction=0.2, seed=42)
# Display the sample's type and its row count.
t1_rows = t1.count()
type(t1), t1_rows

In [0]:
# Same 20% sample without replacement, but seeded with 43 instead of 42.
t2 = sql_train.sample(withReplacement=False, fraction=0.2, seed=43)
# Display the sample's type and its row count.
t2_rows = t2.count()
type(t2), t2_rows

### apply function: `map`

In [0]:
# Variant that calls map directly on the DataFrame, left disabled
# (presumably because DataFrame.map is unavailable in the Spark version
# in use -- verify):
# sql_train.select('User_ID').map(lambda x:(x,1)).take(5)

# Go through the DataFrame's underlying RDD instead; each element `x` is a
# row of the one-column selection, paired here with the constant 1.
sql_train.select('User_ID').rdd.map(lambda x:(x,1)).take(5)

### sorting: `orderBy`

In [0]:
# Show the 5 rows with the largest Purchase (descending sort via the
# ascending flag).
sql_train.orderBy('Purchase', ascending=False).show(5)

In [0]:
# Same descending sort, expressed with a Column expression instead of the
# ascending flag.
sql_train.orderBy(sql_train.Purchase.desc()).show(5)

In [0]:
# Sort by Purchase descending, then by User_ID ascending within equal
# Purchase values; show the first 15 rows.
sql_train.orderBy(['Purchase' , 'User_ID'], ascending=[False,True]).show(15)

### add column: `withColumn`

In [0]:
# Add a derived column holding half of each Purchase value, then show it
# next to the original column for the first 5 rows.
half_purchase = sql_train.Purchase / 2.0
with_half = sql_train.withColumn('Purchase_new', half_purchase)
with_half.select('Purchase', 'Purchase_new').show(5)

### delete column: `drop`

In [0]:
# List the columns that remain after dropping Product_Category_3. The
# result is not assigned, so sql_test itself keeps the column.
sql_test.drop('Product_Category_3').columns

### remove rows with particular value in particular column
###### example: remove from test data `Product_ID` values not in train data
###### find the product IDs

In [0]:
# Product_IDs that appear in the test data but not in the training data.
dif_cat = sql_test.select('Product_ID').subtract(sql_train.select('Product_ID'))
# show() only prints and returns None, so it is called on its own instead
# of inside a tuple; the count is then the cell's single displayed value
# rather than `(count, None)`.
dif_cat.show(10)
dif_cat.count()

In [0]:
# Number of distinct Product_IDs in dif_cat.
dif_cat.distinct().count()

In [0]:
# Print up to 10 of the distinct Product_IDs in dif_cat.
dif_cat.distinct().show(10)

###### make `list` of unique categories in test data that are not in train data; values are strings

In [0]:
# Collect the distinct unseen Product_IDs to the driver as a plain Python
# list, taking the single field out of each row.
unique_missing = dif_cat.distinct()
not_found_cat = unique_missing.rdd.map(lambda row: row[0]).collect()
len(not_found_cat)

In [0]:
# Display the collected Product_IDs.
not_found_cat

###### create `udf`--user defined function
result of function is a string because `Product_ID` is a string

In [0]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

In [0]:
# Build the lookup once as a frozenset so the per-row membership test is a
# hash lookup instead of a scan over the whole `not_found_cat` list.
not_found_cat_set = frozenset(not_found_cat)

# UDF that maps any Product_ID found in `not_found_cat` to the sentinel
# string '-1' and passes every other value through unchanged. The return
# type is declared as StringType.
F1 = udf(lambda x: '-1' if x in not_found_cat_set else x, StringType())

###### apply `udf` to make new column

In [0]:
# Apply the UDF to Product_ID to create NEW_Product_ID, then keep only
# that new column.
with_new_id = sql_test.withColumn('NEW_Product_ID', F1(sql_test['Product_ID']))
test_new_col = with_new_id.select('NEW_Product_ID')

In [0]:
# Values of NEW_Product_ID that do not occur among the training Product_IDs.
new_dif_cat = test_new_col.select('NEW_Product_ID').subtract(sql_train.select('Product_ID'))
# show() only prints and returns None, so it is called on its own instead
# of inside a tuple; the count is then the cell's single displayed value
# rather than `(count, None)`.
new_dif_cat.show(10)
new_dif_cat.count()

###### `filter` on value in new column

In [0]:
# Row count before filtering, for comparison with the filtered count.
test_new_col.count()

In [0]:
# Column names of test_new_col.
test_new_col.columns

In [0]:
# Drop the rows whose NEW_Product_ID is the '-1' sentinel, then count
# what is left.
is_known_product = test_new_col.NEW_Product_ID != '-1'
filtered = test_new_col.filter(is_known_product)
filtered.count()

### run SQL queries
###### register DataFrame as a table
###### do SQL queries

In [0]:
# Register sql_train under the name 'train_table' so it can be queried
# with SQL.
# NOTE(review): newer Spark releases provide
# DataFrame.createOrReplaceTempView for this -- confirm the target Spark
# version before switching.
sqlContext.registerDataFrameAsTable(sql_train, 'train_table')

In [0]:
# Query the registered table with plain SQL and show 5 Product_ID values.
sqlContext.sql('select Product_ID from train_table').show(5)

In [0]:
# Remove the 'train_table' registration; sql_train itself is not touched.
sqlContext.dropTempTable('train_table')

In [0]:
# gives error since table no longer exists
#sqlContext.sql('select Product_ID from train_table').show(5)