
# PySpark Assignment

## RealEstate Housing Data

1. Extract: Load the data
 - Read all CSV files as text into a single RDD
2. Transform: Exploratory data analysis using rdd
 - Unique records count
 - Extract full address from the column url*
 - from http://www.zillow.com/homes/for_sale//homedetails/V-l-Buell-Newstead-NY-10001/2089629334_zpid/
 - to V-l-Buell-Newstead-NY-10001
 - Replace NA by zero in all numerical columns
 - Concatenate bedrooms* and bathrooms* into bed_bath_rooms*, e.g. 3b2bh
 - GroupBy zip,bed_bath_rooms* and avg, max, min
3. Load: Save analysis report
 - GroupBy zip,bed_bath_rooms* and avg, max, min, save as files


In [1]:
from random import random
import os
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a Spark session running on the "local" master, with the
# driver bind address and UI port set explicitly.
spark = (
    SparkSession.builder
    .master("local")
    .appName("SparkApplication")
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.ui.port", "4041")
    .getOrCreate()
)

In [3]:
sc = spark.sparkContext

### Read multiple CSV files into a single RDD

In [4]:
data=sc.textFile("2018-05-12_154616.csv,2018-05-12_155104.csv,2018-05-12_155435.csv")

In [5]:
# Capture the header row (first line of the data) so it can be filtered out below
header=data.first()

In [6]:
print(header)

address,city,state,zip,price,sqft,bedrooms,bathrooms,days_on_zillow,sale_type,url


In [7]:
# remove header
step1= data.filter(lambda line: line !=header)

In [8]:
step1.take(2)

['V/l Buell,Newstead,NY,10001,49000,NA,NA,NA,2,Lot/Land For Sale,http://www.zillow.com/homes/for_sale//homedetails/V-l-Buell-Newstead-NY-10001/2089629334_zpid/',
 '263 9th Ave # PHD,New York,NY,10001,4495000,2250,3,2,1,Condo For Sale,http://www.zillow.com/homes/for_sale//homedetails/263-9th-Ave-PHD-New-York-NY-10001/2103425273_zpid/']

### Total records count

In [9]:
step1.count()

1117

### Total unique records count

In [10]:
step1.distinct().count()

1064

In [11]:
### Filtering out duplicate records

In [12]:
step2=step1.distinct()

In [13]:
step2.count()

1064

### Extract full address from url

In [14]:
# Splitting each line by comma to form a list of fields
step3= step2.map(lambda line: line.split(","))

In [None]:
step3.first()

In [None]:
step3.take(2)

In [None]:
# Helper for extracting the address segment from a Zillow listing url
def extract_address(url):
    """Return the address segment of a listing url.

    The address is the path segment that follows ``homedetails``, e.g.
    ``.../homedetails/252-7th-Ave-APT-4L-New-York-NY-10001/55501383_zpid/``
    yields ``252-7th-Ave-APT-4L-New-York-NY-10001``. Anchoring on that
    segment makes the result independent of whether the url ends with ``/``.

    Args:
        url: Listing url as a string.

    Returns:
        The address segment as a string.

    Raises:
        IndexError: If the url has no ``homedetails`` segment and fewer than
            three ``/``-separated parts.
    """
    # Drop empty pieces produced by "//" and by a trailing "/".
    segments = [part for part in url.split("/") if part]
    if "homedetails" in segments:
        position = segments.index("homedetails") + 1
        if position < len(segments):
            return segments[position]
    # No usable anchor: fall back to the original positional lookup.
    return url.split("/")[-3]

In [None]:
extract_address("http://www.zillow.com/homes/for_sale//homedetails/252-7th-Ave-APT-4L-New-York-NY-10001/55501383_zpid/")

In [None]:
step4=step3.map(lambda x: (x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],x[8],x[9],x[10],extract_address(x[-1])))

In [None]:
step4.take(2)

In [None]:
step4.take(2)

### Replacing NA by 0 in all numerical columns

In [None]:
# Convert a single numeric field from string to int, mapping NA / unparseable values to 0
def replace_na_0(column_val):
    """Return ``column_val`` as an int, or 0 when it cannot be converted.

    The value is parsed as a float first, so decimal strings such as
    ``"2.5"`` are accepted and truncated toward zero.

    Args:
        column_val: Raw field value, typically a string such as ``"3"``,
            ``"2.5"`` or ``"NA"``.

    Returns:
        The integer value, or 0 for ``"NA"``, empty strings, ``None``,
        NaN, infinities and any other non-numeric input.
    """
    try:
        return int(float(column_val))
    # ValueError: non-numeric text or NaN; TypeError: None and other
    # non-string objects; OverflowError: infinities. Catching only these
    # keeps KeyboardInterrupt / SystemExit from being silently swallowed.
    except (TypeError, ValueError, OverflowError):
        return 0

In [None]:
num_columns=[3,4,5,6,7,8]

In [None]:
# Coerce every column whose index is listed in num_columns to int (NA -> 0);
# all other columns pass through unchanged and the column order is preserved.
step5=step4.map(lambda x: tuple(replace_na_0(val) if idx in num_columns else val
                                for idx, val in enumerate(x)))

In [None]:
step4.take(2)

* In the above output we can see the second record has NA values for a numerical column.
* This has been replaced in step5 and this is reflected in the following output

In [None]:
step5.take(2)

### Concat - bedrooms*, bathrooms* as bed_bath_rooms* 3b2bh

In [None]:
# Build the bed_bath_rooms label from the bedrooms and bathrooms values (fields 6 and 7)
def bed_n_bath_combined(val1,val2):
    """Return the label ``<val1>b<val2>bh``, e.g. ``3b2bh`` for ``(3, 2)``."""
    return "{!s}b{!s}bh".format(val1, val2)

In [None]:
# Replace the separate bedrooms (6) and bathrooms (7) fields with one combined
# label; the fields before and after keep their relative order.
step6=step5.map(lambda x: x[:6] + (bed_n_bath_combined(x[6],x[7]),) + x[8:12])

In [None]:
step5.take(2)

In [None]:
step6.take(2)

###  GroupBy zip,bed_bath_rooms* and avg, max, min


In [None]:
# Creating an rdd that contains only the zip, bed_bath_rooms and price columns
step7 = step6.map(lambda x: (x[3],x[6],x[4]))

In [None]:
step7.take(10)

In [None]:
# Grouping by zip and then bed_bath_rooms
step8 = step7.groupBy(lambda x: (x[0],x[1]))

In [None]:
step8.take(10)

In [None]:
step8.mapValues(list).take(2)

In [None]:
# Minimum per group: keep the (zip, bed_bath_rooms, price) row with the lowest price.
# zip and bed_bath_rooms are constant within a group, so only price decides.
step9= step8.map(lambda x: min(x[1], key=lambda row: row[2]))

In [None]:
step9.take(10)

In [None]:
# Maximum per group: keep the (zip, bed_bath_rooms, price) row with the highest price.
# zip and bed_bath_rooms are constant within a group, so only price decides.
step10= step8.map(lambda x: max(x[1], key=lambda row: row[2]))

In [None]:
step10.take(10)

In [None]:
# Mean of the third field (index 2) over the records of one group

def mean_val(x):
    """Return the average of ``record[2]`` over ``x``, rounded to 2 decimals.

    Args:
        x: Iterable of indexable records whose element at index 2 is numeric.

    Returns:
        The rounded mean.

    Raises:
        ZeroDivisionError: If ``x`` yields no records.
    """
    prices = [record[2] for record in x]
    total = 0
    for price in prices:
        total = price + total
    return round(total / len(prices), 2)

In [None]:
# aggregating by mean
step11= step8.map(lambda x: (x[0][0],x[0][1], mean_val(x[1])))

In [None]:
step11.take(10)

In [None]:
# Combined report row per group: (zip, bed_bath_rooms, average, minimum, maximum) of price
step12= step8.map(lambda x: (x[0][0], x[0][1], mean_val(x[1]),
                             min(row[2] for row in x[1]),
                             max(row[2] for row in x[1])))

In [None]:
step12.take(10)

### Saving outputs as csv files

In [None]:
# Header 
col1=["zip_code","bed_bath_rooms","min_price"]
col2=["zip_code","bed_bath_rooms","max_price"]
col3=["zip_code","bed_bath_rooms","avg_price"]
col=["zip_code","bed_bath_rooms","avg_price","min_price","max_price"]

# conversion to Data Frame
f1=step9.toDF(col1)
f2=step10.toDF(col2)
f3=step11.toDF(col3)
f=step12.toDF(col)

In [None]:
f1.show(10)

In [None]:
f2.show(10)

In [None]:
f3.show(10)

In [None]:
f.show(10)

In [None]:
# Write each report to its own CSV file. index=False keeps pandas' row index
# out of the files so they contain only the report columns.
f1.toPandas().to_csv("min.csv", index=False)
f2.toPandas().to_csv("max.csv", index=False)
f3.toPandas().to_csv("average.csv", index=False)
f.toPandas().to_csv("combined.csv", index=False)

### Saving combined output as text file

In [None]:
import os
# Convert the combined report (avg, min, max per group) to a pandas DataFrame
f4 = f.toPandas()
# Written with to_csv, so the .txt file holds comma-separated rows; index=False omits the row index
f4.to_csv('Output.txt', index=False)
# 'Output.txt' is a relative path, so print the current working directory it was written to
print('Successfully saved at ',os.getcwd())

### Submitted By:
* **Lakshmi V Aji         (20BDA09)**
* **Josmi Agnes Jose      (20BDA27)**
* **Aishwarya Nair M J    (20BDA42)**
* **Mariya Biju           (20BDA61)**
    