In [4]:
import json
import numpy as np
import pandas as pd
from pyspark import *
from pyspark.ml import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from kafka import KafkaConsumer, KafkaProducer

In [5]:
#   Transformers
class ColumnRenamer(Estimator, Transformer):
    """Pipeline stage that renames one column of a Spark DataFrame.

    Args:
        columnsNameOld: current name of the column.
        columnsNameNew: name the column should have after the stage runs.
    """

    def __init__(self, columnsNameOld = "", columnsNameNew = ""):
        self.columnsNameOld, self.columnsNameNew = columnsNameOld, columnsNameNew

    def _fit(self, X, y = None):
        # Nothing is learned from X; the stage serves as its own fitted model.
        return self

    def _transform(self, X):
        """Return ``X`` with ``columnsNameOld`` renamed to ``columnsNameNew``."""
        old_name = self.columnsNameOld
        new_name = self.columnsNameNew
        return X.withColumnRenamed(old_name, new_name)

class ColumnDropper(Estimator, Transformer):
    """Pipeline stage that removes a list of columns from a Spark DataFrame.

    Args:
        columnsName: iterable of column names to drop, one ``drop`` call each.
    """

    def __init__(self, columnsName = ["id_region"]):
        self.columnsName = columnsName

    def _fit(self, X, y = None):
        # Nothing is learned from X; the stage serves as its own fitted model.
        return self

    def _transform(self, X):
        """Return ``X`` without any of the configured columns."""
        remaining = X
        for column_name in self.columnsName:
            remaining = remaining.drop(column_name)
        return remaining

class ColumnTransformer(Estimator, Transformer):
    """Pipeline stage that rewrites selected values and fixes column types.

    Works on the ``object_type``, ``rooms`` and ``kitchen_area`` columns:
    first specific values are rewritten, then each column is cast to its
    final type (boolean, integer and float respectively).
    """

    def _fit(self, X, y = None):
        # Nothing is learned from X; the stage serves as its own fitted model.
        return self

    def _transform(self, X):
        """Return ``X`` with the three columns rewritten and re-typed."""
        frame = X

        # object_type: values ending in '2' get every '2' replaced by '1'.
        type_ends_in_two = frame.object_type.endswith('2')
        type_rewritten = regexp_replace(frame.object_type, '2', '1')
        frame = frame.withColumn(
            'object_type',
            when(type_ends_in_two, type_rewritten).otherwise(frame.object_type))

        # rooms: values ending in '-1' get every '-1' replaced by '0'.
        rooms_ends_in_minus_one = frame.rooms.endswith('-1')
        rooms_rewritten = regexp_replace(frame.rooms, '-1', '0')
        frame = frame.withColumn(
            'rooms',
            when(rooms_ends_in_minus_one, rooms_rewritten).otherwise(frame.rooms))

        # kitchen_area: rows equal to '-100.0' get '-100' replaced by '0'.
        kitchen_is_sentinel = frame.kitchen_area == '-100.0'
        kitchen_rewritten = regexp_replace(frame.kitchen_area, '-100', '0')
        frame = frame.withColumn(
            'kitchen_area',
            when(kitchen_is_sentinel, kitchen_rewritten).otherwise(frame.kitchen_area))

        # Final types, applied in the same order as the rewrites above.
        # object_type is cast to boolean (not integer).
        final_types = (
            ('object_type', BooleanType()),
            ('rooms', IntegerType()),
            ('kitchen_area', FloatType()),
        )
        for column_name, spark_type in final_types:
            frame = frame.withColumn(column_name, frame[column_name].cast(spark_type))
        return frame

class RecoveryDataTransformer(Estimator,Transformer):
    """Pipeline stage that fills missing values with the column median.

    For every configured column the non-null values are collected to the
    driver, their median is computed with NumPy, and nulls in that column
    are replaced by the median.

    Args:
        columnsName: iterable of column names to impute. Empty names (such
            as the ``""`` in the default) are skipped.
    """

    def __init__(self, columnsName = [""]):
        self.columnsName = columnsName

    def find_median(self, values_list) -> float:
        """Return the median of ``values_list`` as a float, ignoring NaN entries."""
        result_median = np.nanmedian(np.array(values_list, dtype=float))
        return float(result_median)

    def _fit(self, X, y = None):
        # Nothing is learned up front; medians are computed in _transform.
        return self

    def _transform(self, X):
        """Return ``X`` with nulls in the configured columns replaced by the median."""
        Xaux = X

        for field in self.columnsName:
            if not field:
                # An empty name refers to no column; selecting it would fail.
                continue

            # Nulls are exactly what must be imputed, so they cannot take part
            # in the median. Values stay floats so fractional columns are not
            # truncated before the median is taken.
            observed = [
                float(row[field])
                for row in Xaux.select(field).collect()
                if row[field] is not None
            ]
            if not observed:
                # Column is entirely null: there is no median to fill with.
                continue

            median_result = self.find_median(observed)
            if np.isnan(median_result):
                # Every observed value was NaN; filling with NaN helps nobody.
                continue

            # Spark DataFrames are immutable: fillna returns a NEW frame, so
            # the result has to be re-bound or the imputation is lost.
            Xaux = Xaux.fillna(value=median_result, subset=[field])

        return Xaux

class LineDistincter(Estimator, Transformer):
    """Pipeline stage that removes duplicate rows from a Spark DataFrame."""

    def _fit(self, X, y = None):
        # Nothing is learned from X; the stage serves as its own fitted model.
        return self

    def _transform(self, X):
        """Return the distinct rows of ``X``."""
        unique_rows = X.distinct()
        return unique_rows

class LineDropper(Estimator, Transformer):
    """Pipeline stage that removes rows containing null values."""

    def _fit(self, X, y = None):
        # Nothing is learned from X; the stage serves as its own fitted model.
        return self

    def _transform(self, X):
        """Return ``X`` without the rows that hold a null in any column."""
        # DataFrame.dropna() is the alias of DataFrame.na.drop().
        return X.dropna()


In [None]:
####################### Train model #######################
# Base URL of the HDFS namenode that hosts the training CSV.
hdfsurl="hdfs://10.84.128.47:9000"

spark = SparkSession \
        .builder \
        .appName("House Prices") \
        .getOrCreate()
df = spark.read.load(
    hdfsurl + "/grupo8/input_data_1000_lines.csv",
    format="csv", sep=";", inferSchema="true", header="true")
# Make the label a double regardless of what schema inference decided.
df = df.withColumn("price",df.price.cast(DoubleType()))

### Cleaning
to_encode_col_names = ['price','level','levels','rooms','area','kitchen_area','building_type','object_type','id_region']
to_remove_col_names = ['date','house_id','postal_code','geo_lat','geo_lon'] # ,'street_id'
to_recover_col_names = ['price','level','levels','rooms','area','kitchen_area','building_type']

# Drop unused columns, impute medians, de-duplicate, drop rows that still
# contain nulls, then rewrite/cast the categorical and numeric columns.
df_clean = Pipeline(stages=[
    ColumnDropper(to_remove_col_names),
    RecoveryDataTransformer(to_recover_col_names),
    LineDistincter(),
    LineDropper(),
    ColumnTransformer(),
]).fit(df).transform(df)

# The model is trained with scikit-learn, so bring the cleaned data to pandas.
data = df_clean.toPandas()

# Select the label by name: a positional lookup would silently train on the
# wrong target if the CSV column order ever changed.
y = data['price']
data = data.drop('price', axis=1)
x = data.iloc[:, 0:9]

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=0)

model = linear_model.Ridge(alpha = 300)
model.fit(X_train, y_train)

In [None]:
def cleanData(data):
    """Prepare a streamed pandas DataFrame for prediction, in place.

    Drops the identifier, label and location/date columns, then rewrites
    specific values: ``object_type`` 2 -> 1, ``rooms`` -1 -> 0 and
    ``kitchen_area`` -100 -> 0.

    Args:
        data: DataFrame holding at least the dropped columns plus
            ``object_type``, ``rooms`` and ``kitchen_area``. It is modified
            in place.

    Returns:
        None.

    Raises:
        KeyError: if one of the expected columns is missing.
    """
    data.drop(
        columns=['id', 'price', 'date', 'house_id', 'postal_code', 'geo_lat', 'geo_lon'],
        inplace=True,
    )

    # Assign the replaced column back instead of calling
    # ``data[col].replace(..., inplace=True)``: that chained form acts on an
    # intermediate Series and leaves ``data`` untouched under pandas
    # Copy-on-Write.
    data["object_type"] = data["object_type"].replace({2: 1})
    data["rooms"] = data["rooms"].replace({-1: 0})
    data["kitchen_area"] = data["kitchen_area"].replace({-100: 0})

# Kafka consumer subscribed to the 'streaming-views-input' topic.
# auto_offset_reset='earliest' asks the client to begin at the oldest record
# when it has no valid offset to resume from (see the kafka-python docs).
consumer = KafkaConsumer(
    'streaming-views-input',
    auto_offset_reset='earliest',
    bootstrap_servers='172.25.24.45:9092'
)

def serializer(message):
    """Encode ``message`` as JSON text and return it as UTF-8 bytes."""
    json_text = json.dumps(message)
    return json_text.encode('utf-8')

# Kafka producer for the feedback messages (the loop below publishes to the
# 'streaming-feedback-bd' topic); every value goes through `serializer`.
producer = KafkaProducer(
    value_serializer=serializer,
    bootstrap_servers='172.25.24.45:9092'
)

## Kafka loop
# For every incoming record: predict a price with the trained model, compare
# it with the advertised price and publish a textual verdict.
for record in consumer:
    # Decode the payload once; json_normalize already returns a DataFrame.
    new_data = pd.json_normalize(json.loads(record.value))

    if 'id' not in new_data.columns:
        # Records without an id are not scored.
        continue

    #################### New data prediction
    # Read id and price by name (not by position) and before cleanData drops
    # both columns. Only the first row is reported, matching predPrice[0].
    record_id = str(new_data['id'].iloc[0])
    atual_price = str(new_data['price'].iloc[0])

    cleanData(new_data)

    predPrice = model.predict(new_data)

    feedback = 'ID: ' + record_id
    print(record_id)

    pred = predPrice[0].round(1)
    # A prediction above the advertised price is reported as a good price.
    if float(pred) > float(atual_price):
        feedback += ' has a Good Price!'
    else:
        feedback += ' has a Bad Price!'

    feedback += ' House price: ' + atual_price + ' | Predition: ' + str(np.abs(pred))
    print(feedback)
    producer.send('streaming-feedback-bd', feedback)