In [37]:
import pandas as pd
import json_lines
import numpy as np
# from regex import reg
import re
import types

def get_int_value(in_str):
    """Parse a number written with a decimal comma into a float.

    Despite the name, the result is always a ``float``: every ``,`` in
    ``in_str`` is replaced by ``.`` and the resulting text is passed to
    ``float``.

    Args:
        in_str: Numeric text such as ``"3,5"`` or ``"12"``.

    Returns:
        The parsed value as a ``float``.

    Raises:
        ValueError: if the converted text is not a valid float literal.
    """
    dotted = in_str.replace(',', '.')
    return float(dotted)

def _presence_flag(pattern):
    """Build the rule for a yes/no feature: 1 when ``pattern`` matches, else None."""
    return {"patterns": [pattern], "values": [1], "default": None}


# Extraction rules consumed by ``reg``. Each entry has three fields:
#   "patterns": regexes tried in order against the text,
#   "values":   result paired by position with each pattern; a function is
#               called with the first match, anything else is returned as-is,
#   "default":  result when none of the patterns match.
regex_pattern = {
    # Listing type, decided by keywords in the description.
    "Kind": {
        "patterns": ["(CCMN|[Cc]hung cư)", "[Hh]omestay"],
        "values": ["Chung cư", "Homestay"],
        "default": "Phòng trọ",
    },
    # First number in the text, optionally with a decimal comma.
    "Price": {
        "patterns": ["([0-9]+,[0-9]+|[0-9]+)"],
        "values": [get_int_value],
        "default": None,
    },
    # First run of digits in the text (any decimal part is not captured).
    "Size": {
        "patterns": ["[0-9]+"],
        "values": [get_int_value],
        "default": None,
    },
    # Keyword flags (Vietnamese): parking, air conditioning, water heater,
    # furniture, self-contained room, "total area" mentioned.
    "Parking_slot": _presence_flag("[Đđ]ể xe"),
    "Air_condition": _presence_flag("[Đđ]iều hòa"),
    "Heater_shower": _presence_flag("[Nn]óng lạnh"),
    "Furnish": _presence_flag("([Tt]ủ|[Ff]ull đồ|tivi|nội thất đầy đủ)"),
    "Inner_toilet": _presence_flag("khép kín"),
    "Size_total_bool": _presence_flag("[Tt]ổng diện tích"),
}

def reg(prop_type, data):
    """Derive the value of one listing property from free text.

    The rule for ``prop_type`` is looked up in the module-level
    ``regex_pattern`` table. Its patterns are tried in order against
    ``str(data)``; the first pattern that matches selects the entry stored at
    the same position in the rule's ``values`` list.

    Args:
        prop_type: Key into ``regex_pattern`` (for example ``"Price"``).
        data: Text to scan. Any object is accepted; it is converted with
            ``str`` first.

    Returns:
        The rule's ``default`` when no pattern matches. Otherwise, if the
        selected value is callable, the result of calling it with the first
        element returned by ``re.findall``; if not, the selected value itself.

    Raises:
        KeyError: if ``prop_type`` has no entry in ``regex_pattern``.
    """
    text = str(data)

    rule = regex_pattern[prop_type]
    patterns = rule["patterns"]
    values = rule["values"]
    default_value = rule["default"]

    for idx, pattern in enumerate(patterns):
        matches = re.findall(pattern, text)
        if not matches:
            continue
        value = values[idx]
        # callable() instead of an isinstance check against types.FunctionType:
        # builtins, classes and functools.partial objects are converters too,
        # and the old check returned them uncalled.
        if callable(value):
            return value(matches[0])
        return value
    return default_value

def read_data_from_jl(file_path):
    """Load every record of a JSON Lines file into a list.

    Args:
        file_path: Path of the ``.jl`` file to read.

    Returns:
        A list of the items yielded by ``json_lines.reader``, in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(file_path, 'rb') as f:
        # Materialise the reader here, while the file is still open.
        return list(json_lines.reader(f))

In [51]:
def get_alonhadat():
    """Build the listing table for the alonhadat source.

    Reads ``../raw_data/alonhadat.jl`` and returns a ``pandas.DataFrame`` with
    one row per record. ``Price`` and ``Size`` are extracted with ``reg`` from
    the record's ``price`` and ``area`` fields; ``Kind`` and the feature flags
    are extracted with ``reg`` from its ``description``.
    """
    listings = read_data_from_jl('../raw_data/alonhadat.jl')
    # Columns filled by scanning the description with the rule of the same name.
    flag_columns = (
        'Parking_slot',
        'Air_condition',
        'Heater_shower',
        'Furnish',
        'Inner_toilet',
        'Size_total_bool',
    )

    records = []
    for position, listing in enumerate(listings):
        text = listing['description']
        record = {
            'Idx': position,
            'Price': reg('Price', listing['price']),
            'Description': text,
            'Kind': reg('Kind', text),
            'Location': listing['location'],
            'Size': reg('Size', listing['area']),
        }
        for column in flag_columns:
            record[column] = reg(column, text)
        record['Source'] = 'alonhadat'
        records.append(record)
    return pd.DataFrame(records)


In [52]:
def get_chotot():
    """Build the listing table for the chotot source.

    Reads ``../raw_data/chotot.jl`` and returns a ``pandas.DataFrame``. Rows
    lacking ``ad.price``, ``ad.size`` or ``ad_params.address`` are skipped.
    ``Price`` is ``ad.price`` divided by 1,000,000 and ``Size`` is ``ad.size``
    as stored; ``Kind`` and the feature flags are extracted with ``reg`` from
    ``ad.body``. ``Idx`` is the position in the raw file, so skipped rows leave
    gaps.
    """
    raw_rows = read_data_from_jl('../raw_data/chotot.jl')
    # Columns filled by scanning the ad body with the rule of the same name.
    flag_columns = (
        'Parking_slot',
        'Air_condition',
        'Heater_shower',
        'Furnish',
        'Inner_toilet',
        'Size_total_bool',
    )

    records = []
    for position, raw in enumerate(raw_rows):
        ad = raw['ad']
        # ad_params is only inspected once price and size are known to exist.
        if not ('price' in ad and 'size' in ad and 'address' in raw['ad_params']):
            continue
        text = ad['body']
        record = {
            'Idx': position,
            'Price': ad['price'] / 1000000,
            'Description': text,
            'Kind': reg('Kind', text),
            'Location': raw['ad_params']['address']['value'],
            'Size': ad['size'],
        }
        for column in flag_columns:
            record[column] = reg(column, text)
        record['Source'] = 'chotot'
        records.append(record)
    return pd.DataFrame(records)


In [53]:
def get_phongtro123():
    """Build the listing table for the phongtro123 source.

    Reads ``../raw_data/phongtro123.jl`` and returns a ``pandas.DataFrame``
    with one row per record. ``Price`` and ``Size`` are extracted with ``reg``
    from the record's ``price`` and ``area`` fields; ``Kind`` and the feature
    flags are extracted with ``reg`` from its ``description``.
    """
    listings = read_data_from_jl('../raw_data/phongtro123.jl')
    # Columns filled by scanning the description with the rule of the same name.
    flag_columns = (
        'Parking_slot',
        'Air_condition',
        'Heater_shower',
        'Furnish',
        'Inner_toilet',
        'Size_total_bool',
    )

    records = []
    for idx, row in enumerate(listings):
        description = row['description']
        record = {
            'Idx': idx,
            'Price': reg('Price', row['price']),
            'Description': description,
            'Kind': reg('Kind', description),
            'Location': row['location'],
            'Size': reg('Size', row['area']),
        }
        for column in flag_columns:
            record[column] = reg(column, description)
        # The label must name the file read above so rows stay attributable
        # to their origin once several sources are concatenated.
        record['Source'] = 'phongtro123'
        records.append(record)
    return pd.DataFrame(records)


In [50]:
def get_thuephongtro():
    """Build the listing table for the thuephongtro source.

    Reads ``../raw_data/thuephongtro.jl`` and returns a ``pandas.DataFrame``
    with one row per record. ``Price`` and ``Size`` are extracted with ``reg``
    from the record's ``price`` and ``area`` fields; ``Kind`` and the feature
    flags are extracted with ``reg`` from its ``description``.
    """
    listings = read_data_from_jl('../raw_data/thuephongtro.jl')
    # Columns filled by scanning the description with the rule of the same name.
    flag_columns = (
        'Parking_slot',
        'Air_condition',
        'Heater_shower',
        'Furnish',
        'Inner_toilet',
        'Size_total_bool',
    )

    records = []
    for idx, row in enumerate(listings):
        description = row['description']
        record = {
            'Idx': idx,
            'Price': reg('Price', row['price']),
            'Description': description,
            'Kind': reg('Kind', description),
            'Location': row['location'],
            'Size': reg('Size', row['area']),
        }
        for column in flag_columns:
            record[column] = reg(column, description)
        # The label must name the file read above so rows stay attributable
        # to their origin once several sources are concatenated.
        record['Source'] = 'thuephongtro'
        records.append(record)
    return pd.DataFrame(records)

In [54]:
# Stack the four per-source tables into one. ignore_index=True gives the result
# a fresh 0..N-1 index; without it each source's own 0..n labels repeat, making
# label-based lookups such as df.loc[0] return several rows. The per-source
# position is still available in the 'Idx' column.
df = pd.concat(
    [get_alonhadat(), get_chotot(), get_phongtro123(), get_thuephongtro()],
    axis=0,
    ignore_index=True,
)

In [58]:
# Write the combined table to the working directory. The file keeps a .csv
# name but is tab-separated, and the DataFrame index is not written.
df.to_csv(
    'combined_data.csv',
    sep='\t',
    index=False,
)