In [1]:

import json
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump, load
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

In [2]:
# Load the processed table that holds every city; later cells split it by the `city` column.
all_df = pd.read_csv(
    filepath_or_buffer='../data/process_v1/process_data_6.csv',
)

In [3]:
# Baseline featureset (written out as "<prefix>_v0.json" by the per-city loop):
# the raw categorical listing attributes plus a small set of numeric columns.
basic_featureset = dict(
    cat_cols=[
        "typeOfRealEstate",
        "accessibility",
        "ward",
        "certificateOfLandUseRight",
        "houseDirection",
        "street",
        "district",
    ],
    num_cols=[
        "facade",
        "lon",
        "frontWidth",
        "lat",
        "w",
        "numberOfBathRooms",
        "landSize",
        "numberOfBedRooms",
        "num_of_atm_in_1000m_radius",
    ],
)

In [4]:
# Facility-count columns: one "num_of_<facility>_in_<radius>m_radius" entry per
# facility type and radius (500 / 1000 / 2000). The order is kept exactly as
# authored because it is serialised into the featureset JSON.
facility_cols = [
    "num_of_marketplace_in_1000m_radius",
    "num_of_bank_in_1000m_radius",
    "num_of_kindergarten_in_2000m_radius",
    "num_of_atm_in_2000m_radius",
    "num_of_school_in_2000m_radius",
    "num_of_kindergarten_in_500m_radius",
    "num_of_place_of_worship_in_500m_radius",
    "num_of_university_in_1000m_radius",
    "num_of_cafe_in_1000m_radius",
    "num_of_atm_in_500m_radius",
    "num_of_hospital_in_2000m_radius",
    "num_of_parking_entrance_in_2000m_radius",
    "num_of_restaurant_in_1000m_radius",
    "num_of_parking_entrance_in_500m_radius",
    "num_of_place_of_worship_in_1000m_radius",
    "num_of_parking_in_500m_radius",
    "num_of_marketplace_in_500m_radius",
    "num_of_university_in_500m_radius",
    "num_of_university_in_2000m_radius",
    "num_of_police_in_1000m_radius",
    "num_of_fuel_in_1000m_radius",
    "num_of_fast_food_in_500m_radius",
    "num_of_atm_in_1000m_radius",
    "num_of_hospital_in_500m_radius",
    "num_of_bank_in_500m_radius",
    "num_of_police_in_500m_radius",
    "num_of_bank_in_2000m_radius",
    "num_of_restaurant_in_500m_radius",
    "num_of_parking_in_2000m_radius",
    "num_of_fuel_in_2000m_radius",
    "num_of_parking_entrance_in_1000m_radius",
    "num_of_parking_in_1000m_radius",
    "num_of_restaurant_in_2000m_radius",
    "num_of_townhallcommunity_centre_in_1000m_radius",
    "num_of_school_in_500m_radius",
    "num_of_fast_food_in_2000m_radius",
    "num_of_townhallcommunity_centre_in_2000m_radius",
    "num_of_cafe_in_2000m_radius",
    "num_of_school_in_1000m_radius",
    "num_of_fuel_in_500m_radius",
    "num_of_police_in_2000m_radius",
    "num_of_hospital_in_1000m_radius",
    "num_of_cafe_in_500m_radius",
    "num_of_townhallcommunity_centre_in_500m_radius",
    "num_of_marketplace_in_2000m_radius",
    "num_of_place_of_worship_in_2000m_radius",
    "num_of_fast_food_in_1000m_radius",
    "num_of_kindergarten_in_1000m_radius",
]

# Featureset written out as "<prefix>_v4.json" by the per-city loop: the raw
# listing attributes followed by every facility-count column above.
basic_with_facility_featureset = dict(
    cat_cols=[
        "typeOfRealEstate",
        "accessibility",
        "ward",
        "certificateOfLandUseRight",
        "houseDirection",
        "street",
        "district",
    ],
    num_cols=[
        "facade",
        "lon",
        "frontWidth",
        "lat",
        "w",
        "numberOfBathRooms",
        "landSize",
        "numberOfBedRooms",
        *facility_cols,
    ],
)

In [5]:
# Per-city feature engineering.
#
# For each city code in `all_df['city']` (0 -> 'hcm', 1 -> 'hn') the loop below:
#   1. filters the listings and derives land-size ratio features,
#   2. rewrites column names to accent-free identifiers,
#   3. splits columns into categorical / numeric groups and writes the
#      featureset JSON files (v0, v4, v1, then v2 and v3 as features are added),
#   4. mean-imputes numeric columns, adds GMM component labels and PCA components,
#   5. writes process_data_7/8/9 CSVs and the fitted artefacts (means, GMMs, PCA).
#
# Relies on names defined in earlier cells: `all_df`, `basic_featureset`,
# `basic_with_facility_featureset`, and the imports from the first cell
# (`json`, `np`, `pd`, `dump`, `PCA`, `GaussianMixture`).

MAX_LAND_SIZE = 5000      # rows with a larger (or missing) landSize are dropped
CAT_FILL_VALUE = 100      # placeholder code written into missing categorical cells
TARGET_MIN = 10           # inclusive lower bound kept for `target`
TARGET_MAX = 100 * 10     # exclusive upper bound kept for `target`
RANDOM_STATE = 42

BASE_CAT_COLS = ['is_street_house', 'ward', 'street', 'district', 'typeOfRealEstate', 'houseDirection', 'accessibility', 'certificateOfLandUseRight']
BASE_REMOVE_COLS = ['price', 'description', 'city', 'endWidth']

# Column -> number of mixture components used to derive a categorical label.
gmm_config = {
    'num_of_restaurant_in_2000m_radius': 2,
    'num_of_restaurant_in_1000m_radius': 2,
    'num_of_bank_in_2000m_radius': 3,
    'num_of_marketplace_in_2000m_radius': 2,
    'num_of_cafe_in_1000m_radius': 2
}

# Accented letters (s1) and their plain replacements (s0), position by position.
s1 = 'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
s0 = 'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'
# Built once instead of scanning s1 with .index() for every character.
_ACCENT_TABLE = str.maketrans(dict(zip(s1, s0)))


def remove_accents(input_str):
    """Return `input_str` with accents folded and non-identifier characters dropped.

    Each character listed in `s1` is replaced by the character at the same
    position in `s0`; afterwards only '_' and alphanumeric characters are kept.
    """
    folded = input_str.translate(_ACCENT_TABLE)
    return "".join(ch for ch in folded if ch == '_' or ch.isalnum())


def _unique(items):
    """De-duplicate `items` while keeping first-seen order.

    Used instead of list(set(...)): set ordering of strings changes between
    Python processes, which made the saved column order non-reproducible.
    """
    return list(dict.fromkeys(items))


def _split_columns(columns, prefix, include_gmm=False):
    """Classify column names into the groups used by the featuresets.

    Parameters
    ----------
    columns : iterable of str
        Column names of the current frame.
    prefix : str
        City prefix; columns containing f'distance_{prefix}' go to remove_cols.
    include_gmm : bool
        When True, columns containing 'gmm' are also treated as categorical.

    Returns
    -------
    (cat_cols, target_cols, remove_cols, time_cols, num_cols) : tuple of lists
        num_cols is every column that falls in none of the other groups.
    """
    columns = list(columns)
    cat_cols = BASE_CAT_COLS + [
        c for c in columns
        if ('_district' in c or '_ward' in c or '_street' in c)
        and 'distance' not in c
        and 'num' not in c
    ]
    if include_gmm:
        cat_cols += [c for c in columns if 'gmm' in c]
    cat_cols = _unique(cat_cols)

    target_cols = ['target']
    remove_cols = _unique(BASE_REMOVE_COLS + [c for c in columns if f'distance_{prefix}' in c])
    time_cols = ['time']

    excluded = set(cat_cols) | set(target_cols) | set(remove_cols) | set(time_cols)
    num_cols = _unique(c for c in columns if c not in excluded)
    return cat_cols, target_cols, remove_cols, time_cols, num_cols


def _write_json(payload, path):
    """Serialise `payload` as JSON to `path`, overwriting any existing file."""
    with open(path, "w") as outfile:
        json.dump(payload, outfile)


for prefix, city in zip(['hcm', 'hn'], [0, 1]):

    # --- 1. Filter listings and derive ratio features -----------------------
    # .copy() gives an owned frame so the column assignments below do not
    # write into a slice of all_df (SettingWithCopyWarning).
    in_city = all_df['city'] == city
    small_enough = all_df['landSize'] <= MAX_LAND_SIZE
    df = all_df[in_city & small_enough].copy()

    df['is_street_house'] = (df['accessibility'] == 0).astype(np.int32)
    df['landSize_ratio'] = df['landSize'] / df['acreage']

    mean_land_size_df = (
        df.groupby('administrative_genre')['landSize']
        .mean()
        .reset_index()
        .rename(columns={'landSize': 'meanLandSize'})
    )
    df = df.merge(mean_land_size_df, how='left', on='administrative_genre')
    df['landSize_ratio_with_administrative_genre'] = df['landSize'] / df['meanLandSize']
    df['acreage_ratio_with_meanLandSize'] = df['acreage'] / df['meanLandSize']
    df = df.drop(columns='meanLandSize')  # only needed to build the two ratios

    mean_land_size_df.to_csv(f'{prefix}_mean_land_size_df.csv', index=False)

    # --- 2. Accent-free column names ----------------------------------------
    df = df.rename(columns={col: remove_accents(col) for col in df.columns})

    # --- 3. First column split and featuresets v0 / v4 / v1 -----------------
    cat_cols, target_cols, remove_cols, time_cols, num_cols = _split_columns(df.columns, prefix)
    print(len(cat_cols), len(target_cols), len(remove_cols), len(num_cols))

    _write_json(basic_featureset, f"../data/featureset/{prefix}_v0.json")
    _write_json(basic_with_facility_featureset, f"../data/featureset/{prefix}_v4.json")

    featureset = {
        "cat_cols": cat_cols,
        "num_cols": num_cols,
    }
    _write_json(featureset, f"../data/featureset/{prefix}_v1.json")

    # --- 4. Clean values, filter on target, mean-impute ---------------------
    df[cat_cols] = df[cat_cols].fillna(CAT_FILL_VALUE).astype(np.int32)

    df = df[cat_cols + num_cols + time_cols + target_cols]

    # The range test also rejects NaN and +/-inf targets, so the original
    # explicit inf checks are covered by these two comparisons.
    target = df[target_cols[0]]
    df = df[(target >= TARGET_MIN) & (target < TARGET_MAX)].copy()

    mean_dict = {}
    for col in num_cols:
        col_mean = float(df[col].mean())  # plain float so json.dump accepts it
        mean_dict[col] = col_mean
        df[col] = df[col].fillna(col_mean)

    _write_json(mean_dict, f"./{prefix}_mean_num_col.json")

    df.to_csv(f'../data/process_v1/process_data_7_{prefix}.csv', index=False)

    # --- 5. GMM component labels -> featureset v2 ---------------------------
    gmm_dict = {}
    for col, n_components in gmm_config.items():
        values = df[col].values.reshape(-1, 1)
        gmm = GaussianMixture(n_components=n_components, random_state=RANDOM_STATE)
        gmm.fit(values)
        df[f'gmm_{n_components}_component_{col}'] = gmm.predict(values)

        gmm_dict[col] = gmm
        dump(gmm, f"./{prefix}_gmm_{col}.joblib")

    cat_cols, target_cols, remove_cols, time_cols, num_cols = _split_columns(
        df.columns, prefix, include_gmm=True
    )
    print(len(cat_cols), len(target_cols), len(remove_cols), len(num_cols))

    featureset = {
        "cat_cols": cat_cols,
        "num_cols": num_cols,
    }
    _write_json(featureset, f"../data/featureset/{prefix}_v2.json")

    df = df.reset_index(drop=True)
    df.to_csv(f'../data/process_v1/process_data_8_{prefix}.csv', index=False)

    # --- 6. PCA components -> featureset v3 ---------------------------------
    # random_state pins the result when scikit-learn selects a randomized solver.
    pca = PCA(n_components=2, random_state=RANDOM_STATE)
    X_pca = pca.fit_transform(df[num_cols])

    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names, index=df.index)

    dump(pca, f"./{prefix}_pca.joblib")

    # NOTE(review): the original never attached X_pca to df, which made the v3
    # featureset and process_data_9 copies of v2 / process_data_8. Joining the
    # components here is the presumed intent - confirm with downstream users.
    df = df.join(X_pca)

    cat_cols, target_cols, remove_cols, time_cols, num_cols = _split_columns(
        df.columns, prefix, include_gmm=True
    )
    print(len(cat_cols), len(target_cols), len(remove_cols), len(num_cols))

    featureset = {
        "cat_cols": cat_cols,
        "num_cols": num_cols,
    }
    _write_json(featureset, f"../data/featureset/{prefix}_v3.json")

    df.to_csv(f'../data/process_v1/process_data_9_{prefix}.csv', index=False)


35 1 53 148


NameError: name 'json' is not defined

In [None]:
# Inspect the columns of `df` as left behind by the last iteration of the per-city loop.
list(df.columns)