In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw training and test tables from the working directory.
train, test = [pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')]

In [3]:
# Load all auxiliary (extra) datasets in this cell.

population_info = pd.read_csv('auxiliary-data/sg-population-demographics.csv')

In [4]:
# author by ZhangML
# todo
def addPopulationInfo(dataset, populationInfo):
    """Attach per-subzone population totals for four age bands to ``dataset``.

    ``populationInfo`` is aggregated per (``plannin_area``, ``subzone``) pair:
    its ``count`` values are summed separately for the age groups
    0-19 (``underadult``), 20-39 (``young``), 40-59 (``mid``) and
    60 and above (``old``). The totals are then left-joined onto ``dataset``
    on ``planning_area`` and ``subzone``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``planning_area`` and ``subzone`` columns.
    populationInfo : pandas.DataFrame
        Must contain ``plannin_area`` (spelled that way in this table),
        ``subzone``, ``age_group`` and ``count`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row of ``dataset`` plus the columns
        ``underadult``, ``young``, ``mid`` and ``old``. Rows whose
        (planning_area, subzone) pair is absent from ``populationInfo``
        get NaN in those four columns.
    """
    age_bands = {
        'underadult': ['0-4', '5-9', '10-14', '15-19'],
        'young': ['20-24', '25-29', '30-34', '35-39'],
        'mid': ['40-44', '45-49', '50-54', '55-59'],
        'old': ['60-64', '65-69', '70-74', '75-79', '80-84', '85+'],
    }
    group_keys = ['plannin_area', 'subzone']

    # One helper column per band: the row's count if its age group belongs to
    # the band, otherwise 0. Summing these per group yields every band total in
    # a single pass, and a group with no rows in a band still gets a 0 total.
    band_counts = {
        band: populationInfo['count'].where(populationInfo['age_group'].isin(ages), 0)
        for band, ages in age_bands.items()
    }
    totals = (
        populationInfo[group_keys]
        .assign(**band_counts)
        .groupby(group_keys, as_index=False)
        .sum()
        # Align the key name with the one used by ``dataset``.
        .rename(columns={'plannin_area': 'planning_area'})
    )

    return pd.merge(dataset, totals, how='left', on=['planning_area', 'subzone'])

In [5]:
# author by Li Xingchen

import pandas as pd
from geopy.distance import distance

def getProcessedDataset(dataset1,datasetName2) :
  """Append proximity features for one auxiliary location table to ``dataset1``.

  Reads ``auxiliary-data/<datasetName2>.csv`` (which must provide ``lat`` and
  ``lng`` columns) and, for every row of ``dataset1`` (located through its
  ``latitude`` / ``longitude`` columns), measures ``distance(...).km`` from
  geopy to every auxiliary location. Four columns are appended:

  * ``<datasetName2>_nearestDistance/KM`` - smallest distance found
  * ``<datasetName2>_lessHalfKMNum``      - locations closer than 0.5 km
  * ``<datasetName2>_half2OneKMNum``      - locations at 0.5 km or more but under 1 km
  * ``<datasetName2>_one2ThreeKMNum``     - locations at 1 km or more but under 3 km

  If the auxiliary table has no rows, the nearest-distance column keeps the
  large sentinel value it is initialised with.

  Parameters
  ----------
  dataset1 : pandas.DataFrame
      Rows to enrich; must contain ``latitude`` and ``longitude``.
  datasetName2 : str
      Base name (without ``.csv``) of the file under ``auxiliary-data/``.

  Returns
  -------
  pandas.DataFrame
      ``dataset1`` with the four feature columns concatenated on the right.
  """

  dataset2 = pd.read_csv('auxiliary-data/'+datasetName2+'.csv')

  # Extract the auxiliary coordinates once instead of re-running
  # DataFrame.iterrows() over dataset2 for every single row of dataset1.
  auxiliaryLocations = list(zip(dataset2['lat'], dataset2['lng']))

  nearestDistanceColName = datasetName2+'_nearestDistance/KM'
  lessHalfKMNumColName  = datasetName2+'_lessHalfKMNum'
  half2OneKMNumColName  = datasetName2+'_half2OneKMNum'
  one2ThreeKMNumColName  = datasetName2+'_one2ThreeKMNum'

  def summariseLocation(location) :
    # Scan every auxiliary location and collect the four features for one point.
    nearestDistance = 99999999999999999999.99
    lessHalfKMNum = 0
    half2OneKMNum = 0
    one2ThreeKMNum = 0

    for auxiliaryLocation in auxiliaryLocations :
      distance_between = distance(location, auxiliaryLocation).km

      if distance_between < nearestDistance :
        nearestDistance = distance_between

      if distance_between < 0.5 :
        lessHalfKMNum += 1
      elif distance_between < 1 :
        half2OneKMNum += 1
      elif distance_between < 3 :
        one2ThreeKMNum += 1

    return (nearestDistance, lessHalfKMNum, half2OneKMNum, one2ThreeKMNum)

  # Rows that share the same coordinates produce the same features, so each
  # distinct (latitude, longitude) pair is evaluated only once.
  featuresByLocation = {}
  featureRows = []
  for location in zip(dataset1['latitude'], dataset1['longitude']) :
    features = featuresByLocation.get(location)
    if features is None :
      features = summariseLocation(location)
      featuresByLocation[location] = features
    featureRows.append(features)

  # Build all four columns in one go, aligned to dataset1's own index, rather
  # than enlarging four Series one element at a time.
  featureFrame = pd.DataFrame(
    featureRows,
    index=dataset1.index,
    columns=[nearestDistanceColName, lessHalfKMNumColName,
             half2OneKMNumColName, one2ThreeKMNumColName],
  )

  return pd.concat([dataset1, featureFrame], axis=1)

def concatAdditionalInfo(dataset) :
  """Enrich ``dataset`` with proximity features from every auxiliary table.

  Each name below is handed to ``getProcessedDataset`` in turn, feeding the
  output of one call into the next, and the final frame is returned.
  """
  auxiliaryNames = (
    'sg-primary-schools',
    'sg-commerical-centres',
    'sg-secondary-schools',
    'sg-shopping-malls',
    'sg-train-stations',
    'sg-gov-markets-hawker-centres',
  )

  result = dataset
  for auxiliaryName in auxiliaryNames :
    result = getProcessedDataset(result,auxiliaryName)

  return result


In [6]:
# author by Wang Tong

from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

# Target-encoded training columns, refreshed every time preprocessData() is
# called on data that contains 'resale_price'.
storey_range_target = pd.DataFrame()
flat_model_target = pd.DataFrame()
planning_area_target = pd.DataFrame()

# Columns replaced by their target encoding, in the order they are processed.
_TARGET_ENCODED_COLUMNS = ('storey_range', 'flat_model', 'planning_area')

# Encoders fitted on the data that carried 'resale_price', keyed by column
# name, so data without a target is transformed with the same mapping.
_fitted_encoders = {}


def preprocessData(dataset):
    """Clean ``dataset`` and encode its categorical columns.

    Steps, in order:

    1. Keep only rows with ``floor_area_sqm`` > 0, 1.2 < ``latitude`` < 1.5
       and 103 < ``longitude`` < 104.
    2. Replace every ``-`` in ``flat_type`` with a space.
    3. Convert ``month`` to whole seconds since 1970-01-01.
    5. Add one one-hot column per ``flat_type`` category.
    6. Replace ``storey_range``, ``flat_model`` and ``planning_area`` by their
       target encoding and move them to the front of the frame.

    The function is stateful. When ``dataset`` contains ``resale_price`` the
    encoders are fitted on it, stored in ``_fitted_encoders``, and the encoded
    training columns are published in the module-level ``*_target`` frames.
    When ``resale_price`` is absent the stored encoders are used to transform
    the data, so data with the target must be processed first.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``floor_area_sqm``, ``latitude``, ``longitude``,
        ``flat_type``, ``month``, ``storey_range``, ``flat_model`` and
        ``planning_area``; ``resale_price`` is optional.

    Returns
    -------
    pandas.DataFrame
        A new, processed frame; the input frame is not modified.

    Raises
    ------
    RuntimeError
        If ``resale_price`` is absent and no encoders have been fitted yet.
    """
    global storey_range_target, flat_model_target, planning_area_target

    has_target = 'resale_price' in dataset.columns
    if not has_target:
        missing = [name for name in ('flat_type',) + _TARGET_ENCODED_COLUMNS
                   if name not in _fitted_encoders]
        if missing:
            raise RuntimeError(
                'preprocessData must first be called on data containing '
                "'resale_price'; no fitted encoder for: " + ', '.join(missing))

    # 1. Drop outliers if any.
    # Both bounds must hold at once: joining them with `|` accepts every
    # non-NaN coordinate and therefore filters nothing.
    # NOTE(review): rows are removed whether or not the target is present;
    # confirm that dropping rows from unlabelled data is acceptable.
    keep = (
        (dataset['floor_area_sqm'] > 0)
        & (dataset['latitude'] > 1.2) & (dataset['latitude'] < 1.5)
        & (dataset['longitude'] > 103) & (dataset['longitude'] < 104)
    )
    # Explicit copy so the column assignments below act on an independent frame.
    dataset = dataset[keep].copy()

    # 2. Align naming conventions for 'flat_type'
    dataset['flat_type'] = dataset['flat_type'].replace('-', ' ', regex=True)

    # 3. Change Datetime format for 'month'
    # https://stackoverflow.com/questions/54313463/pandas-datetime-to-unix-timestamp-seconds
    dataset['month'] = pd.to_datetime(dataset['month'])
    dataset['month'] = (dataset['month'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

    # 4. (IGNORE) Combining 'block' and 'street_name' into 1 column
    #dataset['block_street_name'] = dataset['block'].str.upper() + ' ' + dataset['street_name'].str.upper()
    #dataset = dataset.drop(columns=['block', 'street_name'])

    # 5. OneHotEncoder: 'flat_type'
    # Fit once on the data with the target and reuse afterwards, so every
    # processed frame ends up with the same one-hot columns; a category unseen
    # during fitting is encoded as all zeros.
    if has_target:
        one_hot = OneHotEncoder(handle_unknown='ignore')
        encoded = one_hot.fit_transform(dataset[['flat_type']])
        _fitted_encoders['flat_type'] = one_hot
    else:
        one_hot = _fitted_encoders['flat_type']
        encoded = one_hot.transform(dataset[['flat_type']])
    column_headers = np.hstack(one_hot.categories_)
    dataset[column_headers] = encoded.toarray()

    # 6. TargetEncoder: 'storey_range', 'flat_model', 'planning_area'
    if has_target:
        encoded_frames = {}
        for column in _TARGET_ENCODED_COLUMNS:
            encoder = ce.TargetEncoder()
            encoded_frames[column] = encoder.fit_transform(dataset[[column]], dataset['resale_price'])
            _fitted_encoders[column] = encoder
            # Encoded column first, remaining columns after it.
            dataset = encoded_frames[column].join(dataset.drop(column, axis=1))
        storey_range_target = encoded_frames['storey_range']
        flat_model_target = encoded_frames['flat_model']
        planning_area_target = encoded_frames['planning_area']
    else:
        # Transform this frame's own rows with the fitted mapping; joining the
        # stored training frames would attach training rows' values by index.
        for column in _TARGET_ENCODED_COLUMNS:
            encoded_column = _fitted_encoders[column].transform(dataset[[column]])
            dataset = encoded_column.join(dataset.drop(column, axis=1))

    # 7. Drop unused columns
    # dataset = dataset.drop(columns=['elevation', 'eco_category', 'town', 'flat_type',
    #                                 'storey_range', 'flat_model', 'region','block',
    #                                 'subzone', 'street_name', 'latitude', 'longitude'])

    return dataset
# Run the feature-engineering stages one after another, handling `train`
# before `test` at every stage. preprocessData records its target encodings
# when 'resale_price' is present and reads them back when it is not, so the
# frame carrying the target (presumably `train`) has to go first.
pipeline_stages = (
    lambda frame: addPopulationInfo(frame, population_info),
    concatAdditionalInfo,
    preprocessData,
)
for stage in pipeline_stages:
    train = stage(train)
    test = stage(test)

  if sys.path[0] == "":
  
  


KeyboardInterrupt: 

In [None]:
# Write both processed frames to CSV files in the working directory.
for processed_frame, csv_path in ((train, 'processed_train.csv'), (test, 'processed_test.csv')):
    processed_frame.to_csv(csv_path)