# Preprocessing data

## Preprocessing original data to only include 2021 data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw flat-price records; the path is relative to the notebook's working directory.
original_data = pd.read_csv("./Flat prices.csv")

In [3]:
# Parse the 'month' column into datetimes so it can be filtered by date in the next step.
original_data['month']= pd.to_datetime(original_data['month'])

In [4]:
# Keep only the rows whose month falls in calendar year 2021. Comparing on the
# year (rather than an open-ended date cutoff) guarantees that records from any
# later year present in the file are excluded as well.
data_2021 = original_data[original_data["month"].dt.year == 2021]

In [5]:
# Shuffle the rows. A fixed random_state keeps the resulting row order (and
# therefore the position of every property in the list built from this frame)
# identical across kernel restarts.
data_2021 = data_2021.sample(frac=1, random_state=42)
data_2021

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
88008,2021-01-01,BUKIT MERAH,4 ROOM,5,DELTA AVE,07 TO 09,91.0,New Generation,1985,63 years 10 months,640000.0
88994,2021-01-01,PUNGGOL,4 ROOM,619D,PUNGGOL DR,10 TO 12,93.0,Premium Apartment,2013,91 years 08 months,463000.0
88213,2021-01-01,CHOA CHU KANG,4 ROOM,807B,CHOA CHU KANG AVE 1,10 TO 12,92.0,Model A,2017,95 years 04 months,465000.0
88337,2021-01-01,CLEMENTI,3 ROOM,608,CLEMENTI WEST ST 1,04 TO 06,67.0,New Generation,1979,57 years 11 months,338000.0
92171,2021-02-01,YISHUN,3 ROOM,235,YISHUN ST 21,07 TO 09,67.0,New Generation,1985,63 years 05 months,305000.0
...,...,...,...,...,...,...,...,...,...,...,...
91581,2021-02-01,SENGKANG,3 ROOM,472B,FERNVALE ST,10 TO 12,68.0,Model A,2016,94 years 08 months,363000.0
88704,2021-01-01,JURONG WEST,5 ROOM,662A,JURONG WEST ST 64,04 TO 06,110.0,Improved,2001,79 years 06 months,520000.0
89403,2021-01-01,SENGKANG,4 ROOM,185C,RIVERVALE CRES,10 TO 12,85.0,Model A2,2000,78 years 11 months,370000.0
91117,2021-02-01,KALLANG/WHAMPOA,3 ROOM,33,JLN BAHAGIA,01 TO 03,56.0,Standard,1970,48 years 05 months,218000.0


## Preprocessing data by removing unneeded columns for GA

### The columns to be removed are: month, block, street_name, lease_commence_date, remaining_lease, storey_range, flat_model

In [6]:
# Remove the columns the GA does not use; town, flat_type, floor_area_sqm and
# resale_price are the ones left afterwards.
data_2021 = data_2021.drop(
    columns=[
        "month",
        "block",
        "street_name",
        "lease_commence_date",
        "remaining_lease",
        "storey_range",
        "flat_model",
    ]
)
data_2021

Unnamed: 0,town,flat_type,floor_area_sqm,resale_price
88008,BUKIT MERAH,4 ROOM,91.0,640000.0
88994,PUNGGOL,4 ROOM,93.0,463000.0
88213,CHOA CHU KANG,4 ROOM,92.0,465000.0
88337,CLEMENTI,3 ROOM,67.0,338000.0
92171,YISHUN,3 ROOM,67.0,305000.0
...,...,...,...,...
91581,SENGKANG,3 ROOM,68.0,363000.0
88704,JURONG WEST,5 ROOM,110.0,520000.0
89403,SENGKANG,4 ROOM,85.0,370000.0
91117,KALLANG/WHAMPOA,3 ROOM,56.0,218000.0


In [7]:
# data_2021 = data_2021.loc[data_2021['town'] == "TAMPINES"]
# data_2021

### Converting the data_2021 DataFrame into a list of dictionaries (one per property) for the GA

In [8]:
# Build a list with one dict per row (column name -> value). The position of a
# record in this list is the index the GA functions below use to look it up.
items = data_2021.to_dict(orient='records')
items[0:2] # first 2 records of the list

[{'town': 'BUKIT MERAH',
  'flat_type': '4 ROOM',
  'floor_area_sqm': 91.0,
  'resale_price': 640000.0},
 {'town': 'PUNGGOL',
  'flat_type': '4 ROOM',
  'floor_area_sqm': 93.0,
  'resale_price': 463000.0}]

# Defining parameters for GA and generating initial population

In [9]:
from numpy.random import randint
import random
# --- GA hyper-parameters ---
n_population = 100              # number of individuals in every generation
n_dimensions = len(items)       # gene length: one 0/1 flag per property record
n_iterations = 10               # number of generations to evolve
probability_mutation = 0.00001  # chance that any single gene is flipped
probability_crossover = 0.1     # chance that a pair of parents is recombined

# Seed both random sources used by the GA (the stdlib `random` module for
# shuffling and NumPy's global generator behind randint/rand) so that a fresh
# "Restart & Run All" reproduces the same evolution.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)



# Generation 0 must be random: every individual is a shuffled 0/1 list with a fixed number of 1s (selected properties)

def x_generator(x, n):
    """Return a randomly ordered list holding ``x`` ones and ``n - x`` zeros.

    The ones and zeros are concatenated and then shuffled in place with
    ``random.shuffle``. When ``x`` is larger than ``n`` no zeros are added,
    so the result simply contains ``x`` ones.
    """
    genes = [1] * x + [0] * (n - x)
    random.shuffle(genes)
    return genes

# test = x_generator(10, n_dimensions)
# test
# Generation 0: n_population random individuals, each built with 10 ones
# (i.e. 10 selected properties) spread over n_dimensions genes.
population = [x_generator(10, n_dimensions) for _ in range(n_population)]
# Print the first two individuals as a quick sanity check.
print(population[0:2])

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Rules/Scenario for the fitness function

<p>You are a real estate broker in Singapore tasked with providing a high-profile client with property recommendations within a fixed budget.</p>

<p>You must recommend a set of 10 properties from the overall 2021 real estate market in Singapore, subject to the following conditions:</p>

<ul>
    <li>The client would prefer properties within 'BUKIT MERAH', 'ANG MO KIO', 'TAMPINES', 'YISHUN', 'PASIR RIS',
       'SENGKANG', 'GEYLANG', 'KALLANG/WHAMPOA', 'PUNGGOL', or 'BEDOK' </li>
    <li>If the properties are not within those locations, the client would hesitate to buy them </li>
    <li>Since the client wants to rent out rooms within the properties, multi-room properties would be preferred </li>
    <li>More floor space means more spacious rooms, so the client prefers properties with a larger floor area </li>
    <li>The client has a budget of 6 million SGD; the client will not buy any set of 10 properties whose total price exceeds that budget </li>
</ul>

In [10]:
# Inspect the first 11 property records.
items[0:11]

[{'town': 'BUKIT MERAH',
  'flat_type': '4 ROOM',
  'floor_area_sqm': 91.0,
  'resale_price': 640000.0},
 {'town': 'PUNGGOL',
  'flat_type': '4 ROOM',
  'floor_area_sqm': 93.0,
  'resale_price': 463000.0},
 {'town': 'CHOA CHU KANG',
  'flat_type': '4 ROOM',
  'floor_area_sqm': 92.0,
  'resale_price': 465000.0},
 {'town': 'CLEMENTI',
  'flat_type': '3 ROOM',
  'floor_area_sqm': 67.0,
  'resale_price': 338000.0},
 {'town': 'YISHUN',
  'flat_type': '3 ROOM',
  'floor_area_sqm': 67.0,
  'resale_price': 305000.0},
 {'town': 'PUNGGOL',
  'flat_type': '5 ROOM',
  'floor_area_sqm': 113.0,
  'resale_price': 665000.0},
 {'town': 'CHOA CHU KANG',
  'flat_type': '4 ROOM',
  'floor_area_sqm': 93.0,
  'resale_price': 450000.0},
 {'town': 'HOUGANG',
  'flat_type': 'EXECUTIVE',
  'floor_area_sqm': 143.0,
  'resale_price': 688000.0},
 {'town': 'JURONG EAST',
  'flat_type': '3 ROOM',
  'floor_area_sqm': 67.0,
  'resale_price': 375000.0},
 {'town': 'PUNGGOL',
  'flat_type': '4 ROOM',
  'floor_area_sqm': 

In [11]:
# List the distinct flat_type values present in the data.
data_2021["flat_type"].unique()

array(['4 ROOM', '3 ROOM', '5 ROOM', 'EXECUTIVE', '2 ROOM', '1 ROOM',
       'MULTI-GENERATION'], dtype=object)

# Defining the objective/fitness function

In [12]:
# Objective function
def fitness_function(solution, catalog=None, budget=6000000, min_floor_area=80):
    """Score a candidate selection of properties (higher is better).

    Every selected property (flag equal to 1) contributes three independent
    +1 / -1 votes to the score:

    * +1 if its town is one of the client's preferred towns, else -1
    * +1 if its flat type is one of the preferred flat types, else -1
    * +1 if its floor area is at least ``min_floor_area``, else -1

    Args:
        solution: Sequence of 0/1 flags; ``solution[i]`` refers to ``catalog[i]``.
        catalog: Sequence of property dicts with the keys ``'town'``,
            ``'flat_type'``, ``'floor_area_sqm'`` and ``'resale_price'``.
            Defaults to the notebook-level ``items`` list.
        budget: Maximum allowed sum of ``'resale_price'`` over the selected
            properties (default 6,000,000).
        min_floor_area: Smallest floor area that still earns the +1 vote
            (default 80).

    Returns:
        The accumulated score when the total price is within ``budget``,
        otherwise 0.

    NOTE(review): the number of selected properties is not checked here, and
    an over-budget selection (0) outranks a within-budget selection whose
    score is negative. Confirm whether both are intended.
    """
    if catalog is None:
        # Fall back to the notebook-level list so existing one-argument calls keep working.
        catalog = items

    preferred_flat_types = {"4 ROOM", "5 ROOM", "3 ROOM", "EXECUTIVE"}
    preferred_towns = {
        'BUKIT MERAH', 'ANG MO KIO', 'TAMPINES', 'YISHUN', 'PASIR RIS',
        'SENGKANG', 'GEYLANG', 'KALLANG/WHAMPOA', 'PUNGGOL', 'BEDOK',
    }

    total_score = 0
    total_price = 0
    for i, flag in enumerate(solution):
        if flag != 1:
            continue
        flat = catalog[i]

        # Location vote
        total_score += 1 if flat['town'] in preferred_towns else -1
        # Flat-type vote
        total_score += 1 if flat['flat_type'] in preferred_flat_types else -1
        # Floor-area vote
        total_score += 1 if flat['floor_area_sqm'] >= min_floor_area else -1

        total_price += flat['resale_price']

    # Selections that exceed the budget are worth nothing to the client.
    return total_score if total_price <= budget else 0

# Defining the Selection Function

In [13]:
# Selection
def selection(population, scores, k=2):
    """Tournament selection: return the best of ``k`` randomly drawn individuals.

    One index is drawn first, then ``k - 1`` further indices are drawn (with
    replacement). A later draw replaces the current winner only when its score
    is strictly greater, so ties keep the earlier pick. The returned object is
    the element of ``population`` itself, not a copy.
    """
    pool_size = len(population)
    champion = randint(pool_size)
    challengers = randint(0, pool_size, k - 1)
    for challenger in challengers:
        champion = challenger if scores[challenger] > scores[champion] else champion
    return population[champion]

# Defining the CrossOver function

In [14]:
from numpy.random import rand
# Crossover
def crossover(parent1, parent2, probability_crossover):
    """Single-point crossover of two parent genes.

    With probability ``probability_crossover`` a cut point ``pt`` is drawn and
    the tails of the two parents are swapped; otherwise the children are plain
    copies of the parents. The parents themselves are never modified.

    Args:
        parent1: First parent, a list of genes.
        parent2: Second parent, a list of genes.
        probability_crossover: Chance of recombining the pair.

    Returns:
        A two-element list ``[child1, child2]``.
    """
    # Children start out as copies of the parents.
    child1, child2 = parent1.copy(), parent2.copy()
    # A cut needs at least one gene on each side; shorter genes are returned as copies.
    if len(parent1) >= 2 and rand() < probability_crossover:
        # numpy's randint excludes the upper bound, so pt is drawn from
        # 1 .. len(parent1) - 1 and both segments are always non-empty.
        pt = randint(1, len(parent1))
        # Perform breeding
        child1 = parent1[:pt] + parent2[pt:]
        child2 = parent2[:pt] + parent1[pt:]
    return [child1, child2]

# Defining the mutation function

In [15]:
# Mutation
def mutation(gene, probability_mutation):
    """Flip genes of ``gene`` in place; nothing is returned.

    One random number is drawn per position, and the value at that position
    is replaced by ``1 - value`` whenever the draw is below
    ``probability_mutation``.
    """
    for position, bit in enumerate(gene):
        if rand() < probability_mutation:
            # flip the characteristic
            gene[position] = 1 - bit
            
def get_items(optimal_solution):
    """Return the records from the notebook-level ``items`` list that the solution selects.

    A record is included when the flag at the same position in
    ``optimal_solution`` equals 1; the original order is preserved.
    """
    return [
        items[position]
        for position, flag in enumerate(optimal_solution)
        if flag == 1
    ]

# Evolutionary process

In [16]:
# Start from the first individual so that best_gene is always an actual gene
# (and best_score its real fitness), even if no later individual improves on it.
best_gene = population[0]
best_score = fitness_function(best_gene)


# Iterate throughout each generation
for gen in range(n_iterations):
    print("Generation {}".format(gen))

    # Evaluation: one fitness value per individual, in population order
    scores = [fitness_function(p) for p in population]

    # Check for a new best solution (strictly better than the best seen so far)
    found = False
    for candidate, score in zip(population, scores):
        if score > best_score:
            best_score, best_gene = score, candidate
            print("new best score {}".format(best_score))
            found = True
    if not found:
        # Nothing improved in this generation; report the standing best instead.
        print("best score so far {}".format(best_score))

    # Pick as many parents as there are individuals
    parents = [selection(population, scores) for _ in range(n_population)]

    # Create the next generation.
    # Parents are consumed in pairs (i, i+1), so n_population is expected to be even.
    children = []
    for i in range(0, n_population, 2):
        # Get selected parents
        parent1, parent2 = parents[i], parents[i+1]
        # Crossover and mutation
        for child in crossover(parent1, parent2, probability_crossover):
            mutation(child, probability_mutation)
            children.append(child)

    # Replace the population
    population = children
print("Timeline over...")
print("Best Gene: {}".format(best_gene))
print("Score: {}".format(best_score))
optimized_items = get_items(best_gene)
print("Items:")
for item in optimized_items:
    print(item)

Generation 0
new best score 18
new best score 22
new best score 24
Generation 1
new best score 25
Generation 2
new best score 27
new best score 28
Generation 3
new best score 28
Generation 4
new best score 28
Generation 5
new best score 28
Generation 6
new best score 28
Generation 7
new best score 30
Generation 8
new best score 30
Generation 9
new best score 30
Timeline over...
Best Gene: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0