In [1]:
#importing the libraries
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import random
import copy
import math
from IPython.display import display
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the training CSV into a pandas DataFrame. The path is relative to the
# notebook's working directory. index_col=False asks pandas not to use the
# first CSV column as the row index.
train_data = pd.read_csv('train.csv', index_col=False)

In [None]:
"""DATA PRE-PROCESSING"""

In [3]:
def find_null_values(train_data):
    """Map every column that has missing values to its fraction of null rows.

    Columns without any null entry are left out of the returned dictionary.
    The fraction is the column's null count divided by the number of rows.
    """
    row_count = train_data.shape[0]
    null_fractions = {}
    for column_name in train_data.columns:
        # count the nulls once and reuse the count for the ratio
        missing = train_data[column_name].isnull().sum()
        if missing != 0:
            null_fractions[column_name] = missing / row_count
    return null_fractions

In [4]:
def remove_null_valued_columns(train_data,null_val_dict):
    """Drop mostly-empty columns and impute the remaining missing values.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to clean. It is copied first, so the caller's frame is never
        modified (the previous version mutated it only when no column happened
        to be dropped).
    null_val_dict : dict
        Maps a column name to its fraction of null rows, as produced by
        find_null_values.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * columns with more than 50% nulls are removed,
        * numeric columns have their nulls replaced by the column mean,
        * every other column is forward-filled and then back-filled.
    """
    train_data = train_data.copy()
    for col, null_fraction in null_val_dict.items():
        if null_fraction > 0.5:
            # more than half of the rows are missing: the column is dropped
            train_data = train_data.drop(col, axis=1)
        elif pd.api.types.is_numeric_dtype(train_data[col]):
            # numeric column: replace nulls with the column mean
            train_data[col] = train_data[col].fillna(train_data[col].mean())
        else:
            # Non-numeric column: copy the previous row's value forward.
            # Series.ffill() replaces the deprecated fillna(method='ffill').
            # A null in the very first row has no predecessor, so a back-fill
            # pass covers it; otherwise the leftover NaN would later be mixed
            # with strings when the column is label-encoded.
            train_data[col] = train_data[col].ffill().bfill()
    return train_data

In [5]:
# Author's note: some columns of the dataset are strongly correlated with each
# other, e.g. OverallQual, GarageCars, GarageArea, TotalBsmtSF, 1stFlrSF and
# GrLivArea. The function below removes outlier rows based on a few of them.
def remove_outliers(train_data):
    """Remove outlier rows from ``train_data`` in place and return the frame.

    A row is treated as an outlier when
    * GrLivArea is above 4000, or
    * SalePrice is above 700000 and any of 1stFlrSF > 3000,
      GarageArea > 1200 or TotalBsmtSF > 4000 holds.

    The previous version called ``drop`` for the last three rules without
    ``inplace=True`` and without using the returned frame, so only the
    GrLivArea rule had any effect. All rules are now combined into one mask
    and applied with a single in-place drop, which keeps the original
    contract: the caller's frame is modified and the same object is returned.
    """
    high_price = train_data['SalePrice'] > 700000
    is_outlier = (
        (train_data['GrLivArea'] > 4000)
        | ((train_data['1stFlrSF'] > 3000) & high_price)
        | ((train_data['GarageArea'] > 1200) & high_price)
        | ((train_data['TotalBsmtSF'] > 4000) & high_price)
    )
    train_data.drop(train_data[is_outlier].index, inplace=True)
    return train_data

In [6]:
#function to perform label encoding on the data items
def label_encoding(train_data):
    """Label-encode every object-dtype column of ``train_data``.

    Each object column is replaced by the output of a fresh
    ``LabelEncoder().fit_transform`` on that column; the encoders themselves
    are not kept. The frame is modified in place and also returned. Columns
    of any other dtype are left untouched.
    """
    object_columns = [
        name for name in train_data.columns
        if train_data[name].dtypes == 'object'
    ]
    for name in object_columns:
        encoder = LabelEncoder()
        train_data[name] = encoder.fit_transform(train_data[name])
    return train_data

In [None]:
"""PART 1 OF THE ASSIGNMENT"""

In [7]:
# Module-level models shared by the functions of this part. train_classifier
# below fits only rf_reg.
linear_regression = LinearRegression()
rf_reg = RandomForestRegressor()


def train_classifier(train_data):
    """Fit the module-level random forest regressor ``rf_reg``.

    ``SalePrice`` is the target; every other column of ``train_data`` is used
    as a feature. Nothing is returned; the fitted state lives in ``rf_reg``.
    """
    feature_columns = train_data.columns.drop('SalePrice')
    features = train_data.loc[:, feature_columns]
    target = train_data['SalePrice']
    rf_reg.fit(features, target)

In [8]:
# Build one synthetic individual whose column values are each taken from an
# independently sampled row of the module-level train_data.
def get_random_from_population(col_name):
    """Create a one-row DataFrame of randomly sampled column values.

    For the j-th entry of ``col_name`` a random row of ``train_data`` is drawn
    and the value at position j of that row is kept, so the individual mixes
    values from up to ``len(col_name)`` different rows. Positions are those of
    ``train_data``'s own columns, so ``col_name`` is expected to list them in
    the same order.
    """
    sampled_values = []
    for position, _ in enumerate(col_name):
        # one fresh random row per column; only a single value of it is used
        random_row = train_data.sample().values[0]
        sampled_values.append(random_row[position])
    return pd.DataFrame([sampled_values], columns=col_name)

#function to get 10 random populations
def get_random_population(col_name):
    """Return one population: a list of 10 random one-row individuals."""
    return [get_random_from_population(col_name) for _ in range(10)]
#function to create n random populations
def create_random_population(col_name):
    """Return a list of 5 random populations built by get_random_population."""
    return [get_random_population(col_name) for _ in range(5)]

In [9]:
#function to calculate the mean squared error
def calculate_MSE(predicted_salesprice,sales_price):
    """Mean squared difference between a target price and each prediction.

    ``sales_price`` is a single target value that is compared against every
    entry of ``predicted_salesprice``. An empty prediction list raises
    ZeroDivisionError.
    """
    squared_error_total = 0
    for prediction in predicted_salesprice:
        squared_error_total += (sales_price - prediction) ** 2
    return squared_error_total / len(predicted_salesprice)

#function to find fitness of a individual sample
def individual_fitness(pop):
    """Predict the sale price of a single individual with ``rf_reg``.

    The individual is label-encoded first (in place, via label_encoding), its
    ``SalePrice`` column is excluded from the features, and the first
    prediction returned by the random forest is handed back.
    """
    encoded = label_encoding(pop)
    feature_columns = encoded.columns.drop('SalePrice')
    predictions = rf_reg.predict(encoded.loc[:, feature_columns])
    return predictions[0]

#function to find the fitness for of a single population
def get_individual_fitness(population):
    """Return the predicted sale price of every individual in ``population``."""
    return [individual_fitness(member) for member in population]

#function to fitness value
def calculate_fitness(population,sales_price):
    """Fitness of one population with respect to a target price.

    The fitness is the mean squared error between ``sales_price`` and the
    predicted sale price of each individual, so lower values are better.
    """
    predictions = get_individual_fitness(population)
    return calculate_MSE(predictions, sales_price)

#function to find fitness for all the populations
def get_all_fitness(populations):
    """Return, for each population, the sum of its predicted sale prices.

    Note that this value is a plain sum of predictions; it is not measured
    against a target price the way calculate_fitness is.
    """
    return [sum(get_individual_fitness(group)) for group in populations]

In [10]:
#function to perform single point crossover
def crossover(randPopA, randPopB):
    """Single-point crossover of two equally long populations.

    A cut position is drawn uniformly from ``0 .. len(randPopA) - 1``.
    The first child takes the entries of ``randPopA`` before the cut and the
    entries of ``randPopB`` from the cut onwards; the second child is the
    mirror image. Example with the cut at 3:

        a --> 1 2 3 4 5          first child  --> 1 2 3 7 8
        b --> 4 5 6 7 8          second child --> 4 5 6 4 5

    The parents themselves are not modified; two new lists are returned.
    """
    length = len(randPopA)
    cut = random.randint(0, length - 1)
    child_a = [randPopA[k] if k < cut else randPopB[k] for k in range(length)]
    child_b = [randPopB[k] if k < cut else randPopA[k] for k in range(length)]
    return child_a, child_b

"""This function performs mutation on the population. The initial mutation rate is first selected that 
is the phenotype mutation rate. Then a genotype mutation rate is selected. if that G_mutation rate is less 
than Phenotype_mutation rate, then the genotype gets swapped."""
def mutate(pop, randPopA, PhenotypeMutationRate):
    """Randomly overwrite entries of ``randPopA`` with those of ``pop``.

    For every position shared by both lists a number is drawn uniformly from
    [0, 1]; when it is below ``PhenotypeMutationRate`` the entry of
    ``randPopA`` at that position is replaced by the entry of ``pop``.
    ``randPopA`` is modified in place and returned.
    """
    shared_length = min(len(randPopA), len(pop))
    for slot in range(shared_length):
        if random.uniform(0, 1) < PhenotypeMutationRate:
            randPopA[slot] = pop[slot]
    return randPopA

def performMutation(randPopA, randPopB,col_name):
    """Mutate two populations against one freshly generated random population.

    A single random population is created as the source of replacement
    individuals. Each of the two populations then gets its own mutation rate,
    drawn uniformly from [0, 1], and is mutated in place by ``mutate``.
    """
    donor_population = get_random_population(col_name)
    rate_for_a = random.uniform(0, 1)
    randPopA = mutate(donor_population, randPopA, rate_for_a)
    rate_for_b = random.uniform(0, 1)
    randPopB = mutate(donor_population, randPopB, rate_for_b)
    return randPopA, randPopB


In [11]:
"""This technique sorts the population on the basis of fitness value and returns the top 2 minimum populations"""
def elitist_wheel_selection(fitness,populations):
    """Sort the populations by ascending fitness and return the two best.

    Both lists are reordered in place so that ``fitness[k]`` keeps describing
    ``populations[k]``. The hand-written bubble sort was replaced by Python's
    built-in stable sort over the index positions, which produces the same
    ordering (ties keep their relative order) without comparing the
    populations themselves.

    Parameters
    ----------
    fitness : list
        Fitness value of each population; lower is better.
    populations : list
        Populations, parallel to ``fitness``.

    Returns
    -------
    tuple
        ``(best, second_best, fitness)`` where ``fitness`` is the same list
        object, now sorted.

    Raises
    ------
    ValueError
        If the two lists differ in length.
    IndexError
        If fewer than two populations are supplied.
    """
    if len(fitness) != len(populations):
        raise ValueError("fitness and populations must have the same length")
    order = sorted(range(len(fitness)), key=fitness.__getitem__)
    # slice assignment keeps the caller's list objects, as the in-place swap did
    fitness[:] = [fitness[k] for k in order]
    populations[:] = [populations[k] for k in order]
    return populations[0],populations[1],fitness

"""This function is used to add the newly created random population to the list of populations
This function works in a way that it first adds the random populations and their fitness values to the lists of populations
and fitness. It then sorts the list in ascending order of fitness value and removes the last 2 populations from the list
the two worst populations having the maximum fitness values is removed"""
def add_back_to_population(fitA, randPopA, fitB, randPopB, fitness,pop):
    """Insert two new populations and discard the two worst ones.

    The two candidates and their fitness values are appended, everything is
    ordered by ascending fitness, and the two entries with the highest
    fitness are removed, so the overall size stays the same. Both lists are
    updated in place. The hand-written bubble sort was replaced by Python's
    built-in stable sort over the index positions, which yields the same
    ordering.

    Parameters
    ----------
    fitA, fitB : number
        Fitness of the two new populations; lower is better.
    randPopA, randPopB : list
        The two new populations.
    fitness : list
        Fitness values of the current populations.
    pop : list
        Current populations, parallel to ``fitness``.

    Returns
    -------
    tuple
        ``(pop, fitness)``, the same list objects that were passed in.

    Raises
    ------
    ValueError
        If ``fitness`` and ``pop`` differ in length.
    """
    if len(fitness) != len(pop):
        raise ValueError("fitness and pop must have the same length")
    fitness.extend((fitA, fitB))
    pop.extend((randPopA, randPopB))
    order = sorted(range(len(fitness)), key=fitness.__getitem__)
    survivors = order[:-2]  # everything except the two highest fitness values
    fitness[:] = [fitness[k] for k in survivors]
    pop[:] = [pop[k] for k in survivors]
    return pop,fitness

In [15]:
#function to get the input of price threshold from the user
def get_input():
    """Ask the user for a number of price points, then for each price.

    Both the count and every price are parsed with ``int``, so non-numeric
    input raises ValueError. Returns the prices as a list of ints.
    """
    count = int(input("Enter the number of price points for which you wish to find house design options"))
    return [int(input("Enter price")) for _ in range(count)]

#function to create initial random population and compute its fitness value
def apply_initial_steps():
    """Collect the target prices and build the initial random populations.

    Returns a 4-tuple ``(price_list, populations, fitness, col_name)`` where
    ``col_name`` is the list of all ``train_data`` columns (``SalePrice``
    included), ``populations`` comes from create_random_population and
    ``fitness`` from get_all_fitness.
    """
    price_list = get_input()
    # every column of the dataset, SalePrice included, describes an individual
    col_name = train_data.columns.to_list()
    populations = create_random_population(col_name)
    fitness = get_all_fitness(populations)
    return price_list,populations,fitness,col_name

def display_results(best):
    """Render every individual of ``best`` with IPython's ``display``.

    Returns None.
    """
    for individual in best:
        display(individual)
    

"""working of genetic algorithm"""
def genetic_algorithm(population,fitness_,price_list,col_name,generations=20):
    """Run the genetic search once for every target price and show the winner.

    Parameters
    ----------
    population : list
        Initial populations (each a list of one-row DataFrames). A deep copy
        is evolved, so the argument itself is left untouched.
    fitness_ : list
        Kept for backward compatibility and no longer used. The values
        supplied by get_all_fitness are sums of predicted prices, whereas the
        children are scored with calculate_fitness (mean squared error against
        the target price). Mixing the two scales made every child look worse
        than every initial population, so children were dropped immediately
        and the same two parents were selected in every generation.
    price_list : list
        Target prices to search for.
    col_name : list
        Column names used to build random individuals for mutation.
    generations : int, optional
        Number of child generations per target price (default 20, the value
        that used to be hard-coded).

    For each target price the current populations are re-scored against that
    price, so fitness values from an earlier target are never carried over.
    The best population seen (initial or child) is then printed and displayed.
    """
    pop = copy.deepcopy(population)
    for target_price in price_list:
        # score every current population against this target on one scale
        fitness = [calculate_fitness(candidate, target_price) for candidate in pop]
        best_fitness, best_population = float('inf'), None
        for score, candidate in zip(fitness, pop):
            if score < best_fitness:
                best_fitness, best_population = score, candidate
        for _ in range(generations):
            parent_a, parent_b, fitness = elitist_wheel_selection(fitness, pop)  # selection
            child_a, child_b = crossover(parent_a, parent_b)  # crossover
            child_a, child_b = performMutation(child_a, child_b, col_name)  # mutation
            fit_a = calculate_fitness(child_a, target_price)
            fit_b = calculate_fitness(child_b, target_price)
            # the children replace the two worst populations (if they are better)
            pop, fitness = add_back_to_population(fit_a, child_a, fit_b, child_b, fitness, pop)
            if fit_a < best_fitness:
                best_fitness, best_population = fit_a, child_a
            if fit_b < best_fitness:
                best_fitness, best_population = fit_b, child_b
        print("Best solution for price ",target_price," is : ")
        # display_results returns None; wrapping it in print() emitted a stray "None"
        display_results(best_population)

In [16]:
# Data preprocessing: measure the null fraction per column, drop or impute
# the columns with missing values, remove outlier rows, then label-encode
# the object-dtype columns.
null_dict = find_null_values(train_data)
train_data = remove_null_valued_columns(train_data,null_dict)
train_data = remove_outliers(train_data)
train_data = label_encoding(train_data)
# Fit the Part 1 model. train_classifier fits rf_reg, the
# RandomForestRegressor, not a linear regression.
train_classifier(train_data)

In [17]:
# Prompt for the target prices, build the initial random populations with
# their fitness values, then run the genetic algorithm for every price.
price_list,populations,fitness,col_name = apply_initial_steps() 
genetic_algorithm(populations,fitness,price_list,col_name)

Enter the number of price points for which you wish to find house design options5
Enter price100000
Enter price150000
Enter price90000
Enter price75000
Enter price50000
Best solution for price  100000  is : 


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,734.0,60.0,3.0,73.0,11787.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2006.0,6.0,4.0,189000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,94.0,120.0,3.0,65.0,9100.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,11.0,2008.0,8.0,4.0,180000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1359.0,60.0,3.0,65.0,17920.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2010.0,8.0,5.0,180000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,283.0,50.0,3.0,78.0,8816.0,1.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,2007.0,8.0,1.0,200000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,406.0,20.0,4.0,70.0,8125.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,8.0,4.0,213500.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,588.0,90.0,4.0,73.0,8640.0,1.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,450.0,12.0,2009.0,8.0,4.0,187000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,930.0,50.0,3.0,73.0,9600.0,1.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2010.0,8.0,4.0,159500.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,979.0,160.0,3.0,90.0,10011.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2007.0,0.0,4.0,175000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,557.0,80.0,3.0,70.049958,11250.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,180.0,0.0,0.0,7.0,2010.0,8.0,4.0,123000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,889.0,20.0,3.0,65.0,13811.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,160.0,0.0,0.0,10.0,2008.0,8.0,4.0,116000.0


None
Best solution for price  150000  is : 


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1064.0,20.0,3.0,79.0,11999.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2009.0,8.0,4.0,309000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,69.0,60.0,3.0,55.0,2592.0,1.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,2009.0,8.0,4.0,200000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1024.0,20.0,3.0,80.0,10335.0,1.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2008.0,8.0,4.0,127000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1270.0,20.0,3.0,54.0,8777.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2009.0,8.0,4.0,109500.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,406.0,20.0,4.0,70.0,8125.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,8.0,4.0,213500.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,588.0,90.0,4.0,73.0,8640.0,1.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,450.0,12.0,2009.0,8.0,4.0,187000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,930.0,50.0,3.0,73.0,9600.0,1.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2010.0,8.0,4.0,159500.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,979.0,160.0,3.0,90.0,10011.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2007.0,0.0,4.0,175000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1179.0,50.0,3.0,87.0,8445.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,8.0,4.0,271000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,551.0,50.0,3.0,80.0,7560.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2006.0,8.0,4.0,202900.0


None
Best solution for price  90000  is : 


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,734.0,60.0,3.0,73.0,11787.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2006.0,6.0,4.0,189000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,94.0,120.0,3.0,65.0,9100.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,11.0,2008.0,8.0,4.0,180000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,712.0,60.0,3.0,60.0,8521.0,1.0,3.0,3.0,0.0,0.0,...,96.0,0.0,0.0,0.0,0.0,4.0,2009.0,8.0,4.0,124000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,283.0,50.0,3.0,78.0,8816.0,1.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,2007.0,8.0,1.0,200000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,406.0,20.0,4.0,70.0,8125.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,8.0,4.0,213500.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,588.0,90.0,4.0,73.0,8640.0,1.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,450.0,12.0,2009.0,8.0,4.0,187000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,991.0,50.0,3.0,70.049958,12615.0,1.0,1.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2009.0,8.0,4.0,130000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,979.0,160.0,3.0,90.0,10011.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2007.0,0.0,4.0,175000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,557.0,80.0,3.0,70.049958,11250.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,180.0,0.0,0.0,7.0,2010.0,8.0,4.0,123000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,532.0,20.0,1.0,32.0,15523.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2008.0,8.0,4.0,190000.0


None
Best solution for price  75000  is : 


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,734.0,60.0,3.0,73.0,11787.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2006.0,6.0,4.0,189000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,94.0,120.0,3.0,65.0,9100.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,11.0,2008.0,8.0,4.0,180000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1359.0,60.0,3.0,65.0,17920.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2010.0,8.0,5.0,180000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,283.0,50.0,3.0,78.0,8816.0,1.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,2007.0,8.0,1.0,200000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,406.0,20.0,4.0,70.0,8125.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,8.0,4.0,213500.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,588.0,90.0,4.0,73.0,8640.0,1.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,450.0,12.0,2009.0,8.0,4.0,187000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,930.0,50.0,3.0,73.0,9600.0,1.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2010.0,8.0,4.0,159500.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,979.0,160.0,3.0,90.0,10011.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2007.0,0.0,4.0,175000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,557.0,80.0,3.0,70.049958,11250.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,180.0,0.0,0.0,7.0,2010.0,8.0,4.0,123000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,889.0,20.0,3.0,65.0,13811.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,160.0,0.0,0.0,10.0,2008.0,8.0,4.0,116000.0


None
Best solution for price  50000  is : 


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,517.0,30.0,3.0,41.0,11796.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2010.0,8.0,0.0,239000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,941.0,120.0,3.0,60.0,9760.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,2007.0,8.0,3.0,142000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1246.0,20.0,3.0,70.049958,7577.0,1.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2009.0,6.0,4.0,200000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,781.0,120.0,3.0,70.049958,9786.0,1.0,0.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2009.0,8.0,4.0,256000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1452.0,20.0,3.0,52.0,8993.0,1.0,3.0,2.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,10.0,2010.0,8.0,4.0,185000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,588.0,90.0,4.0,73.0,8640.0,1.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,450.0,12.0,2009.0,8.0,4.0,187000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,930.0,50.0,3.0,73.0,9600.0,1.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2010.0,8.0,4.0,159500.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,450.0,20.0,3.0,35.0,11216.0,1.0,3.0,3.0,0.0,0.0,...,0.0,180.0,0.0,0.0,0.0,10.0,2006.0,8.0,4.0,141000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,557.0,80.0,3.0,70.049958,11250.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,180.0,0.0,0.0,7.0,2010.0,8.0,4.0,123000.0


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1252.0,50.0,3.0,30.0,3182.0,1.0,3.0,3.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2008.0,8.0,4.0,305000.0


None


In [None]:
"""PART 2 OF THE ASSIGNMENT"""

In [18]:
# Fresh module-level models for Part 2. train_classifier below fits only
# linear_regression.
linear_regression = LinearRegression()
rf_reg = RandomForestRegressor()


def train_classifier(train_data,col):
    """Fit the module-level ``linear_regression`` to predict column ``col``.

    ``col`` is the target; all remaining columns of ``train_data`` are the
    features. Nothing is returned; the fitted state lives in
    ``linear_regression`` and is overwritten by every call.
    """
    predictor_columns = train_data.columns.drop(col)
    predictors = train_data.loc[:, predictor_columns]
    target = train_data[col]
    linear_regression.fit(predictors, target)

In [19]:
#function to calculate the mean squared error
def calculate_MSE(predicted_salesprice,sales_price,gar_area,lot_area):
    """Average fitness of a population for a target price (lower is better).

    For each individual the score is the absolute distance between the
    target ``sales_price`` and its predicted price, minus the sum of its
    garage area and lot area, so a close price and large areas both lower
    the score. The per-individual scores are averaged.

    The previous version used the signed difference
    ``sales_price - prediction``. Predictions far above the target therefore
    produced large negative scores and were ranked best, although they miss
    the target price. Despite the function's name nothing is squared here;
    the name is kept because callers depend on it.

    Parameters
    ----------
    predicted_salesprice, gar_area, lot_area : sequence
        Parallel per-individual values.
    sales_price : number
        Target price.

    Raises
    ------
    ZeroDivisionError
        If ``predicted_salesprice`` is empty.
    """
    total = 0
    for prediction, garage, lot in zip(predicted_salesprice, gar_area, lot_area):
        total += abs(sales_price - prediction) - (garage + lot)
    return total / len(predicted_salesprice)

#function to find fitness of a individual sample
def individual_fitness(pop):
    """Return ``(sale_price, garage_area, lot_area)`` for one individual.

    ``pop`` is a one-row DataFrame describing the individual. It is
    label-encoded in place first (via label_encoding).

    * ``sale_price`` is predicted by ``linear_regression`` after fitting it on
      the module-level ``train_data`` with ``SalePrice`` as target.
    * ``garage_area`` and ``lot_area`` are read from the individual's own
      ``GarageArea`` and ``LotArea`` columns.

    The previous version called ``train_classifier(pop_, ...)``, i.e. it
    fitted the regression on the single row it was about to predict. A model
    fitted on one sample just reproduces that sample's target, so the
    "predicted" price was the individual's own randomly sampled SalePrice
    value and did not depend on its other attributes. The two area
    "predictions" likewise returned the individual's own values, which are now
    read directly.

    NOTE(review): the model is refitted on every call so this function does
    not rely on hidden kernel state; hoist the fit out of the loop if it
    shows up in profiling.
    """
    pop_ = label_encoding(pop)
    # learn price from the real training data, not from the individual itself
    train_classifier(train_data,'SalePrice')
    features = pop_.loc[: , pop_.columns.drop('SalePrice')]
    sale_price = linear_regression.predict(features)[0]

    gar_area = pop_['GarageArea'].iloc[0]
    lot_area = pop_['LotArea'].iloc[0]

    return sale_price,gar_area,lot_area

#function to find the fitness for of a single population
def get_individual_fitness(population):
    """Evaluate every individual and return three parallel lists.

    Returns ``(prices, garage_areas, lot_areas)``, each holding one value per
    individual of ``population`` as produced by individual_fitness.
    """
    prices, garage_areas, lot_areas = [], [], []
    for member in population:
        price, garage, lot = individual_fitness(member)
        prices.append(price)
        garage_areas.append(garage)
        lot_areas.append(lot)
    return prices, garage_areas, lot_areas

#function to fitness value
def calculate_fitness(population,sales_price):
    """Fitness of one population for a target price.

    Evaluates every individual with get_individual_fitness and combines the
    resulting prices, garage areas and lot areas through calculate_MSE.
    """
    prices, garage_areas, lot_areas = get_individual_fitness(population)
    return calculate_MSE(prices, sales_price, garage_areas, lot_areas)

#function to find fitness for all the populations
def get_all_fitness(populations):
    """Return one aggregate value per population.

    The value is the sum of all predicted prices plus the sum of all garage
    areas plus the sum of all lot areas of the population's individuals. It is
    not measured against a target price.
    """
    totals = []
    for group in populations:
        prices, garage_areas, lot_areas = get_individual_fitness(group)
        totals.append(sum(prices) + sum(garage_areas) + sum(lot_areas))
    return totals

In [23]:
#function to get the input of price threshold from the user
def get_input():
    """Read the number of price points and then each price from the user.

    Every answer is converted with ``int`` (ValueError on non-numeric input).
    Returns the list of prices. This repeats the Part 1 definition of the
    same name.
    """
    how_many = int(input("Enter the number of price points for which you wish to find house design options"))
    prices = []
    while len(prices) < how_many:
        prices.append(int(input("Enter price")))
    return prices

#function to create initial random population and compute its fitness value
def apply_initial_steps():
    """Collect the target prices and build the initial random populations.

    Returns ``(price_list, populations, fitness, col_name)``; ``col_name`` is
    the list of all ``train_data`` columns.
    """
    price_list = get_input()
    col_name = train_data.columns.to_list()
    populations = create_random_population(col_name)
    fitness = get_all_fitness(populations)
    return price_list,populations,fitness,col_name

def display_results(best):
    """Show each individual of ``best`` through IPython's ``display``; returns None."""
    for individual in best:
        display(individual)
    

"""working of genetic algorithm"""
def genetic_algorithm(population,fitness_,price_list,col_name,garage_area,lot_area,max_generations=200):
    """Evolve populations for every target price until they converge.

    Parameters
    ----------
    population : list
        Initial populations; a deep copy is evolved.
    fitness_ : list
        Kept for backward compatibility and no longer used: the values from
        get_all_fitness are plain sums, while children are scored with
        calculate_fitness against the target price, so the two are not
        comparable. The populations are re-scored per target price instead.
    price_list : list
        Target prices.
    col_name : list
        Column names used to build random individuals for mutation.
    garage_area, lot_area : number
        Accepted for backward compatibility. They were only used to compute a
        threshold that was never read.
    max_generations : int, optional
        Upper bound on generations per target price (default 200). The loop
        used to be ``while True`` and could run forever when the convergence
        test below never became true.

    A target price is considered converged once more than three populations
    share the fitness of one of the newest children. The best population seen
    for the target (initial or child) is then printed and displayed.
    """
    pop = copy.deepcopy(population)
    for target_price in price_list:
        # score the current populations on the same scale as the children
        fitness = [calculate_fitness(candidate, target_price) for candidate in pop]
        best_fitness, best_population = float('inf'), None
        for score, candidate in zip(fitness, pop):
            if score < best_fitness:
                best_fitness, best_population = score, candidate
        for generation in range(1, max_generations + 1):
            parent_a, parent_b, fitness = elitist_wheel_selection(fitness, pop)  # selection
            child_a, child_b = crossover(parent_a, parent_b)  # crossover
            child_a, child_b = performMutation(child_a, child_b, col_name)  # mutation
            fit_a = calculate_fitness(child_a, target_price)
            fit_b = calculate_fitness(child_b, target_price)
            pop, fitness = add_back_to_population(fit_a, child_a, fit_b, child_b, fitness, pop)
            if fit_a < best_fitness:
                best_fitness, best_population = fit_a, child_a
            # plain `if`, not `elif`: child B may beat a child A that just improved
            if fit_b < best_fitness:
                best_fitness, best_population = fit_b, child_b
            print("child population: ",generation)
            if fitness.count(fit_a) > 3 or fitness.count(fit_b) > 3:
                break
        print("Best solution for price ",target_price," is : ")
        # display_results returns None; wrapping it in print() emitted a stray "None"
        display_results(best_population)

In [24]:
# Part 2 starts again from the raw CSV and repeats the preprocessing steps:
# null-fraction scan, column drop / imputation, outlier removal and label
# encoding of the object-dtype columns.
train_data = pd.read_csv('train.csv', index_col=False)
null_dict = find_null_values(train_data)
train_data = remove_null_valued_columns(train_data,null_dict)
train_data = remove_outliers(train_data)
train_data = label_encoding(train_data)

In [None]:
# Prompt for the target prices and build the initial random populations, then
# run the Part 2 genetic algorithm. The maxima of the LotArea and GarageArea
# columns are passed through to genetic_algorithm.
price_list,populations,fitness,col_name = apply_initial_steps()
lot_area = train_data['LotArea'].max()
garage_area = train_data['GarageArea'].max()
genetic_algorithm(populations,fitness,price_list,col_name,garage_area,lot_area)

Enter the number of price points for which you wish to find house design options2
Enter price95000
Enter price78000
child population:  1
child population:  2
child population:  3
child population:  4
child population:  5
child population:  6
child population:  7
child population:  8
child population:  9
child population:  10
child population:  11
child population:  12
child population:  13
child population:  14
child population:  15
child population:  16
child population:  17
child population:  18
child population:  19
child population:  20
child population:  21
child population:  22
child population:  23
child population:  24
child population:  25
child population:  26
child population:  27
child population:  28
child population:  29
child population:  30
child population:  31
child population:  32
child population:  33
child population:  34
child population:  35
child population:  36
child population:  37
child population:  38
child population:  39
child population:  40
child populat

# 