In [1]:
import numpy as np
from deap import base, creator, tools, algorithms
import random
import pandas as pd

In [2]:
# Load the keyword table, then pull the three columns the optimizer needs out
# as plain Python lists (one entry per keyword, in file order).
# NOTE(review): this is an absolute, machine-specific path; it will only resolve
# on the original author's machine. Consider a path relative to the notebook.
df = pd.read_csv('C:/Users/Fathima/Desktop/Me/Trustline Digital/Genetic-Algorithm-For-Keyword-Optimization/Keyword-data.csv')
keywords, volumes, difficulties = (
    df[column].tolist()
    for column in ('Keywords', 'Volume', 'Keyword Difficulty')
)

In [3]:
# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

Unnamed: 0,Keywords,Volume,Keyword Difficulty
0,digital marketing agency dubai,3600,40
1,cms website development dubai,90,8
2,conversion rate optimization service in dubai,90,9
3,brochure design dubai,170,11
4,advertising agencies in dubai,1000,26


### Parameter Setting

In [4]:
# --- Fitness penalty (read by evaluate) --------------------------------------
# When the weighted total difficulty exceeds `difficulty_threshold`, evaluate()
# subtracts `penalty` times the excess from the weighted total volume.
# NOTE(review): the previewed 'Keyword Difficulty' values are in the 8-40 range,
# so a threshold of 0.75 is exceeded by almost any positive weighting — confirm
# the intended scale.
difficulty_threshold = 0.75
penalty = 1000

# --- Variation operators ------------------------------------------------------
alpha = 0.5        # blend factor handed to tools.cxBlend (crossover)
mu, sigma = 0, 1   # mean and standard deviation of the Gaussian mutation noise
indpb = 0.2        # probability that an individual gene is mutated

# --- Selection ----------------------------------------------------------------
tournsize = 3      # number of contestants per tournament in tools.selTournament

# --- Evolution loop -----------------------------------------------------------
pop_size = 50      # individuals in the initial population
ngen = 40          # generations to run
cxpb = 0.5         # probability of crossover
mutpb = 0.2        # probability of mutation

In [5]:
# Fitness type with a single objective; the positive weight (1.0) marks it as
# an objective to maximize in DEAP.
creator.create(
    "FitnessMax",
    base.Fitness,
    weights=(1.0,),
)

# An individual is a plain list of genes that carries a FitnessMax attribute.
creator.create(
    "Individual",
    list,
    fitness=creator.FitnessMax,
)

In [6]:
toolbox = base.Toolbox()

# Gene generator: each call draws one float from random.random().
toolbox.register("attr_float", random.random)

# Individual factory: one gene per keyword, wrapped in creator.Individual.
toolbox.register(
    "individual",
    tools.initRepeat,
    creator.Individual,
    toolbox.attr_float,
    n=len(keywords),
)

# Population factory: a list of individuals; the size is supplied at call time.
toolbox.register(
    "population",
    tools.initRepeat,
    list,
    toolbox.individual,
)

In [None]:
def evaluate(individual):
    """Score one candidate weighting of the keywords.

    The score is the dot product of ``individual`` with the module-level
    ``volumes``. If the dot product of ``individual`` with ``difficulties``
    exceeds ``difficulty_threshold``, ``penalty`` times the excess is
    subtracted from the score.

    Args:
        individual: sequence of numeric weights, positionally aligned with
            ``volumes`` and ``difficulties`` (same length required by np.dot).

    Returns:
        A one-element tuple ``(score,)`` — DEAP expects fitness values as a tuple.
    """
    weighted_volume = np.dot(individual, volumes)
    weighted_difficulty = np.dot(individual, difficulties)

    # Within the difficulty budget: the raw weighted volume is the fitness.
    if not weighted_difficulty > difficulty_threshold:
        return (weighted_volume,)

    # Over budget: charge `penalty` per unit of excess difficulty.
    excess = weighted_difficulty - difficulty_threshold
    return (weighted_volume - penalty * excess,)


# Register the genetic operators on the toolbox under the aliases
# "mate", "mutate", "select" and "evaluate"; the toolbox is later handed to
# algorithms.eaSimple in main(). Each row is
# (alias, operator, keyword arguments bound at registration time).
for _alias, _operator, _bound_kwargs in (
    ("mate", tools.cxBlend, {"alpha": alpha}),
    ("mutate", tools.mutGaussian, {"mu": mu, "sigma": sigma, "indpb": indpb}),
    ("select", tools.selTournament, {"tournsize": tournsize}),
    ("evaluate", evaluate, {}),
):
    toolbox.register(_alias, _operator, **_bound_kwargs)


In [None]:
def main():
    """Run one evolutionary search and return its results.

    Seeds Python's ``random`` module with 64, builds a population of
    ``pop_size`` individuals from ``toolbox`` and evolves it with
    ``algorithms.eaSimple`` using the module-level ``cxpb``, ``mutpb`` and
    ``ngen``. Per-generation avg/std/min/max of the fitness values are
    collected and printed (``verbose=True``).

    Returns:
        tuple: ``(pop, log, hof)`` — the population and logbook returned by
        ``eaSimple``, and the ``tools.HallOfFame(1)`` that was passed to it.
    """
    random.seed(64)

    population = toolbox.population(n=pop_size)
    best_keeper = tools.HallOfFame(1)

    # Statistics are computed over each individual's fitness tuple; the
    # registration order below is kept as avg, std, min, max.
    fitness_stats = tools.Statistics(lambda ind: ind.fitness.values)
    for label, reducer in (
        ("avg", np.mean),
        ("std", np.std),
        ("min", np.min),
        ("max", np.max),
    ):
        fitness_stats.register(label, reducer)

    population, logbook = algorithms.eaSimple(
        population,
        toolbox,
        cxpb=cxpb,
        mutpb=mutpb,
        ngen=ngen,
        stats=fitness_stats,
        halloffame=best_keeper,
        verbose=True,
    )
    return population, logbook, best_keeper

if __name__ == "__main__":
    # Run the search and report the best individual kept by the hall of fame.
    pop, log, hof = main()
    print("Best individual is %s\nwith fitness: %s" % (hof[0], hof[0].fitness))

    # Turn the best genes into weights: negative genes are clipped to 0, then
    # the remainder is rescaled so the weights sum to 1.
    best_weights = np.clip(hof[0], 0, None)
    weight_total = np.sum(best_weights)
    # Guard the division: if no gene is positive (or the sum is NaN) the
    # normalization would yield NaN weights and silently write them to the CSV.
    if not weight_total > 0:
        raise ValueError(
            "Best individual has no positive weights; cannot normalize to sum to 1."
        )
    best_weights = best_weights / weight_total
    print("Best weights:", best_weights)

    # Create output dataframe: one row per keyword with its normalized weight.
    result_df = pd.DataFrame({
        'Keyword': keywords,
        'Optimal Weight': best_weights
    })

    # Save to csv file (written relative to the current working directory).
    result_df.to_csv('Optimization_Output.csv', index=False)
    print("Results saved to Optimization_Output.csv")

gen	nevals	avg   	std    	min    	max     
0  	50    	-87122	19362.8	-126784	-43514.4
1  	29    	-69182.5	19424.3	-110502	-19652.1
2  	32    	-54274.4	20386.9	-98762.6	-1504.29
3  	39    	-35171.9	28258.1	-127377 	-401.337
4  	26    	-22033.9	28389.9	-121656 	606.987 
5  	28    	-7678.53	12507.6	-68181.8	606.987 
6  	36    	-2538.96	5884.37	-39835.5	606.987 
7  	28    	-594.38 	1696.32	-11713.5	993.657 
8  	29    	-4102.68	12842.9	-80429  	1454.49 
9  	22    	-2019.61	6359.43	-28326.9	1454.49 
10 	36    	-4241.33	19040.6	-125468 	1841.59 
11 	18    	722.071 	1918.61	-11198.4	1841.59 
12 	36    	-421.653	4447.88	-17464.3	1841.59 
13 	37    	-2713.89	11131.7	-54966.1	2517.75 
14 	28    	-1299.99	9244.21	-44593.1	2541.15 
15 	31    	-3585.87	18983.6	-116280 	2541.15 
16 	24    	-330.944	8071.85	-39306.1	2541.15 
17 	32    	-1213.37	9229.27	-47269.1	2600.79 
18 	34    	637.03  	4629.19	-24953.7	3083.71 
19 	22    	1637.65 	5074.09	-33637.1	3018.42 
20 	29    	1200.08 	5365.28	-30601.7	2909