<a href="https://colab.research.google.com/github/Ghiles1010/Wikipedia-Path-Finder/blob/master/Link_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load word embeddings

In [259]:
from gensim.models import KeyedVectors
import gensim.downloader 

model = gensim.downloader.load('glove-wiki-gigaword-200')

# Class to contain a node

In [266]:
class Node: 

  def __init__(self, name, depth, similarity, visited=False):
    self.name = name
    self.depth = depth
    self.similarity = similarity
    self.cost = 1 - similarity + depth * 0.2

# Text treatment

In [267]:
import urllib.request
import re


def get_link(title):
  return "https://en.wikipedia.org/wiki/"+"_".join(title.split())


def get_refs(article):
  """ returns references of an article """

  # fetch html code
  link = get_link(article)
  html = urllib.request.urlopen(link).read().decode('utf-8')

  # get the tiltles of the other articles
  references = re.findall("href=\"\/wiki/([A-Za-z0-9_]+?)\"", html)
  references = list(map(lambda x : " ".join(x.split("_")), references))

  return references

# Sub-Functions

In [268]:
import numpy as np


def get_vector(word):

  matrix = []
  for i in word.split():
    matrix.append( model[i] )
  
  return np.average(matrix, axis=0)



def get_distance(node, goal):
  
  vector_1 = get_vector(node)
  vector_2 = get_vector(goal)

  return np.dot(vector_1, vector_2) / (np.linalg.norm(vector_1) * np.linalg.norm(vector_2))


def is_goal(article, goal, depth):
  """ tells if it is the goal """

  references = get_refs(article)

  references = list(map(lambda x : x.lower(), references))
  
  # if it's a goal, we return the 
  for ref in references :
    if ref.lower() == goal.lower():
      return True, get_link(ref)

  i, length = 0, len(references)
  
  while i < length :

    if references[i] in model.vocab : 

      similarity = get_distance(references[i], goal)
      references[i] = Node(references[i], depth, similarity)
      i += 1
    
    else :

      del references[i]
      length -= 1
  
  return False, references



def combine(open, result):

  " Order closed and open lists, note that we are inserting in a sorted list"

  length = len(open)

  for res in result :
    index = 0
    
    while index < length and res.cost > open[index].cost :
      index += 1

    open = open[:index] + [res] + open[index:]
  
  
  return open


def gen_path(came_from, node, goal):
  path = [node]

  while came_from.get(node):
    parent = came_from.get(node)
    path = [parent] + path
    node = parent

  node_goal = Node(goal, parent.depth+1, 0)
  path.append(node_goal)
  return path



# A* Algorithm

In [269]:

def A_star(start, goal):

  """ the actual algorithm """


  came_from = {}

  node = Node(start, 1, 1)

  open = [node]
  closed = []

  while open :

    node = open[0]
    open = open[1:]

    if not ( node.name in closed ) :

      closed.append(node.name)
      found, result = is_goal(node.name, goal, node.depth)

      # memorise path
      for w in result : came_from[w] = node

      if found :
        path = gen_path(came_from, node, goal)
        return path, came_from

      # combine according to evaluation function
      open = combine(open, result)



def find_path(start, goal):

  import time

  """ Interface """
  start = start.lower()
  goal = goal.lower()

  top = time.time()
  path, nodes = A_star(start, goal)
  end = time.time()

  t = end - top

  print("Solution found in ", t, "seconds\n")
  for p in path:
    print(p.name, get_link(p.name))

# Test

In [270]:
find_path("Lionel Messi", 'plywood')

Solution found in  1.4419524669647217 seconds

lionel messi https://en.wikipedia.org/wiki/lionel_messi
isco https://en.wikipedia.org/wiki/isco
alicante https://en.wikipedia.org/wiki/alicante
anapa https://en.wikipedia.org/wiki/anapa
novorossiysk https://en.wikipedia.org/wiki/novorossiysk
steel https://en.wikipedia.org/wiki/steel
aluminium https://en.wikipedia.org/wiki/aluminium
furniture https://en.wikipedia.org/wiki/furniture
hardwood https://en.wikipedia.org/wiki/hardwood
plywood https://en.wikipedia.org/wiki/plywood
