In [None]:
!pip install pymupdf unidecode fuzzywuzzy
# !sudo apt install tesseract-ocr
# !pip install pytesseract
!pip install -q -U google-generativeai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.1/163.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m717.3/717.3 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h

#Final Code

In [None]:
import fitz
# import pytesseract
# from PIL import Image
# from io import BytesIO
import pandas as pd
from unidecode import unidecode
import re
import numpy as np
from fuzzywuzzy import process, fuzz
from datetime import datetime
from dateutil import relativedelta
import google.generativeai as genai
from google.colab import userdata
# Look up the value stored under the name 'GeminiAPI' through google.colab's userdata helper
GOOGLE_API_KEY=userdata.get('GeminiAPI')
# Hand that key to the google.generativeai client
genai.configure(api_key=GOOGLE_API_KEY)
# Model handle used by the later cells to generate and to grade the interview questions
model = genai.GenerativeModel('gemini-1.5-flash')

In [None]:
PDF = fitz.open('CV.pdf')

# Map every page number (starting at 1) to the 'blocks' list of that page's dict-style text dump
blocks_dict = {}
for page_num, page in enumerate(PDF, start=1):
  blocks_dict[page_num] = page.get_text('dict')['blocks']

# Flatten the blocks into one record per span: bounding box, cleaned text, all-caps flag,
# bold flag (taken from the font name), font name and font size
spans = []

for blocks in blocks_dict.values():
  for block in blocks:
    if block['type'] != 0: # Only blocks of type 0 are walked for lines and spans
      continue

    for line in block['lines']:
      for span in line['spans']:
        x0, y0, x1, y1 = span['bbox']
        font_size = span['size']
        span_font = span['font']

        # Transliterate to ASCII, trim, and collapse every whitespace run to one space
        text = re.sub(r'\s+', ' ', unidecode(span['text'].strip()))

        is_bold = "bold" in span_font.lower()
        is_upper = text.isupper()

        # Keep only spans that carry visible text and a font size above 1
        if font_size > 1 and text.replace(" ", "") != "":
          spans.append((x0, y0, x1, y1, text, is_upper, is_bold, span_font, font_size))

# Converting spans to a dataframe
span_df = pd.DataFrame(
  spans,
  columns=['x0', 'y0', 'x1', 'y1', 'text', 'is_upper', 'is_bold', 'span_font', 'font_size'],
)

# Score each span: its rounded font size, plus one point if bold and one point if all upper-case
span_scores = [
  round(font_size) + (1 if is_bold else 0) + (1 if is_upper else 0)
  for font_size, is_bold, is_upper in zip(span_df['font_size'], span_df['is_bold'], span_df['is_upper'])
]

# Distinct scores (ascending) and how many spans have each of them
values, freq = np.unique(span_scores, return_counts=True)

# The most frequent score is treated as the paragraph score; on a tie the lowest score wins
p_size = values[freq.argmax()]

# Tag every score relative to the paragraph score, walking from the highest score down:
# scores above it become h1, h2, ... and scores below it become s1, s2, ...
hx = 0 # Number of header levels handed out
sx = 0 # Number of smaller-text levels handed out
fontSize_tags = {} # score -> tag

for value in values[::-1]:
  if value > p_size:
    hx += 1
    fontSize_tags[value] = f'h{hx}'
  elif value < p_size:
    sx += 1
    fontSize_tags[value] = f's{sx}'
  else:
    fontSize_tags[value] = 'p'

# Translate each span's score into its tag and store the tags on the dataframe
span_tags = [fontSize_tags[score] for score in span_scores]
span_df['tag'] = span_tags

# Structuring the dataframe based on tags.
# Every header span becomes {tag: text, 'content': [body]}, where body is the text of all the
# non-header spans that follow it (up to the next header) joined with single spaces.
# Spans that appear before the first header are stored on their own as {tag: text}.
PDF_structure = []
open_header = None # Header entry that is currently collecting body text, if any

for tag, text in zip(span_df['tag'], span_df['text']):
  if 'h' in tag: # A new header closes the previous one
    open_header = {
        tag : text,
        'content' : [""]
    }
    PDF_structure.append(open_header)
  elif open_header is not None: # p / s text that belongs to the open header
    body = open_header['content'][0]
    open_header['content'][0] = body + " " + text if body else text
  else: # p / s text that is not the content of any header
    PDF_structure.append({tag : text})

# Storing lower header level inside of higher levels.
# Each pass walks PDF_structure from the last entry towards the first and folds an entry into
# the 'content' list of the entry right before it when its tag sorts after that entry's tag
# (e.g. 'h2' after 'h1'). hx - 1 passes are made.
# NOTE(review): tags are compared as strings, so 'h10' would sort before 'h2' - confirm that no
# document produces ten or more header levels.
headers_tmp = [] # Run of consecutive same-tag entries waiting to be nested together or put back

for _ in range(hx - 1): # Loop for each header level
  i = len(PDF_structure) - 1

  # Stop at index 1: entry 0 has no predecessor, and PDF_structure[0 - 1] would wrap around to
  # the LAST entry, nesting the first entry under it or dropping it into headers_tmp.
  while i > 0:
    current_tag = next(iter(PDF_structure[i]))
    previous_tag = next(iter(PDF_structure[i - 1]))

    # Only entries that own a 'content' list can receive children; plain {tag: text}
    # entries (text found before the first header) cannot.
    if current_tag > previous_tag and 'content' in PDF_structure[i - 1]:
      # Nest this entry, together with any same-level siblings collected after it,
      # under the previous entry. reversed() restores document order because the
      # siblings were collected while walking backwards.
      headers_tmp.append(PDF_structure[i])
      PDF_structure[i - 1]['content'].extend(reversed(headers_tmp))
      del PDF_structure[i]
      headers_tmp = []

    elif current_tag == previous_tag and current_tag != 'h1': # Same level: hold it back for now
      headers_tmp.append(PDF_structure[i])
      del PDF_structure[i]

    elif headers_tmp: # No parent found for the held-back run: put it back right after entry i
      PDF_structure[i + 1:i + 1] = reversed(headers_tmp)
      headers_tmp = []

    i -= 1

  # A run that reached the first entry has no parent either: restore it after entry 0 so that
  # nothing is lost and nothing leaks into the next pass.
  if headers_tmp:
    PDF_structure[1:1] = reversed(headers_tmp)
    headers_tmp = []

# Typical header wordings for each section that has to be located in the CV.
# Keys name the section; each list holds lower-case phrasings to match real headers against.
headers = {
    "work_experience": [
        'working experience',
        'professional experience',
        'employment history',
        'experience',
        'work experience',
        'hands-on experience',
        'practical experience',
        'work record',
        'field experience',
        'employment record',
        'job experience',
        'professional background',
        'career history',
        'expertise',
        'past experience',
    ],
    "skills": [
        'skills',
        'hard skills',
        'technical skills',
        'work skills',
        'abilities',
        'knowledgeable in',
        'competencies',
        'strengths',
        'capabilities',
    ],
    "Education": [
        'education',
        'education history',
        'educational background',
        'study',
        'academic background',
        'university',
        'graduation',
    ],
}

def search_headers(data, target_header):
  """Return the text of the section whose header text equals ``target_header``.

  ``data`` is a list of dicts. A header entry looks like
  ``{tag: header_text, 'content': [body_text, child_entry, ...]}``: the first
  value is the header text, ``content[0]`` is its own body text and any further
  elements are nested entries. Entries without a ``'content'`` key (plain
  ``{tag: text}`` dicts) are tolerated: they are never treated as the target,
  and when nested under the target only their text is emitted.

  The result is ``header_text + "\\n" + body_text`` for the matching header,
  followed by ``"\\n\\n" + header_text + "\\n" + body_text`` for every entry
  nested below it (depth-first, in list order). An empty string is returned
  when nothing matches.

  Every matching entry in the top-level list is appended to the result, while
  the descent into nested entries stops at the first match.
  """
  results = ""
  found = False

  def recursive_search(entries): # Find the target_header
    nonlocal found, results

    for header_item in entries:
      content = header_item.get('content')
      if not content: # Plain text entry: nothing to match and nothing to descend into
        continue

      title = next(iter(header_item.values()))
      if title == target_header: # If target_header is found
        results += title + "\n" + content[0]
        found = True
        extract_content(content[1:])
      else: # Look for target_header among the nested entries until it is found
        for child in content[1:]:
          if found:
            break
          recursive_search([child])

  def extract_content(entries): # Extract everything nested below target_header
    nonlocal results

    for inside_item in entries:
      title = next(iter(inside_item.values()))
      content = inside_item.get('content')
      if not content: # Plain text entry: it only has its own text
        results += "\n\n" + title
        continue

      results += "\n\n" + title + "\n" + content[0]
      extract_content(content[1:])

  recursive_search(data)
  return results

# To find the section that has the required data (skills, education, and experience).
# Each variable stays "" when the CV has no header-tagged span to match against.
work_exp = ""
skills = ""
education = ""

# The header texts are the same for every section, so collect them once
header_texts = [text for tag, text in zip(span_df['tag'], span_df['text']) if 'h' in tag]

for section_name, samples in headers.items(): # One lookup per required section
  if not header_texts: # Nothing to score: max() below would raise on an empty list
    break

  # Score every header against the section's sample wordings
  header_scores = []
  for header_text in header_texts:
    # process.extract returns (choice, score) pairs, 5 of them by default
    scores = [score[1] for score in process.extract(header_text, samples, scorer=fuzz.token_sort_ratio)]
    avg = sum(scores)/len(scores)
    header_scores.append((header_text, avg)) # Store the average for each header in a tuple

  word_score = max(header_scores, key=lambda pair: pair[1]) # Get the header with the highest score
  lookfor = search_headers(PDF_structure, word_score[0]) # Get the header section using the recursive function

  # Store each section into a separate variable
  if section_name == 'work_experience':
    work_exp = lookfor
  elif section_name == 'skills':
    skills = lookfor
  else:
    education = lookfor

# Regular expressions to find the date ranges: "<date> - <date>", "<date> to <date>" and
# "<date> - Present", where <date> is one or two groups of 1-4 digits
date_patterns = [
    r'\d{1,4}\s?[-,/]?\s?(?:\d{1,4})?\s?[-,]\s?\d{1,4}\s?[-,/]?\s?(?:\d{1,4})?',
    r'\d{1,4}\s?[-,/]?\s?(?:\d{1,4})?\s?[Tt][Oo]\s?\d{1,4}\s?[-,/]?\s?(?:\d{1,4})?',
    r'\d{1,4}\s?[-,/]?\s?(?:\d{1,4})?\s?[-,]\s?[Pp][Rr][Ee][Ss][Ee][Nn][Tt]'
    ]

current_year = datetime.now().year
current_month = datetime.now().month

# Extract the date ranges that fit the formats in date_patterns from work_exp
dates = []
for pattern in date_patterns:
    dates.extend(re.findall(pattern, work_exp))

# Calculate total years of experience
sum_all = 0
sum_years = 0
sum_months = 0

for date in dates:
  # Split the match into its number / "Present" tokens. A match may end with a separator
  # (e.g. "2019-2021, "), which leaves an empty token behind; those are dropped.
  tokens = [token.strip() for token in re.split(r'\s?[-,/]\s?|[Tt][Oo]', date)]
  tokens = [token for token in tokens if token]

  try:
    numbers = []
    for token in tokens:
      if re.search(r'[Pp][Rr][Ee][Ss][Ee][Nn][Tt]', token): # Replace the word "Present" with the current date
        # A first number of one or two digits means the range is written month-first
        if len(str(numbers[0])) <= 2:
          numbers.extend([current_month, current_year])
        else:
          numbers.extend([current_year, current_month])
      else:
        numbers.append(int(token))

    if len(numbers) == 4: # Both ends carry a month and a year
      if len(str(numbers[0])) <= 2: # mm/yyyy
        datetime1 = datetime(numbers[1], numbers[0], 1)
        datetime2 = datetime(numbers[3], numbers[2], 1)
      else: # yyyy/mm
        datetime1 = datetime(numbers[0], numbers[1], 1)
        datetime2 = datetime(numbers[2], numbers[3], 1)
    else: # yyyy (no month): only the first two numbers are used
      datetime1 = datetime(numbers[0], 1, 1)
      datetime2 = datetime(numbers[1], 1, 1)
  except (ValueError, IndexError):
    # The patterns are loose and also match text that is not a date range (tokens that are not
    # a single number, month 13, year 0, ...). Skip such a match instead of aborting the run.
    continue

  # Always subtract the earlier date from the later one
  if datetime2 > datetime1:
    time_difference = relativedelta.relativedelta(datetime2, datetime1)
  else:
    time_difference = relativedelta.relativedelta(datetime1, datetime2)

  # Sum the years and the months separately; they are combined after the loop
  sum_years += time_difference.years
  sum_months += time_difference.months

# NOTE(review): a match such as "12-2020" is read as the years 12 and 2020 by the branch
# above - consider restricting plausible year values.
sum_all = sum_years + round(sum_months/12)

# Clean skills: turn newlines and asterisks into spaces, then collapse whitespace runs to one space
for pattern in (r'\n', r'[*]', r'\s+'):
  skills = re.sub(pattern, ' ', skills)


In [None]:
# Ask which position the interview is for, then have Gemini write the questions for that
# position, taking the skills extracted from the CV into account
position = input("Enter the position you want to generate questions for: ")
question_request = f"Generate 10 cognitive questions about {position} position, and give it a number from 1 to 3 at the end of the question between () the based on the difficulty of the question (where 3 is more difficult than 1) taking into account the following {skills}"
prompt = model.generate_content(question_request)

Enter the position you want to generate questions for: Senior Software Developer


In [None]:
# Extract the questions from the model's reply and prompt the user to answer them.
# A question is a line that starts with "<number>." and runs up to the first "(<digits>)" on that
# line. The pattern is anchored with ^ in MULTILINE mode rather than a leading "\n", so a
# question on the very first line of the reply is picked up as well.
matches = re.findall(r'^(\d+\..*?\(\d+\))', prompt.text, flags=re.MULTILINE)
answers = [] # Store (question, answer) pairs
print('Please answer the following questions:')
for question in matches:
  answer = input(question)
  answers.append((question, answer))


Please answer the following questions:
1. **Describe a situation where you had to debug a complex software issue in a large codebase. How did you approach the problem, and what tools or techniques did you employ? (2)In a previous role, I encountered a complex software issue in a large codebase where a critical feature was intermittently failing. I approached the problem by first gathering detailed logs and error messages to pinpoint when and where the issue occurred. Using debugging tools like breakpoints in the IDE, I traced the execution flow and examined variable values to identify anomalies. Collaborating with team members, we reviewed the code together, conducting peer reviews and brainstorming sessions to explore potential causes and solutions. Ultimately, isolating the root cause required meticulous attention to detail and systematic testing of hypotheses until we successfully resolved the issue.
2. **Explain the trade-offs involved in choosing between a relational database (MS 

[('1. **Describe a situation where you had to debug a complex software issue in a large codebase. How did you approach the problem, and what tools or techniques did you employ? (2)',
  'In a previous role, I encountered a complex software issue in a large codebase where a critical feature was intermittently failing. I approached the problem by first gathering detailed logs and error messages to pinpoint when and where the issue occurred. Using debugging tools like breakpoints in the IDE, I traced the execution flow and examined variable values to identify anomalies. Collaborating with team members, we reviewed the code together, conducting peer reviews and brainstorming sessions to explore potential causes and solutions. Ultimately, isolating the root cause required meticulous attention to detail and systematic testing of hypotheses until we successfully resolved the issue.'),
 ('2. **Explain the trade-offs involved in choosing between a relational database (MS SQL) and a NoSQL databas

In [None]:
# Send the collected (question, answer) pairs back to Gemini and ask it for the grades only
grading_request = f"Provide ONLY the total grade out of 100 and the grades for the following question, answer pairs. Take into account the numbers (1) to (3) the higher the number the higher the grade that make up the total 100 grade, and keep in mind the the questions with the same number (1), (2), (3) must be marked out of the same grade: {answers}"
prompt = model.generate_content(grading_request)

In [None]:
# Create a txt file and write the results to it: the three extracted CV sections, the
# experience total, every question with its answer, and the grades returned by the model.
# UTF-8 is requested explicitly because the text comes from user input and from the model,
# so writing it must not depend on the platform's default encoding.
with open('CV_Report.txt', 'w', encoding='utf-8') as f:
    f.write("- " + work_exp + '\n\n')
    f.write("- " + skills + '\n\n')
    f.write("- " + education + '\n\n')
    f.write(f"- The candidate has: {sum_all} years of experience and is applying for the {position} position\n\n")
    f.write("- Candidate's questions and answers:\n\n")
    for q,a in answers:
      f.write(f"Q:{q} \n A:{a}\n")
    f.write(f"\n\n- Candidate's grades:\n {prompt.text}")