In [None]:
# Requires the docx2txt package; uncomment the next line to install it.
#!pip install docx2txt
import docx2txt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Pull the plain text out of both Word documents.
job_description = docx2txt.process('Description.docx')
resume = docx2txt.process('Resume.docx')
content = [job_description, resume]

# Token-count vectors over a vocabulary fitted on both documents together.
cv = CountVectorizer()
count_matrix = cv.fit_transform(content)

# Pairwise cosine similarity; entry [1][0] compares the resume (row 1)
# with the job description (column 0).
mat = cosine_similarity(count_matrix)
print(f"Resume Matches by: {mat[1][0] * 100!s}%")

Code to judge whether two images depict the same person.
Returns True if the images are of the same person; otherwise returns False.
Author : Ankshuk Ray

Testing the functionality

In [None]:
import cv2
import numpy as np
from docx import Document
import fitz
import face_recognition
from skimage.io import imread
from skimage.transform import resize
import pdf2image
import logging
import matplotlib.pyplot as plt

def extract_images_docx(cv_doc):
    """Extract every decodable embedded image from a Word (.docx) document.

    Parameters
    ----------
    cv_doc
        Passed unchanged to ``Document(...)`` (python-docx).

    Returns
    -------
    list of numpy.ndarray
        One array per embedded image relationship of the main document part,
        decoded with ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` (OpenCV's
        3-channel BGR layout). Relationships that are not images, that point
        to an external (linked) target, that carry no bytes, or whose bytes
        OpenCV cannot decode are skipped, so the result never contains None.
    """
    doc = Document(cv_doc)
    iml = []
    for rel in doc.part.rels.values():
        if "image" not in rel.reltype:
            continue
        # Linked pictures have no embedded part; reading ``target_part`` on an
        # external relationship raises, so skip them up front.
        if getattr(rel, "is_external", False):
            continue
        image_data = rel.target_part.blob
        if not image_data:
            # imdecode raises on an empty buffer.
            continue
        nparr = np.frombuffer(image_data, np.uint8)
        img_np = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        # imdecode reports an unsupported or corrupt payload by returning
        # None; passing that on would crash the later colour conversion.
        if img_np is None:
            continue
        iml.append(img_np)
    return iml

# Function to extract images if the document is of PDF format


def extract_images_pdf(cv_doc):
    """Render every page of a PDF into an image array.

    Parameters
    ----------
    cv_doc
        Passed to ``fitz.open(..., filetype="pdf")``.

    Returns
    -------
    list of numpy.ndarray
        One ``uint8`` array per page, shaped ``(pix.h, pix.w, pix.n)`` from the
        page's pixmap samples, in page order. The channel order is whatever
        PyMuPDF's default ``get_pixmap()`` produces.
        NOTE(review): that default is expected to be RGB, while ``get_faces``
        converts from BGR -- confirm against the caller.
    """
    iml = []
    # Open the PDF file; the context manager guarantees the document is closed
    # even when rendering a page raises (the original never closed it).
    with fitz.open(cv_doc, filetype="pdf") as pdf:
        for page in pdf:
            pix = page.get_pixmap()
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                pix.h, pix.w, pix.n)
            iml.append(img)
    return iml

def rotate_image(image, angle):
    """Rotate ``image`` about its centre by ``angle`` degrees.

    The rotation uses unit scale and the output canvas keeps the input's
    width and height.

    Parameters
    ----------
    image : numpy.ndarray
        Array whose first two dimensions are (height, width).
    angle : float
        Rotation angle handed to ``cv2.getRotationMatrix2D``.

    Returns
    -------
    numpy.ndarray
        The result of ``cv2.warpAffine`` with size ``(width, height)``.
    """
    rows, cols = image.shape[:2]
    centre = (cols / 2, rows / 2)
    # Affine transform for a pure rotation around the centre (scale 1.0).
    transform = cv2.getRotationMatrix2D(centre, angle, 1.0)
    return cv2.warpAffine(image, transform, (cols, rows))

def enhanceImage(image):
    """Return a denoised, contrast-enhanced and sharpened grayscale version of ``image``.

    Pipeline, in order: BGR-to-gray conversion, non-local-means denoising
    (``h=10``), CLAHE with ``clipLimit=2.0``, then an unsharp mask.

    Parameters
    ----------
    image : numpy.ndarray
        Input converted with ``cv2.COLOR_BGR2GRAY``, i.e. treated as BGR.

    Returns
    -------
    numpy.ndarray
        The single-channel result of the final ``cv2.addWeighted`` call.
    """
    grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    smooth = cv2.fastNlMeansDenoising(grey, h=10)
    equalised = cv2.createCLAHE(clipLimit=2.0).apply(smooth)

    # Unsharp mask: 1.5 * equalised - 0.5 * blurred. A (0, 0) kernel size lets
    # OpenCV derive the kernel from sigma=3.
    blurred = cv2.GaussianBlur(equalised, (0, 0), 3)
    return cv2.addWeighted(equalised, 1.5, blurred, -0.5, 0)


def shwimg(img):
    """Draw ``img`` with matplotlib and show the figure.

    Used by ``get_faces`` to display each cropped face. Returns None.
    """
    plt.imshow(img)  # draw onto the current axes
    plt.show()       # render the figure



# Extra orientations (in degrees) searched when ``rotating`` is requested.
angles = [30, 60, 90, -30, -60, -90]

def get_faces(image,rotating=False):
    """Detect faces in ``image`` and return one encoding per detected face.

    Parameters
    ----------
    image : numpy.ndarray or None
        Picture to search. Multi-channel input is converted with
        ``cv2.COLOR_BGR2RGB`` (so it is treated as BGR); single-channel
        (grayscale) input is expanded with ``cv2.COLOR_GRAY2RGB``. ``None``
        or an empty array yields an empty result instead of an OpenCV error.
    rotating : bool, default False
        When true, the image is additionally searched after rotating it by
        every angle in the module-level ``angles`` list.

    Returns
    -------
    list
        Face encodings from ``face_recognition.face_encodings``, for the
        unrotated image first and then for each rotation, in detection order.
        A face visible in several orientations can appear more than once.

    Notes
    -----
    Every cropped face is displayed through ``shwimg`` as a side effect.
    Detection uses the CNN model with two upsampling passes and encoding uses
    ``num_jitters=100``; per the face_recognition documentation, a higher
    jitter count is more accurate but proportionally slower.
    """
    faces = []
    # Undecodable or empty inputs cannot contain a face; bail out before
    # cvtColor, which would raise on them.
    if image is None or image.size == 0:
        return faces

    # COLOR_BGR2RGB only accepts 3- or 4-channel input, so grayscale pictures
    # need their own conversion.
    if image.ndim == 2 or image.shape[2] == 1:
        rgb_image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    else:
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    rotates=[rgb_image]
    if rotating:
        rotates+=[rotate_image(rgb_image, angle) for angle in angles]
    for rt_img in rotates:
        # Detect faces in the image
        face_locations = face_recognition.face_locations(rt_img, model='cnn',number_of_times_to_upsample=2)
        # Extract the face region
        for face_location in face_locations:
            top, right, bottom, left = face_location
            face_image = rt_img[top:bottom, left:right]
            shwimg(face_image)
            # Exactly one location is supplied, so exactly one encoding
            # comes back and [0] is safe.
            face_encoding = face_recognition.face_encodings(
                rt_img, known_face_locations=[face_location],num_jitters=100, model='large')[0]
            faces.append(face_encoding)
    return faces
# Function to extract human face Images from PDF or word document


def _rgb_to_bgr(img):
    """Return a copy of ``img`` as an array with channels 0 and 2 swapped.

    Arrays without at least three channels (e.g. 2-D grayscale) are copied
    unchanged; a fourth channel, if present, is left in place.
    """
    arr = np.array(img)
    if arr.ndim == 3 and arr.shape[2] >= 3:
        arr[..., [0, 2]] = arr[..., [2, 0]]
    return arr


def extract_human_faces(inp_doc, scannedpdfImages, rotating=False):
    """Collect face encodings from a .docx, a .pdf or a plain image file.

    Parameters
    ----------
    inp_doc : str
        Path of the document. The extension (matched case-insensitively)
        selects the extractor: ``.docx`` -> ``extract_images_docx``,
        ``.pdf`` -> ``extract_images_pdf``, anything else -> ``imread``.
    scannedpdfImages : iterable or None
        Pre-rendered page images, used only as a fallback when ``inp_doc`` is
        a PDF and direct extraction fails or yields nothing. Each item is
        converted with ``numpy.array``. ``None`` is treated as "no pages".
    rotating : bool, default False
        Forwarded to ``get_faces``.

    Returns
    -------
    list
        Concatenation of ``get_faces`` results for every extracted image; an
        empty list when a non-PDF input cannot be read or holds no images.

    Notes
    -----
    ``get_faces`` converts its input from BGR. Images decoded by OpenCV
    (the .docx path) already are BGR; images from PyMuPDF, ``skimage`` and the
    pre-rendered pages are RGB, so they are channel-swapped here first.
    Otherwise red and blue would be exchanged before encoding.
    """
    log = logging.getLogger(__name__)
    lowered = inp_doc.lower()
    is_pdf = lowered.endswith('.pdf')

    try:
        if lowered.endswith('.docx'):
            image_list = extract_images_docx(inp_doc)
        elif is_pdf:
            image_list = [_rgb_to_bgr(page) for page in extract_images_pdf(inp_doc)]
        else:
            image_list = [_rgb_to_bgr(imread(inp_doc))]
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if len(image_list) == 0:
            raise ValueError("no images found in document")
    except Exception as exc:
        # ``Exception`` (not a bare except) so Ctrl-C / SystemExit propagate.
        if not is_pdf:
            log.warning("Could not extract images from %r: %s", inp_doc, exc)
            return []
        log.warning(
            "Direct extraction from %r failed (%s); using pre-rendered pages",
            inp_doc, exc)
        image_list = [_rgb_to_bgr(x) for x in (scannedpdfImages or [])]

    humanImages = []
    for image in image_list:
        humanImages += get_faces(image,rotating)
    return humanImages

def is_fake(cv_path, id_path, cvimages, idimages):
    """Check whether the face(s) in a CV match a face in an ID document.

    Parameters
    ----------
    cv_path, id_path : str
        Paths forwarded to ``extract_human_faces``.
    cvimages, idimages
        Pre-rendered page images forwarded alongside the matching path.

    Returns
    -------
    dict
        Keys ``'CV_MATCH'``, ``'ID-Authentication'`` (always None),
        ``'Id-Summary'`` and ``'Over-All-Status'``. The status is ``"PASS"``
        only when every face found in the CV matches at least one face found
        in the ID; in all other cases it is ``'FAIL'``.

    Notes
    -----
    Faces in the ID are searched with ``rotating=True``; faces in the CV are
    searched unrotated.
    """
    def report(cv_match, id_summary=None, status='FAIL'):
        # Single place that fixes the key order of the result.
        return {
            'CV_MATCH': cv_match,
            'ID-Authentication': None,
            'Id-Summary': id_summary,
            'Over-All-Status': status}

    # Both extractions run up front, CV first, before any verdict is formed.
    face_from_cv = extract_human_faces(cv_path, cvimages)
    faces_from_id = extract_human_faces(id_path, idimages, rotating=True)

    if len(faces_from_id) == 0:
        return report('NO-Match', id_summary="PROVIDED WRONG ID DOCUMENT")

    if len(face_from_cv) == 0:
        # ``.capitalize()`` binds to the second literal only; because that
        # literal starts with a space, its letters all end up lower-case.
        return report('NO-Match' + " PROVIDED CV without a picture ".capitalize())

    for cv_face in face_from_cv:
        hits = face_recognition.compare_faces(faces_from_id, cv_face)
        if not any(hits):
            # The first CV face with no counterpart in the ID ends the check.
            return report("No-Match" + " - Faces Don't match ")

    return report("Full Match", status="PASS")

## Notebook Method to ingest docs
def imageIngestion(s1:str,s2:str):
    """Run ``is_fake`` on two documents and print the resulting report.

    Parameters
    ----------
    s1, s2 : str
        Document paths, forwarded to ``is_fake`` as its first and second
        argument respectively.

    Any path ending in ``".pdf"`` is first rendered to page images with
    pdf2image; for every other path ``None`` is passed instead. Returns None.
    """
    def rasterise(path):
        # Only PDFs are pre-rendered; other inputs get no page images.
        if path.endswith(".pdf"):
            return pdf2image.pdf2image.convert_from_path(path)
        return None

    img_array1 = rasterise(s1)
    img_array2 = rasterise(s2)
    print(is_fake(s1, s2, img_array1, img_array2))

# Demo run: imageIngestion forwards the first path to is_fake as the CV and
# the second as the ID document.
s1, s2 = 'PAN.jpeg', 'Passport.pdf'
imageIngestion(s1, s2)