In [3]:
# Install required packages
!pip install tensorflow_text
!pip install tensorflow # to check version: 
!pip install gpl
!pip install sentence-transformers==2.6.1
!pip install nltk


Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached click-8.1.7-py3-none-any.whl (97 kB)
Installing collected packages: click, nltk
Successfully installed click-8.1.7 nltk-3.8.1


In [4]:
# Import required libraries
import pandas as pd 
import re 
import json
from tqdm.autonotebook import tqdm
import gpl
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm.auto import tqdm

2024-04-10 14:20:17 - PyTorch version 2.2.2 available.
2024-04-10 14:20:17 - TensorFlow version 2.16.1 available.
2024-04-10 14:20:18 - Loading faiss with AVX2 support.
2024-04-10 14:20:18 - Successfully loaded faiss with AVX2 support.


In [5]:
# functions to perform pre-processing
def preprocess(text):
    text = re.sub(r"[^a-zA-Z0-9,.'?]+", ' ', str(text)) # remove special characters
    text = [text[:len(text)//2],text[len(text)//2:]] # split the text into two parts: first half and second half
    return text

def split_first(text):
    return len(text.split()) # return the length of the first half of the text

def convert_str(text):
    return str(text)+"_" # add underscore at the end of the text

In [6]:
# load the dataset
df = pd.read_csv('UpdatedResumeDataSet.csv') # read the data
df

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [9]:
'''
Preprocess the data
'''

# lower case all the text in the dataset
df = df.apply(lambda x:x.str.lower())
# Drop any null values
df = df.dropna()

# Preprocess the Text in 'Resume' and add the preprocessed text as new column 'new_text'
df['new_text'] = df.Resume.apply(preprocess)

# Explode and reset the index: Convert each item in the list under the new_text column into a separate row, effectively doubling the number of rows in the DataFrame.
df = df.explode("new_text")
df = df.reset_index(drop=True)

# Add Identifiers and Filtering
df["_id"] = df.index # add an identifier column
df['num'] = df['new_text'].apply(split_first) # add a column to store the length of the first half of the text
df = df[df['num'] < 400] # filter out the rows where the length of the first half of the text is greater than 400
df["_id"] = df["_id"].apply(convert_str) # add underscore at the end of the identifier

# adjust the column names
df['title'] = ""
df['metadata'] = ""
df['title'] = df['title'].astype(str)
df['text'] = df['new_text'].astype(str)
df['_id'] = df['_id'].astype(str)
df['concat'] = "qgen" + df["title"] + " " + df["text"] # intended for query generation, prefixed with "qgen".

# export to JSON and JSON Lines Format: useful for NLP tasks as it allows for efficient loading of large datasets line by line.
df[['_id', 'title', 'text', 'metadata']].to_json('corpus.json',orient='records')
df[['_id', 'title', 'text', 'metadata']].to_json('corpus.jsonl',orient='records',lines=True)

In [10]:
# load the data from JSON
f = open('corpus.json')
data = json.load(f)

# Write Data to JSONL: 
with open('corpus.jsonl', 'w') as outfile:
    for entry in data:
        json.dump(entry, outfile)
        outfile.write('\n')

# dump the correct format: This converts the data into JSON Lines format, where each line is a valid JSON string, useful for stream processing or line-by-line reading.
filepath = 'corpus.jsonl'

# Aggregate JSONL Back into JSON:
with open(filepath, 'r') as infile, open('output.json', 'w') as outfile:
    data = [json.loads(line) for line in infile]
    json.dump(data, outfile)


# Convert JSON to CSV
df = pd.read_json('output.json')
df['num']=df['text'].apply(split_first)
df = df[df['num']<400]
df.to_csv('final_.csv')

In [11]:
'''
- Generate queries (Resume) from a passage of text using a T5 model: The Query highlights key aspects of the resume's content, not questions but rather capture essential elements, skills, experiences, or qualifications presented in the resume.
- Transforming Resume content into queries that might be used to retrieve similar documents or information.
- 
'''
# Loading the model:
model_name = 'doc2query/msmarco-t5-base-v1' # Model specific to the task of generating queries from documents.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Prepare Text for Query Generation:
passage = df['text'].iloc[5] # select a passage from the dataset

# Tokenize the passage
inputs = tokenizer(passage, return_tensors='pt') # return the tokenized passage as PyTorch tensors

# Generate Queries: 3 queries are generated for the given passage
outputs = model.generate(
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_length=64,
    do_sample=True,
    top_p=0.95,
    num_return_sequences=3
)

# Display the Original Passage and Generated Queries
print("Paragraph:")
print(passage)

print("\nGenerated Queries:")
for i in range(len(outputs)):
    query = tokenizer.decode(outputs[i], skip_special_tokens=True)
    print(f'{i + 1}: {query}')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Paragraph:
d electrical enthusiast skill details data analysis exprience less than 1 year months excel exprience less than 1 year months machine learning exprience less than 1 year months mathematics exprience less than 1 year months python exprience less than 1 year months matlab exprience less than 1 year months electrical engineering exprience less than 1 year months sql exprience less than 1 year monthscompany details company themathcompany description i am currently working with a casino based operator name not to be disclosed in macau.i need to segment the customers who visit their property based on the value the patrons bring into the company.basically prove that the segmentation can be done in much better way than the current system which they have with proper numbers to back it up.henceforth they can implement target marketing strategy to attract their customers who add value to the business.

Generated Queries:
1: d electrical enthusiast skills
2: what are the skills required

### Purpose of Queries in Matching Resumes to Job Descriptions

1. **Keyword Extraction and Emphasis:** Each query acts as a distilled representation of parts of the resume, emphasizing skills, experiences, or qualifications that might be relevant to potential employers or match specific job descriptions.

2. **Enhanced Searchability:** By converting sections of a resume into queries, the system can more effectively use these queries to search through job descriptions or a database of job requirements. This reverses the typical job application process, making the resumes actively "search" for matching job opportunities.

3. **Semantic Matching:** These queries help in moving beyond simple keyword matching by leveraging the T5 model's understanding of language to generate search terms that capture the meaning and context of the resume's content. This leads to more nuanced and semantically relevant matches between job descriptions and candidate profiles.

4. **Highlighting Candidate's Fit:** The generated queries can serve to pinpoint why a candidate might be a good fit for a role, highlighting specific skills or experiences in the form of searchable and matchable text snippets.

### Example Interpretation
Given the output from the T5 model in your example:
- **"hp experience required"** might highlight a specific skill or qualification mentioned in the resume, albeit in a somewhat abstract way.
- **"what is experience"** seems like a less directly applicable query but might relate to the model trying to abstract the concept of experience from the resume.
- **"what is an electrical enthusiast"** directly references a specific interest or skill area mentioned in the resume, making it a potentially useful query for matching with job descriptions looking for candidates passionate about electrical engineering.

In essence, the "query" in this matching system is a tool generated by processing the resume's text to create a bridge between the candidate's profile and potential job opportunities, enhancing the ability to match resumes with job descriptions based on deeper textual understanding.

In [21]:
from tqdm.auto import tqdm
import torch

# Assuming df is your DataFrame and contains the 'text' column with passages
passages = df['text'].to_numpy()
num_queries = 3
# Determine how many passages have been processed already
with open('pairs.tsv', 'r', encoding='utf-8') as file:
    processed_lines = sum(1 for line in file)
processed_passages = processed_lines // num_queries  # Assuming 3 queries per passage

# Adjusted target calculation based on remaining passages
target = (len(passages) - processed_passages) * num_queries

model = T5ForConditionalGeneration.from_pretrained(model_name).cuda()
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = torch.nn.DataParallel(model)

batch_size = 128
count = 0  # Reset count based on the number of already processed queries
passage_batch = []

# Open the file in append mode to add missing pairs
with open('pairs.tsv', 'a', encoding='utf-8') as fp, tqdm(total=target) as progress:
    for index, passage in enumerate(passages[processed_passages:], start=processed_passages):
        if count >= target: break
        passage = passage.replace('\t', ' ').replace('\n', ' ')
        passage_batch.append(passage)
        
        if len(passage_batch) == batch_size or index == len(passages) - 1:  # Check if it's also the last batch
            inputs = tokenizer(passage_batch, truncation=True, padding=True, max_length=256, return_tensors='pt').to('cuda')

            outputs = model.module.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'],
                                            max_length=64, do_sample=True, top_p=0.95, num_return_sequences=num_queries)

            decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            for i, query in enumerate(decoded_output):
                query = query.replace('\t', ' ').replace('\n', ' ')
                passage_idx = int(i / num_queries)  # Get the index of the passage to match query
                fp.write(query + '\t' + passage_batch[passage_idx] + '\n')
                count += 1

            passage_batch = []  # Clear the batch to free memory
            torch.cuda.empty_cache()  # Free GPU memory
            progress.update(len(decoded_output))


Using device: cpu


  0%|          | 148/200000 [01:39<37:10:26,  1.49it/s]


KeyboardInterrupt: 

In [32]:
# Initalize the Sentence Transformer model
model = SentenceTransformer('msmarco-distilbert-base-tas-b')
model.max_seq_length = 256

# Define Generator to Read Query-Passage Pairs
def get_text():
    with open('pairs.tsv', 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    for line in tqdm(lines):
        try:
            query, passage = line.split('\t')
            yield query, passage
        except ValueError:
            pass

# use the generator to get the query and passage
pair_gen = get_text()
for i, (query, passage) in enumerate(pair_gen):
    print(query)
    print()
    print(passage)
    break


[2024-04-09 23:08:25] INFO [sentence_transformers.SentenceTransformer.__init__:66] Load pretrained SentenceTransformer: msmarco-distilbert-base-tas-b
[2024-04-09 23:09:19] INFO [sentence_transformers.SentenceTransformer.__init__:105] Use pytorch device: cpu
  0%|          | 0/5068 [00:00<?, ?it/s]

what languages do i use to do deep learning

skills programming languages python pandas, numpy, scipy, scikit learn, matplotlib , sql, java, javascript jquery. machine learning regression, svm, na ve bayes, knn, random forest, decision trees, boosting techniques, cluster analysis, word embedding, sentiment analysis, natural language processing, dimensionality reduction, topic modelling lda, nmf , pca neural nets. database visualizations mysql, sqlserver, cassandra, hbase, elasticsearch d3.js, dc.js, plotly, kibana, matplotlib, ggplot, tableau. others regular expression, html, css, angular 6, logstash, kafka, python flask, git, docker, computer vision open cv and understanding of deep learning.education details data science assurance associate data science assurance associate ernst young llp skill details javascript exprience 24 months jquery exprience 24 months python exprience 24 monthscompany details company ernst young llp description fraud investigations and dispute services assura

In [35]:
from tqdm.auto import tqdm
import pickle

# Define get_text() to read query-passage pairs from 'pairs.tsv'
def get_text():
    with open('pairs.tsv', 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    for line in tqdm(lines):
        try:
            query, passage = line.split('\t')
            yield query, passage
        except ValueError:  # Handles lines that don't have a tab character
            pass

# Re-populate the pairs list using the get_text generator
pairs = [pair for pair in get_text()]

# Initialize variables for embedding storage
passage_batch = []
id_batch = []
embeddings_store = []  # Store embeddings here
batch_size = 64

# Load the SentenceTransformer model as before
# Assuming model has been loaded here as shown in previous examples

# Process passages to avoid duplication and batch for embedding
for i, (query, passage) in enumerate(pairs):  # Now using the populated pairs list
    if passage not in passage_batch: 
        passage_batch.append(passage)
        id_batch.append(str(i))

    if len(passage_batch) == batch_size:
        # Encode passages to embeddings
        embeds = model.encode(passage_batch).tolist()
        for idx, emb in zip(id_batch, embeds):
            embeddings_store.append((idx, emb))
        passage_batch = []
        id_batch = []

# Ensure any remaining passages are processed
if passage_batch:
    embeds = model.encode(passage_batch).tolist()
    for idx, emb in zip(id_batch, embeds):
        embeddings_store.append((idx, emb))

# Save embeddings to a local file for later retrieval
with open('embeddings_store.pkl', 'wb') as f:
    pickle.dump(embeddings_store, f)

print(f"Total embeddings stored: {len(embeddings_store)}")


100%|██████████| 5068/5068 [00:00<00:00, 306947.56it/s]


Batches: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]
Batches: 100%|██████████| 2/2 [00:03<00:00,  1.59s/it]
Batches: 100%|██████████| 2/2 [00:03<00:00,  1.77s/it]
Batches: 100%|██████████| 2/2 [00:04<00:00,  2.08s/it]
Batches: 100%|██████████| 2/2 [00:04<00:00,  2.25s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.66s/it]

Total embeddings stored: 344





In [36]:
import pickle
import random
from sentence_transformers import util
from tqdm.auto import tqdm

# Loading embeddings from file
with open('embeddings_store.pkl', 'rb') as f:
    embeddings_store = pickle.load(f)

# Assuming `pairs` is already defined in your context as query-passage pairs
batch_size = 100
triplets = []

for i in tqdm(range(0, len(pairs), batch_size)):
    i_end = min(i+batch_size, len(pairs))
    queries = [pair[0] for pair in pairs[i:i_end]]
    pos_passages = [pair[1] for pair in pairs[i:i_end]]

    # Create query embeddings
    query_embs = model.encode(queries, convert_to_tensor=True, show_progress_bar=False)

    for query_idx, (query, pos_passage) in enumerate(zip(queries, pos_passages)):
        # This will store cosine similarities between the current query embedding and all passage embeddings
        cosine_similarities = []
        for _, emb in embeddings_store:
            sim = util.pytorch_cos_sim(query_embs[query_idx], emb).item()
            cosine_similarities.append(sim)
        
        # Sort passages by similarity to the query and select a negative sample
        sorted_passage_idxs = sorted(range(len(cosine_similarities)), key=lambda k: cosine_similarities[k], reverse=True)
        for idx in sorted_passage_idxs:
            # Assuming the first passage is the most similar one, we skip it to find a negative sample
            neg_passage = pairs[idx][1]
            if neg_passage != pos_passage:
                triplets.append(f"{query}\t{pos_passage}\t{neg_passage}")
                break  # Break after finding the first suitable negative

# Save the triplets to a file
with open('triplets.tsv', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(triplets))


  0%|          | 0/51 [00:00<?, ?it/s]

100%|██████████| 51/51 [04:27<00:00,  5.24s/it]


In [None]:
# Dont run this, It will take a lot of time
'''
takes triplets of query, positive passage, and negative passage from a TSV file,
uses a CrossEncoder model to score the relevance of the positive and negative passages to the query,
calculates the margin between these scores, and saves the results to a new TSV file
'''

from sentence_transformers import CrossEncoder

model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Define a Generator to Read Triplets from 'triplets.tsv'
def get_lines():
    with open('triplets.tsv', 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    for line in tqdm(lines):
        q, p, n = line.split('\t')
        yield q, p, n

# Scoring Triplets and Calculating Margins
lines = get_lines()
label_lines = []

for line in lines:
    q, p, n = line
    p_score = model.predict((q, p))
    n_score = model.predict((q, n))
    margin = p_score - n_score
    label_lines.append(
        q + '\t' + p + '\t' + n + '\t' + str(margin)
    )

# Save the Results
with open("triplets_margin.tsv", 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(label_lines))



[2024-04-09 23:33:34] INFO [sentence_transformers.cross_encoder.CrossEncoder.__init__:56] Use pytorch device: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 26.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 11.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 46.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 11.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 63.73it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.93it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.28it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.49it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.34it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.41it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 11.36it/s]
Batche

In [41]:
from tqdm.auto import tqdm
from sentence_transformers import InputExample

# Prepare Training Data
training_data = []

with open('triplets_margin.tsv', 'r', encoding='utf-8') as fp:
    lines = fp.read().split('\n')

for line in tqdm(lines):
    q, p, n, margin = line.split('\t')
    training_data.append(InputExample(
        texts=[q, p, n],
        label=float(margin)
    ))

# Intialize the Data Loader
batch_size = 32

loader = torch.utils.data.DataLoader(
    training_data, batch_size=batch_size, shuffle=True
)

# Set up the Sentence Transformer model
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('msmarco-distilbert-base-tas-b')
model.max_seq_length = 256



100%|██████████| 5067/5067 [00:00<00:00, 244997.33it/s]
[2024-04-09 23:56:35] INFO [sentence_transformers.SentenceTransformer.__init__:66] Load pretrained SentenceTransformer: msmarco-distilbert-base-tas-b
[2024-04-09 23:56:35] INFO [sentence_transformers.SentenceTransformer.__init__:105] Use pytorch device: cpu


In [43]:
# Import loss function
from sentence_transformers import losses

loss = losses.MarginMSELoss(model)

# Setting Training Parameters
epochs = 10
warmup_steps = int(len(loader) * epochs * 0.1)

# Training the Model
model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='msmarco-distilbert-base-tas-b-final',
    show_progress_bar=True
)

# Save the Model
model.save('msmarco-distilbert-base-tas-b-final') 

# Load the Model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('msmarco-distilbert-base-tas-b-final')



Iteration: 100%|██████████| 159/159 [44:32<00:00, 16.81s/it]
Iteration:  84%|████████▍ | 134/159 [40:23<07:32, 18.09s/it]
Epoch:  10%|█         | 1/10 [1:24:56<12:44:25, 5096.12s/it]


KeyboardInterrupt: 

In [75]:
!pip install --upgrade sentence-transformers==2.6.1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
import sys
print(sys.executable)


/Users/gabrieldeolaguibel/IE/DevOps_Assignement1/NLP_Resume_Ranker/.venv/bin/python


In [8]:
import pkg_resources

# Get the version of sentence-transformers
sentence_transformers_version = pkg_resources.get_distribution('sentence-transformers').version
print(sentence_transformers_version)


2.6.1


In [9]:
# Load the Model
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('msmarco-distilbert-base-tas-b-final')

[2024-04-10 14:20:40] INFO [sentence_transformers.SentenceTransformer.__init__:107] Load pretrained SentenceTransformer: msmarco-distilbert-base-tas-b-final
[2024-04-10 14:20:41] INFO [sentence_transformers.SentenceTransformer.__init__:213] Use pytorch device_name: mps


In [10]:
queries = ["""I have 3.5+ years of work experience and was working as a data scientist with 3 different organizations. I was responsible for using predictive modelling, data processing, and data mining algorithms to solve challenging business problems.
My technology stack includes but not limited to, are python, machine learning, deep learning, time-series, web scraping, flask, FastAPI, snowflake SQL servers, deploying production based servers, keras, TensorFlow, hugging face, Big Data and Data Warehouses. In my career, my growth has been exponential, and I developed interpersonal skills, now I know how to handle a project end to end.
My area of interests are applied machine learning, deep neural network, time series and everything around NLP in the field of ecommerce and consumer internet. My research focus is on information retrieval involving neuroscience and deep reinforcement learning.
I like to listen to a lot of learning courses and read research papers involving deep learning. In my spare time I like to keep up with the news, read blogs on medium and watch a few sci-fi films.""",

           "Snehil started his entrepreneurial journey 14 years ago with the launch of a social networking site along with music and video streaming portals back in 2006, while he was still in school. In 2011 while pursuing engineering in Computer Science, he joined Letsbuy, an e-commerce startup, where he developed and launched their mobile app and site while mobile-commerce was still in its nascent stage in India. Letsbuy was later acquired by Flipkart in 2012.Snehil also co-founded Findyahan, a services marketplace, which was eventually acquired in 2016 by Zimmber. Snehil joined Zimmber as Vice President of Product & Marketing. Zimmber was later acquired by Quikr.",

           """I have over 7 years of combined experience in the fields of data science and machine learning. I've led many data science projects in a wide array of industries. I mainly program in Python using its popular data science libraries.For deep learning, my go to framework is PyTorch. I’ve also worked a significant amount with relational databases and cloud environments.
Worked on diverse array of projects where I used my machine learning expertise to build and advise external clients on how to move forward with machine learning projects. I also advised on how to best collect and structure data.
Other than work, I write a significant amount with regards to AI. I’ve published several deep learning tutorials,focusing on the PyTorch framework. My articles are published on Medium under the publication A Coder’s Guide to AI."""]
           

In [11]:
document = """B.Tech / M.Tech degree in Computer Science from a premiere institute.
Should have 1 - 5 years of experience in designing, developing and deploying software, preferably Statistical and Machine Learning models.
Ability to work independently with strong problem solving skills.
Should have excellent knowledge in fundamentals of Machine Learning and Artificial Intelligence, especially in Regression, Forecasting and Optimization.
Should have excellent foundational knowledge in Probability, Statistics and Operations Research/Optimization techniques.
Should have hands on experience thorugh ML Lifecycle from EDA to model deployment.
Should have hands on experience data analysis tools like Jupyter, and packages like Numpy, Pandas, Matplotlib.
Should be hands-on in writing code that is reliable, maintainable, secure, performance optimized.
Should have good knowledge in Cloud Platforms and Service oriented architecture and design"""

In [12]:
from sentence_transformers import util
def score_cos_sim(art1,art2):
    scores = util.cos_sim(art1, art2)[0]
    return scores

In [13]:
def score_inference(queries,document,model):
    
    score = dict()
    
    queries_encode = [model.encode(text) for text in queries]
    document_encode = model.encode(document)
    
    for i,query in enumerate(queries_encode):
        score["document_"+str(i)] = score_cos_sim(query,document_encode)
    return score

In [14]:
score_inference(queries,document,model)

Batches: 100%|██████████| 1/1 [00:04<00:00,  4.49s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.57it/s]


{'document_0': tensor([0.8918]),
 'document_1': tensor([0.8050]),
 'document_2': tensor([0.8749])}