## 1. Imports

### 2. Embedding the Data

In [1]:
# ALL
import torch
import re

# FROM
from sklearn.preprocessing import LabelEncoder
from transformers import BertForSequenceClassification, BertTokenizer
from torch.optim import Adam

# AS
import pandas as pd
import numpy as np

### 3. Building a Text Classification System

In [2]:
# SKLEARN LIBRARY IMPORT
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

### 4. Building a Text Recommendation System

In [3]:
# SKLEARN LIBRARY IMPORT
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 2. Embedding the Data
Route 2: Embeddings using Python Libraries

### 2.4. Fine Tuning BERT for Classification

In [4]:
# Read the CSV file into a DataFrame

allJobsDataset = pd.read_csv("/Users/izzymohamed/Documents/WORK/STEMAway/Evalustion_Building an NLP Pipeline - Text Classification and Beyond/STEMAway StartUp Jobs Datasets/FullStack_Jobs.csv")

#pd.read_csv("/Users/izzymohamed/Documents/WORK/STEMAway/Evalustion_Building an NLP Pipeline - Text Classification and Beyond/STEMAway StartUp Jobs Datasets/all_jobs.csv")

#### 2.4.1. Cleaning the data

In [5]:
print(allJobsDataset.head)

<bound method NDFrame.head of                                                   Link   
0    https://startup.jobs/full-stack-developer-bunk...  \
1    https://startup.jobs/founding-full-stack-softw...   
2    https://startup.jobs/senior-full-stack-enginee...   
3    https://startup.jobs/full-stack-engineer-conte...   
4    https://startup.jobs/full-stack-engineer-nahc-...   
..                                                 ...   
350  https://startup.jobs/full-stack-web-developmen...   
351  https://startup.jobs/full-stack-web-developmen...   
352  https://startup.jobs/full-stack-web-developmen...   
353  https://startup.jobs/full-stack-web-developmen...   
354  https://startup.jobs/full-stack-web-developmen...   

                              Tags   
0             Developer,Full Stack  \
1    Engineer,Full Stack,Developer   
2                Full Stack,Senior   
3                         Engineer   
4              Engineer,Full Stack   
..                             ...   
350    

In [6]:
print(allJobsDataset.shape)

(355, 7)


In [7]:
# Drop rows with blank cells in 'Tags' column
df = allJobsDataset.dropna(subset=['Tags'])

# Print the updated DataFrame
print(df.shape)
# print(df)

(355, 7)


In [8]:
# Drop rows with blank cells in 'Job Title' column
df = allJobsDataset.dropna(subset=['Job Description'])

# Print the updated DataFrame
print(df.shape)
# print(df)

(355, 7)


In [9]:
# Drop rows with blank cells in 'Job Description' column
df = allJobsDataset.dropna(subset=['Job Description'])

# Print the updated DataFrame
print(df.shape)
# print(df)

(355, 7)


In [10]:
# Extract the column containing the Job Description
job_description_column = df['Job Description']
job_description_list = job_description_column.tolist()

# Function to preprocess job descriptions
def preprocess_job_description(job_description):
    return " ".join(job_description.lower().split())

# Remove duplicates and convert to a list
job_description_list = list(set([preprocess_job_description(desc) for desc in job_description_column]))

# Print the final label list
print(len(job_description_list))
print(job_description_list)

335
['front end software engineer - communications team with millions of diners, tens of thousands of restaurants, and 23 years of experience, opentable, part of booking holdings, inc. nasdaq bkng, is an industry leader with a unique insight into the world of hospitality. we champion restaurants, bars, wineries, and other venues around the world, helping them attract guests, manage capacity, improve operations and maximise revenue. every employee at opentable has a tangible impact on what we do and how we do it. youll also be part of a global network that includes opentable and kayaks portfolio of travel brands including swoodoo, checkfelix, momondo, cheapflights, mundi and hotelscombined. hospitality is all about taking care of others, and it defines our culture. youll work in a welcoming and inclusive environment, and get the benefits, flexibility, and support you need to succeed. the communications team brings restaurant hospitality directly to diners by facilitating multi-channel c

In [65]:
# Extract the column containing the Tags
tags_column = df['Tags']
tags_list = tags_column.tolist()

# Split the cell values where there is a comma, handling missing values
tags_labels = []
for value in tags_column:
    if isinstance(value, str):  # Check if the value is a non-null string
        tags_labels.extend(value.split(','))

# Remove duplicates and convert to a list
tags_list2 = list(set(tags_labels))

# Print the final label list
print(len(tags_list))
print(tags_list)


355
['Developer,Full Stack', 'Engineer,Full Stack,Developer', 'Full Stack,Senior', 'Engineer', 'Engineer,Full Stack', 'Developer,Full Stack,React', 'Engineer,Full Stack', 'Developer,Full Stack', 'Developer,Full Stack', 'Developer', 'Developer,Full Stack', 'Engineer,Full Stack,Developer', 'Developer,Full Stack', 'Developer', 'Engineer,Full Stack', 'Engineer,Full Stack', 'Developer,Full Stack', 'Engineer,Developer', 'Developer,Full Stack', 'Developer,Full Stack', 'Engineer', 'Developer,Full Stack,Java', 'Engineer,Full Stack', 'Developer,Full Stack', 'Developer,Full Stack,Laravel', 'Engineer,Full Stack', 'Engineer,Full Stack,Developer', 'Engineer,Full Stack', 'Engineer,Full Stack', 'Developer,Full Stack', 'Developer,Full Stack', 'Engineer,Full Stack,Developer', 'Engineer,Full Stack,Java,Developer', 'Engineer,Developer', 'Engineer,Full Stack,Developer', 'Javascript,CSS,Developer,Full Stack,Node JS', 'Engineer,Developer', 'Android,Developer', 'Developer,Full Stack', 'Engineer,Full Stack', '

In [74]:
# Extract unique tags
unique_tags = list(set(tag for tag in tags_list))

In [75]:
# Create the tag-label mapping dictionary
tag_labels_dict = {tag: i for i, tag in enumerate(unique_tags)}

In [77]:
# Convert tag names to numerical labels
tag_labels = [tag_labels_dict[tag] for tag in tags_list]

In [12]:
# Extract the column containing the Job Titles
job_titles_column = df['Job Title']

# Convert labels to lowercase and remove duplicates
job_titles_list = list(set(job_titles_column.str.lower()))

# Print the final label list
print(len(job_titles_list))
print(job_titles_list)

217
['full-stack wordpress developer', 'full stack developer (integration team) (remote)', 'full stack web engineer - tooling, integrations, & labs', 'full-stack software engineer, wayfinder', 'full stack engineer - api team', 'full stack typescript developer - apac region', 'full stack developer ii', 'full stack software engineer iii', 'full stack engineer, merchant success', 'full stack engineer (remote in portugal possible)', 'full stack engineer (launch)', 'full stack developer (.net, reactjs) - ac107', 'full stack software engineer, servicing engineering', 'full-stack developer (react/spring boot microservices)', 'full stack developer java/angular', 'full stack vue js developer', 'full stack engineer tel aviv, israel', 'full stack web development instructor - university of sydney (remote)', 'full stack web developer', 'founding full stack software engineer', 'full stack angular lead - a4965', 'full stack php developer', 'full stack developer (python, js)', 'full stack engineer (ic

#### 2.4.2. BERT tokenizer

In [13]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
    )

In [34]:
# Print the original sentence.
print(' Original: ', job_description_column[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(job_description_column[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(job_description_column[0])))

 Original:  

We are a Bristol-based product company developing a SaaS property managing platform. We are looking for a full-stack developer to join our team full-time.

Technical stack
 Typescript
 Angular 8-14
 NodeJS
 Firebase
 Firestore
 ExpressJS
 Jest

Must also have strong experience of working with Git in a team environment PR, code reviews, Branching etc and working with external libraries.

Should enjoy writing new and refactoring old code to a high standard, with clear separation of concerns and understandable naming conventions.

Most importantly, were looking for someone who really enjoys coding and building things.
Basic requirements for the candidate

 Candidates dont need a computer science degree, but must have at least 3 years of experience coding professionally.
 In-time work delivery.

The candidates journey contains three steps
1. initial cv screen
2. 20-30 min entry call with PM
3. 30-40 min tech interview with CTO4. offer

Please fill free to send your CV to the 

In [35]:
max_len = 0

# For every sentence...
for sent in job_description_column:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (669 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  2174


In [56]:
# Tokenize the job descriptions and tags

job_desc_batch_tokens = tokenizer.encode_plus(
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    job_description_list, # Sentence to encode.
    add_special_tokens = True, # Add '[CLS]' and '[SEP]'     
    pad_to_max_length = True,
    # padding=True,
    truncation=True,
    max_length=max_len,  # Pad & truncate all sentences.
    return_attention_mask = True,   # Construct attn. masks.
    return_tensors='pt',     # Return pytorch tensors.
)




In [57]:
# Get the tokenized input IDs and attention masks
input_ids = job_desc_batch_tokens['input_ids']
attention_masks = job_desc_batch_tokens['attention_mask']

In [58]:
input_ids.type

<function Tensor.type>

In [61]:
# Get the input tensors

# Add the encoded sentence to the list.    
# job_desc_inputs = torch.cat(input_ids, dim=0)

# And its attention mask (simply differentiates padding from non-padding).
# job_desc_attention_mask = torch.cat(attention_masks, dim=0)

# Print sentence 0, now as a list of IDs.
print('Original: ', job_description_column[0])
print('Token IDs:', job_desc_inputs[0])

Original:  

We are a Bristol-based product company developing a SaaS property managing platform. We are looking for a full-stack developer to join our team full-time.

Technical stack
 Typescript
 Angular 8-14
 NodeJS
 Firebase
 Firestore
 ExpressJS
 Jest

Must also have strong experience of working with Git in a team environment PR, code reviews, Branching etc and working with external libraries.

Should enjoy writing new and refactoring old code to a high standard, with clear separation of concerns and understandable naming conventions.

Most importantly, were looking for someone who really enjoys coding and building things.
Basic requirements for the candidate

 Candidates dont need a computer science degree, but must have at least 3 years of experience coding professionally.
 In-time work delivery.

The candidates journey contains three steps
1. initial cv screen
2. 20-30 min entry call with PM
3. 30-40 min tech interview with CTO4. offer

Please fill free to send your CV to the e

In [79]:
tag_inputs = torch.tensor(tag_labels)

#### 2.4.3. BERT model

In [80]:
# Load the pre-trained BERT model for sequence classification
bert_data_embedding_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [81]:
# Set the model to evaluation mode
bert_data_embedding_model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [82]:
'''# Set the model to training mode
model.train()'''

'# Set the model to training mode\nmodel.train()'

#### 2.4.4. Embedding

In [83]:
# Forward pass through the BERT model to get the embeddings
def get_bert_embeddings(inputs,attention_mask):
    with torch.no_grad():
        outputs = bert_data_embedding_model(input_ids=inputs, attention_mask=attention_mask)
    return outputs[0]#.last_hidden_state[:, 0, :]

In [84]:
job_desc_embeddings = get_bert_embeddings(job_desc_inputs,job_desc_attention_mask)

In [85]:
print(job_desc_embeddings.shape)

torch.Size([355, 2])


In [86]:
print(job_desc_embeddings[0].shape)

torch.Size([2])


#### 2.4.5. Optimizer

In [87]:
# Define the optimizer
optimizer = Adam(bert_data_embedding_model.parameters(), lr=1e-5)

In [88]:
# Update the model's parameters
optimizer.step()

## 3. Building a Text Classification System

In [89]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    job_desc_embeddings, tag_labels, test_size=0.2, random_state=0
)

In [92]:
# Training the SVM model
svm_text_calssification_model = SVC()
svm_text_calssification_model.fit(X_train,y_train)

In [93]:
# Training the Logistic Regression model
lr_text_calssification_model = LogisticRegression()
lr_text_calssification_model.fit(X_train, y_train)

In [96]:
# Evaluating the models
svm_predictions = svm_text_calssification_model.predict(X_test)
lr_predictions = lr_text_calssification_model.predict(X_test)

svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions, average='macro')
svm_recall = recall_score(y_test, svm_predictions, average='macro')
svm_f1 = f1_score(y_test, svm_predictions, average='macro')

lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions, average='macro')
lr_recall = recall_score(y_test, lr_predictions, average='macro')
lr_f1 = f1_score(y_test, lr_predictions, average='macro')

print("SVM Metrics:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1-Score:", svm_f1)

print("\nLogistic Regression Metrics:")
print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1-Score:", lr_f1)

SVM Metrics:
Accuracy: 0.16901408450704225
Precision: 0.013334879406307976
Recall: 0.041666666666666664
F1-Score: 0.019648023143683704

Logistic Regression Metrics:
Accuracy: 0.16901408450704225
Precision: 0.014520202020202022
Recall: 0.041666666666666664
F1-Score: 0.018820450885668277


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 4. Building a Text Recommendation System

### 4.4. TF-IDF (Term Frequency-Inverse Document Frequency)

In [127]:
# Function to generate recommended job descriptions based on given tags
def tags_recommend_job_descriptions(tags):
    # Combine tags into a single string for each job description
    combined_descriptions = [desc + " " + tag for desc, tag in zip(job_description_list, tags)]
    
    # Create a TF-IDF vectorizer and fit_transform the combined job descriptions and tags
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(combined_descriptions)
    
    # Compute cosine similarity between TF-IDF vectors of job descriptions and tags
    similarity_matrix = cosine_similarity(tfidf_matrix)
    
    # Find the most similar job description for each tag set
    recommended_jobs = []
    for i, tag in enumerate(tags):
        # Get the index of the most similar job description (excluding the tag itself)
        most_similar_index = np.argsort(similarity_matrix[i])[::-1][1]
        # Add the recommended job description to the list
        recommended_jobs.append(job_description_list[most_similar_index])
    
    return recommended_jobs

In [167]:
tags = str(input("Enter a tag: "))

# Example usage
recommended_jobs = tags_recommend_job_descriptions(tags)

for i in range(1): #enumerate(recommended_jobs)
    print(f"Recommended job description for tags '{tags}': {recommended_jobs[i]}")

Recommended job description for tags 'data science':


Sell what you love. For us and millions of users across the globe, thats Spotify. Join the Sales team and youll build the relationships that help grow our business in existing markets and beyond. We dont just sell creative solutions to our clients and partners, we help to shape them; using our expert knowledge of ad products, sales channels and the industry to impact the way the world experiences music and podcasts. 

The TIL product area Tooling, Integrations,  Labs forms the product and engineering resources for Spotifys Advertising Business Development team and is focused on developing and innovating on strategic partner integrations and partner enabled advertising products.

We are looking for a Full Stack Web Engineer to join our fast-moving, innovative team. You will be tackling challenging problems while collaborating across teams to develop innovative, reliable, secure, and scalable solutions. You will build and support web