## 2. Embedding the Data
Route 2: Embeddings using Python Libraries

### 2.1. Word2Vec with Python using Gensim

In [1]:
from gensim.models import Word2Vec

In [2]:
# Preprocessed text data
sentences = [["This", "is", "the", "first", "sentence"],
             ["Here", "is", "another", "sentence"],
             ["And", "a", "third", "sentence", "for", "embedding"]]

In [3]:
# Train the Word2Vec model
model = Word2Vec(sentences, min_count=1)

In [4]:
# Get the word vector for a specific word
word_vector = model.wv['sentence']
print(word_vector)

[-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-03
 -1.5080082e-03  2.46979

### 2.2. Using GloVe with Python 1

In [5]:
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

In [6]:
# Convert GloVe embeddings to Word2Vec format
glove_file = "path/to/glove.6B.100d.txt"
word2vec_file = "path/to/glove.6B.100d.word2vec.txt"

glove2word2vec(glove_file, word2vec_file)

  glove2word2vec(glove_file, word2vec_file)


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/glove.6B.100d.txt'

In [None]:
# Load the GloVe embeddings in Word2Vec format
model = KeyedVectors.load_word2vec_format(word2vec_file)

In [None]:
# Get the word vector for a specific word
word_vector = model['sentence']
print(word_vector)

### 2.3. Deciphering BERT

In [7]:
from transformers import BertTokenizer, BertModel

In [8]:
# Preprocessed text data
sentences = ["This is the first sentence",
             "Here is another sentence",
             "And a third sentence for embedding"]

In [9]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
# Tokenize the sentences
tokenized_inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

In [11]:
# Load the pre-trained BERT model
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Get the BERT embeddings
outputs = model(**tokenized_inputs)
embeddings = outputs.pooler_output

In [13]:
# Print the embeddings
print(embeddings)

tensor([[-0.8580, -0.2532,  0.5253,  ...,  0.1378, -0.5749,  0.8462],
        [-0.8722, -0.3590,  0.1652,  ...,  0.0631, -0.6483,  0.9079],
        [-0.6885, -0.2533, -0.5861,  ..., -0.6987, -0.6173,  0.5979]],
       grad_fn=<TanhBackward0>)


### 2.4. Fine Tuning BERT for Classification

In [14]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.optim import Adam

In [15]:
# Preprocessed text data and labels
sentences = ["This is the first sentence",
             "Here is another sentence",
             "And a third sentence for embedding"]

labels = [0, 1, 0]

In [16]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
# Tokenize the sentences and convert them to input tensors
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

In [18]:
# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [19]:
# Set the model to training mode
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [20]:
# Convert the labels to a tensor
labels_tensor = torch.tensor(labels)

In [21]:
# Forward pass through the model
outputs = model(**inputs, labels=labels_tensor)

In [22]:
# Compute the loss
loss = outputs.loss

In [23]:
# Perform backpropagation and update the model's parameters
##loss.backward()

# Perform backpropagation
loss.backward(retain_graph=True)  # Specify retain_graph=True

In [24]:
# Define the optimizer
optimizer = Adam(model.parameters(), lr=1e-5)

In [25]:
# Update the model's parameters
optimizer.step()

## 3. Building a Text Classification System

In [26]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np

In [27]:
# Preprocessed embedded data
X = np.array([[0.1, 0.2, 0.3],
              [0.4, 0.5, 0.6],
              [0.7, 0.8, 0.9]])

In [28]:
# Classification labels
y = np.array([0, 1, 0])

In [29]:
# Preprocessing the data (if needed)


In [30]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Training the SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)

In [32]:
# Training the Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [33]:
# Evaluating the models
svm_predictions = svm_model.predict(X_test)
lr_predictions = lr_model.predict(X_test)

svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions)
svm_recall = recall_score(y_test, svm_predictions)
svm_f1 = f1_score(y_test, svm_predictions)

lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions)
lr_recall = recall_score(y_test, lr_predictions)
lr_f1 = f1_score(y_test, lr_predictions)

print("SVM Metrics:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1-Score:", svm_f1)

print("\nLogistic Regression Metrics:")
print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1-Score:", lr_f1)

SVM Metrics:
Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1-Score: 0.0

Logistic Regression Metrics:
Accuracy: 0.0
Precision: 0.0
Recall: 0.0
F1-Score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 4. Building a Text Recommendation System

### 4.1. Cosine Similarity

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# List of job descriptions
job_descriptions = [
    "Software Engineer with experience in Python and web development",
    "Data Scientist with expertise in machine learning and data analysis",
    "Frontend Developer proficient in HTML, CSS, and JavaScript",
    "UX/UI Designer specializing in user-centered design and prototyping"
]

In [36]:
# Convert job descriptions into numerical representations using TF-IDF
vectorizer = TfidfVectorizer()
job_embeddings = vectorizer.fit_transform(job_descriptions)

In [37]:
# Calculate cosine similarity between job descriptions
similarity_matrix = cosine_similarity(job_embeddings)

In [38]:
# Function to recommend similar jobs based on a given job description
def recommend_jobs_cosine(job_description, num_recommendations):
    try:
        # Find the index of the given job description
        job_index = job_descriptions.index(job_description)
        
        # Get the similarity scores for the given job description
        similarity_scores = similarity_matrix[job_index]
        
        # Sort the similarity scores in descending order
        sorted_indices = similarity_scores.argsort()[::-1]
        
        # Recommend the top N similar jobs
        recommended_jobs = [job_descriptions[i] for i in sorted_indices[1:num_recommendations+1]]
        
        return recommended_jobs
    except ValueError:
        print("Input job description is not found in the list.")
        return []

In [39]:
# Test the recommendation system using cosine similarity
input_job_cosine = "Software Engineer with experience in Python"
recommendations_cosine = recommend_jobs_cosine(input_job_cosine, num_recommendations=2)

print("Cosine Similarity - Job Recommendations for '{}'".format(input_job_cosine))

for i, job in enumerate(recommendations_cosine):
    print("Recommendation {}: {}".format(i+1, job))

Input job description is not found in the list.
Cosine Similarity - Job Recommendations for 'Software Engineer with experience in Python'


### 4.2. Euclidean Distance

In [40]:
from sklearn.metrics.pairwise import euclidean_distances

In [41]:
# Calculate Euclidean distance between job descriptions
distance_matrix = euclidean_distances(job_embeddings)

In [42]:
# Function to recommend similar jobs based on a given job description
def recommend_jobs_euclidean(job_description, num_recommendations):
    try:
        # Find the index of the given job description
        job_index = job_descriptions.index(job_description)
        
        # Get the distance scores for the given job description
        distance_scores = distance_matrix[job_index]
        
        # Sort the distance scores in ascending order
        sorted_indices = distance_scores.argsort()
        
        # Recommend the top N similar jobs
        recommended_jobs = [job_descriptions[i] for i in sorted_indices[1:num_recommendations+1]]
        
        return recommended_jobs
    except ValueError:
        print("Input job description is not found in the list.")
        return []
    

In [43]:
# Test the recommendation system using Euclidean distance
input_job_euclidean = "Software Engineer with experience in Python"
recommendations_euclidean = recommend_jobs_euclidean(input_job_euclidean, num_recommendations=2)

print("Euclidean Distance - Job Recommendations for '{}'".format(input_job_euclidean))

for i, job in enumerate(recommendations_euclidean):
    print("Recommendation {}: {}".format(i+1, job))

Input job description is not found in the list.
Euclidean Distance - Job Recommendations for 'Software Engineer with experience in Python'


### 4.3. Jaccard Similarity

In [44]:
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
# List of job descriptions
job_descriptions = [
    "Software Engineer with experience in Python and web development",
    "Data Scientist with expertise in machine learning and data analysis",
    "Frontend Developer proficient in HTML, CSS, and JavaScript",
    "UX/UI Designer specializing in user-centered design and prototyping"
]

In [46]:
# Function to preprocess job descriptions
def preprocess_job_description(job_description):
    return " ".join(job_description.lower().split())

In [47]:
# Preprocess job descriptions
job_descriptions = [preprocess_job_description(desc) for desc in job_descriptions]

In [48]:
# Convert job descriptions into numerical representations using TF-IDF
vectorizer = TfidfVectorizer()
job_embeddings = vectorizer.fit_transform(job_descriptions)

In [49]:
# Calculate Jaccard similarity between job descriptions
similarity_matrix_jaccard = []
for i in range(len(job_descriptions)):
    similarity_scores = []
    for j in range(len(job_descriptions)):
        if i != j:  # Exclude self-comparison
            set1 = set(job_descriptions[i].split())
            set2 = set(job_descriptions[j].split())
            common_elements = set1.intersection(set2)
            union_elements = set1.union(set2)
            similarity_scores.append(len(common_elements) / len(union_elements))
    
    similarity_matrix_jaccard.append(similarity_scores)

In [50]:
# Function to recommend similar jobs based on a given job description
def recommend_jobs_jaccard(job_description, num_recommendations):
    # Preprocess the input job description
    job_description = preprocess_job_description(job_description)
    
    # Get the similarity scores for the given job description
    similarity_scores = []
    for j in range(len(job_descriptions)):
        set1 = set(job_description.split())
        set2 = set(job_descriptions[j].split())
        common_elements = set1.intersection(set2)
        union_elements = set1.union(set2)
        similarity_scores.append(len(common_elements) / len(union_elements))
    
    # Sort the similarity scores in descending order
    sorted_indices = sorted(range(len(similarity_scores)), key=lambda k: similarity_scores[k], reverse=True)
    
    # Recommend the top N similar jobs
    recommended_jobs = [job_descriptions[i] for i in sorted_indices[:num_recommendations]]
    
    return recommended_jobs

In [51]:
# Test the recommendation system using Jaccard similarity
input_job_jaccard = "Software Engineer with experience in Python"
recommendations_jaccard = recommend_jobs_jaccard(input_job_jaccard, num_recommendations=2)

print("Jaccard Similarity - Job Recommendations for '{}'".format(input_job_jaccard))

for i, job in enumerate(recommendations_jaccard):
    print("Recommendation {}: {}".format(i+1, job))

Jaccard Similarity - Job Recommendations for 'Software Engineer with experience in Python'
Recommendation 1: software engineer with experience in python and web development
Recommendation 2: data scientist with expertise in machine learning and data analysis


### 4.4. TF-IDF (Term Frequency-Inverse Document Frequency)

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert job descriptions into numerical representations using TF-IDF
vectorizer_tfidf = TfidfVectorizer()
job_embeddings_tfidf = vectorizer_tfidf.fit_transform(job_descriptions)

In [53]:
# Calculate cosine similarity between job descriptions using TF-IDF
similarity_matrix_tfidf = cosine_similarity(job_embeddings_tfidf)

In [54]:
# Function to recommend similar jobs based on a given job description
def recommend_jobs_tfidf(job_description, num_recommendations):
    try:
        # Find the index of the given job description
        job_index = job_descriptions.index(job_description)
        
        # Get the similarity scores for the given job description
        similarity_scores = similarity_matrix_tfidf[job_index]
        
        # Sort the similarity scores in descending order
        sorted_indices = similarity_scores.argsort()[::-1]
        
        # Recommend the top N similar jobs
        recommended_jobs = [job_descriptions[i] for i in sorted_indices[1:num_recommendations+1]]
        
        return recommended_jobs
    
    except ValueError:
        print("Input job description is not found in the list.")
        return []

In [55]:
# Test the recommendation system using TF-IDF
input_job_tfidf = "Software Engineer with experience in Python"
recommendations_tfidf = recommend_jobs_tfidf(input_job_tfidf, num_recommendations=2)

print("TF-IDF - Job Recommendations for '{}'".format(input_job_tfidf))

for i, job in enumerate(recommendations_tfidf):
    print("Recommendation {}: {}".format(i+1, job))

Input job description is not found in the list.
TF-IDF - Job Recommendations for 'Software Engineer with experience in Python'
