In [2]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.8.3-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.11-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.10-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.3-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.0-cp312-cp312-win_amd64

In [5]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 3.4 MB/s eta 0:00:04
     --- ------------------------------------ 1.0/12.8 MB 2.6 MB/s eta 0:00:05
     ---- ----------------------------------- 1.3/12.8 MB 2.5 MB/s eta 0:00:05
     ------ --------------------------------- 2.1/12.8 MB 2.5 MB/s eta 0:00:05
     -------- ------------------------------- 2.6/12.8 MB 2.5 MB/s eta 0:00:05
     --------- ------------------------------ 2.9/12.8 MB 2.4 MB/s eta 0:00:05
     ---------- ----------------------------- 3.4/12.8 MB 2.4 MB/s eta 0:00:04
     ------------- -------------------------- 4.2/12.8 MB 2.5 MB/s eta 0:00:04
     -------------- ------------------------- 4.7/12.8 MB 2.5 MB/s eta 0:00:04
     --------------- --------------------

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Instantiate the English spaCy pipeline by its package name
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [9]:
# Load the article data. The CSV location defaults to the original local path
# but can be overridden with the WIKIHOW_CSV environment variable, so the
# notebook can run on another machine without editing this cell.
import os
from pathlib import Path

DATA_PATH = Path(os.environ.get("WIKIHOW_CSV", r"C:\Users\dell\Desktop\MyDocs\Docs\MK\Wikihow Mockup.csv"))
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Article ID,Title,Summary,Tags
0,1,How to Make a Cup of Coffee,Learn how to brew the perfect cup of coffee wi...,"['coffee', 'beverages', 'kitchen', 'how-to']"
1,2,How to Tie a Tie,A step-by-step guide to tie various types of t...,"['fashion', 'clothing', 'how-to']"
2,3,How to Improve Your Memory,Techniques to enhance your memory and recall s...,"['memory', 'brain', 'health', 'how-to']"
3,4,How to Start a Blog,Learn the steps to create and launch a success...,"['blogging', 'website', 'how-to', 'digital']"
4,5,How to Grow Tomatoes,Everything you need to know about growing toma...,"['gardening', 'plants', 'food', 'how-to']"


In [10]:
# Function to clean and preprocess text using spacy
def preprocess_text(text):
    """Lemmatize ``text`` and drop stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw text to normalise. ``None`` and float NaN (how pandas represents
        a missing cell) are treated as empty text.

    Returns
    -------
    str
        The lemmas of the remaining tokens joined by single spaces, or an
        empty string when ``text`` is missing.
    """
    # A missing CSV cell arrives here as float NaN when this function is applied
    # over a column; map missing values to an empty document instead of passing
    # a non-text value to the spaCy pipeline. (NaN is the only float for which
    # `x != x` is true.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    doc = nlp(text)
    return ' '.join(token.lemma_ for token in doc if not (token.is_stop or token.is_punct))

In [11]:
# Run every summary through preprocess_text and keep the cleaned text in its
# own column next to the original
processed_summaries = df['Summary'].apply(preprocess_text)
df['Processed Summary'] = processed_summaries
df['Processed Summary'].head()

0    learn brew perfect cup coffee french press dri...
1    step step guide tie type tie include Windsor h...
2    technique enhance memory recall skill memory e...
3    learn step create launch successful blog inclu...
4      need know grow tomato select seed harvest fruit
Name: Processed Summary, dtype: object

In [12]:
# Create the TF-IDF vectorizer, configured with scikit-learn's built-in
# English stop-word list
vectorizer = TfidfVectorizer(stop_words="english")

In [13]:
# Fit the vectorizer on the processed summaries and encode each of them as a
# TF-IDF row vector in one step
processed_corpus = df['Processed Summary']
tfidf_matrix = vectorizer.fit_transform(processed_corpus)
tfidf_matrix.shape

(100, 408)

In [15]:
# Function to perform semantic search and return the most similar article based on a query
def semantic_search(query):
    """Return the article whose TF-IDF summary vector is closest to ``query``.

    The query is cleaned with ``preprocess_text`` and transformed by the fitted
    ``vectorizer`` before being compared against every row of ``tfidf_matrix``.

    Parameters
    ----------
    query : str
        Free-text search query.

    Returns
    -------
    tuple
        ``(title, summary, tags, score)`` taken from the best-matching row of
        ``df``; ``score`` is that row's cosine similarity to the query.
    """
    # Preprocess the query the same way as the article summaries
    query_processed = preprocess_text(query)
    # Transform the query to the same vector space as the articles
    query_vector = vectorizer.transform([query_processed])
    # Result has one row (the single query) and one column per article
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix)
    most_similar_idx = cosine_similarities.argmax()
    # Read the winning score directly; argmax already located the maximum, so
    # there is no need to sort every score to find it
    top_result = cosine_similarities[0, most_similar_idx]
    # Return the most similar article's data
    result = df.iloc[most_similar_idx]
    return result['Title'], result['Summary'], result['Tags'], top_result

In [16]:
query = "How to make a pizza from scratch"
# Keep the score under its own name: unpacking into `cosine_similarity` would
# rebind the scikit-learn function of that name, which semantic_search calls,
# so every later search would fail with "object is not callable".
title, summary, tags, similarity_score = semantic_search(query)
print(f"Title: {title}\nSummary: {summary}\nTags: {tags}\nCosine Similarity: {similarity_score}")

Title: How to Make a Pizza Dough
Summary: Learn how to make homemade pizza dough from scratch for delicious pizzas.
Tags: ['baking', 'food', 'how-to', 'Italian cuisine']
Cosine Similarity: 0.7629084502312261


In [17]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub>=0.20.0 (from sentence_transformers)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading safetensors-0.5.0-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
Downloading huggingface_hub-0.27.0-py3-none-any.whl (450 kB)
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
   ---------------------------------------- 0.0/10.1 MB ? eta -:--:--
   -- ------------------------------

In [23]:
!pip install sentence-transformers



In [28]:
!pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.7 MB ? eta -:--:--
   ------------ --------------------------- 0.5/1.7 MB 1.7 MB/s eta 0:00:01
   ------------------ --------------------- 0.8/1.7 MB 1.5 MB/s eta 0:00:01
   ------------------------ --------------- 1.0/1.7 MB 1.4 MB/s eta 0:00:01
   ------------------------------ --------- 1.3/1.7 MB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 1.3 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.18.0


In [29]:
from sentence_transformers import SentenceTransformer




In [30]:
# Instantiate the SBERT sentence encoder from its model name
SBERT_MODEL_NAME = 'all-MiniLM-L6-v2'
sbert = SentenceTransformer(SBERT_MODEL_NAME)

In [31]:
# Combine multiple columns into a single string for embedding
def combine_columns(row):
    """Join a row's ``Title`` and ``Summary`` values with a single space.

    ``row`` is any object indexable by the keys ``'Title'`` and ``'Summary'``
    (for example one row of a DataFrame). Returns the combined string.
    """
    return f"{row['Title']} {row['Summary']}"

In [32]:
# Build the combined text for every row with combine_columns and store it in
# its own column
combined_text = df.apply(combine_columns, axis=1)
df['Combined'] = combined_text
df['Combined'].head()

0    How to Make a Cup of Coffee Learn how to brew ...
1    How to Tie a Tie A step-by-step guide to tie v...
2    How to Improve Your Memory Techniques to enhan...
3    How to Start a Blog Learn the steps to create ...
4    How to Grow Tomatoes Everything you need to kn...
Name: Combined, dtype: object

In [33]:
# Embed the combined title + summary text of every article.
# encode() is documented for a string or a list of strings, so hand it a plain
# list: passing the Series itself relies on integer label lookups matching row
# positions, which only holds while df keeps its default 0..n-1 index.
article_embeddings = sbert.encode(df['Combined'].tolist(), convert_to_numpy=True)
# list() splits the 2-D result along its first axis: one 1-D vector per row
df['Embedding'] = list(article_embeddings)
df['Embedding'].head()

0    [-0.013924233, -0.045962542, 0.034426745, 0.01...
1    [0.036684852, -0.028378056, -0.013170101, 0.00...
2    [0.07461934, -0.025543148, -0.022907995, 0.034...
3    [0.05341247, -0.13683234, -0.039519165, 0.0388...
4    [-0.012920145, 0.024291642, -0.039193686, 0.03...
Name: Embedding, dtype: object

In [38]:
# Function to perform semantic search and return the most similar article based on a query
def semantic_search(query):
    """Return the article whose SBERT embedding is most similar to ``query``.

    Parameters
    ----------
    query : str
        Free-text search query; it is encoded with the ``sbert`` model.

    Returns
    -------
    tuple
        ``(title, summary, tags, score)`` taken from the best-matching row of
        ``df``; ``score`` is the cosine similarity as a Python float.
    """
    # Imported here so the function does not depend on a later cell having run
    from sentence_transformers import util

    # Embedding for the query
    query_embedding = sbert.encode(query, convert_to_numpy=True)

    # Stack the per-row vectors into a single 2-D array (one row per article)
    # rather than handing cos_sim a pandas Series of separate arrays
    article_embeddings = np.stack(df['Embedding'].to_numpy())

    # cos_sim returns a (1, n_articles) matrix for a single query; take its
    # only row. Unlike squeeze(), this stays 1-D even when df has one article.
    cosine_scores = util.cos_sim(query_embedding, article_embeddings)[0]

    # Get the index of the most similar article
    most_similar_idx = int(cosine_scores.argmax())

    # Return the most similar article's data
    result = df.iloc[most_similar_idx]
    return result['Title'], result['Summary'], result['Tags'], cosine_scores[most_similar_idx].item()

In [41]:
# Suppressing warnings
# NOTE(review): simplefilter("ignore") silences every warning category from
# this point on, not only the one that prompted it — consider narrowing the
# filter once the offending warning is identified.
import warnings
warnings.simplefilter("ignore")
# `util` provides the cos_sim helper that semantic_search calls
from sentence_transformers import util

In [42]:
query = "How to make a pizza from scratch"
# Keep the score under its own name: unpacking into `cosine_similarity` would
# rebind the function of that name imported from scikit-learn, breaking any
# code that calls it afterwards.
title, summary, tags, similarity_score = semantic_search(query)
print(f"Title: {title}\nSummary: {summary}\nTags: {tags}\nCosine Similarity: {similarity_score}")

Title: How to Make Pizza
Summary: Detailed instructions on making pizza from scratch, including the dough and topping varieties.
Tags: ['cooking', 'food', 'how-to', 'Italian cuisine']
Cosine Similarity: 0.8074661493301392
