In [71]:
import pandas as pd

# Sample corpus as (title, text) pairs; swap this for a CSV read when using real data.
articles = [
    ("Python Basics", "Python is a programming language."),
    ("Machine Learning Intro", "Machine learning allows computers to learn from data."),
    ("Data Science Overview", "Data science involves analyzing and interpreting data."),
    ("Deep Learning Guide", "Deep learning uses neural networks to model complex patterns."),
    ("Web Development 101", "Web development includes building websites and web apps."),
]

# Column-oriented view of the same records: one list per column.
data = {
    "title": [title for title, _ in articles],
    "text": [text for _, text in articles],
}

# One row per article, with columns "title" and "text".
df = pd.DataFrame(data)
df


Unnamed: 0,title,text
0,Python Basics,Python is a programming language.
1,Machine Learning Intro,Machine learning allows computers to learn fro...
2,Data Science Overview,Data science involves analyzing and interpreti...
3,Deep Learning Guide,Deep learning uses neural networks to model co...
4,Web Development 101,Web development includes building websites and...


In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert each article's text into a TF-IDF weighted vector.
# English stop words are dropped before the vocabulary is built.
vectorizer = TfidfVectorizer(stop_words="english")
corpus = df["text"]
all_words = vectorizer.fit_transform(corpus)

# The learned vocabulary: one entry per column of `all_words`.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['allows' 'analyzing' 'apps' 'building' 'complex' 'computers' 'data'
 'deep' 'development' 'includes' 'interpreting' 'involves' 'language'
 'learn' 'learning' 'machine' 'model' 'networks' 'neural' 'patterns'
 'programming' 'python' 'science' 'uses' 'web' 'websites']


In [73]:
# The TF-IDF matrix is sparse, so printing it lists only the non-zero entries
# as "(row, column)  value" pairs: row = article index, column = vocabulary index.
print(all_words)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 28 stored elements and shape (5, 26)>
  Coords	Values
  (0, 21)	0.5773502691896258
  (0, 20)	0.5773502691896258
  (0, 12)	0.5773502691896258
  (1, 15)	0.43429718303084847
  (1, 14)	0.3503882327118585
  (1, 0)	0.43429718303084847
  (1, 5)	0.43429718303084847
  (1, 13)	0.43429718303084847
  (1, 6)	0.3503882327118585
  (2, 6)	0.6279137616509933
  (2, 22)	0.38914146140230915
  (2, 11)	0.38914146140230915
  (2, 1)	0.38914146140230915
  (2, 10)	0.38914146140230915
  (3, 14)	0.2916794154657719
  (3, 7)	0.36152911730069653
  (3, 23)	0.36152911730069653
  (3, 18)	0.36152911730069653
  (3, 17)	0.36152911730069653
  (3, 16)	0.36152911730069653
  (3, 4)	0.36152911730069653
  (3, 19)	0.36152911730069653
  (4, 24)	0.6666666666666666
  (4, 8)	0.3333333333333333
  (4, 9)	0.3333333333333333
  (4, 3)	0.3333333333333333
  (4, 25)	0.3333333333333333
  (4, 2)	0.3333333333333333


In [74]:
# Expand the sparse TF-IDF matrix into a dense array:
# one row per article, one column per vocabulary term (zeros included).
documents = all_words.toarray()

In [75]:
# Dense TF-IDF vector of article 0. Only three columns are non-zero:
# 12 ('language'), 20 ('programming') and 21 ('python').
print(documents[0])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.57735027 0.         0.         0.         0.         0.
 0.         0.         0.57735027 0.57735027 0.         0.
 0.         0.        ]


In [76]:
# Document 0: "Python is a programming language."
# Tokens left after stop-word removal: 'python', 'programming', 'language'

# Worked example of how the TF-IDF vector of document 0 is obtained.
# The string is reference material only; nothing else in the notebook reads it.
Explanation = """
Word counts:
python- 1
programming- 1
language- 1
Total words (after ignoring stop words): 3

  TF(python) = word/total = 1/3
  TF(programming) = word/total = 1/3
  TF(language) = word/total = 1/3

IDF(word)=log( N/ {1+Number of documents containing word} )
        where N = "Total No of Documents" and log is the natural logarithm
IDF(python) = log(5/(1+1))
            = log(2.5)
            ≈ 0.9163
IDF(programming) ≈ 0.9163
IDF(language) ≈ 0.9163

TF-IDF = TF * IDF
  TF-IDF(python) = 1/3 * 0.9163 ≈ 0.3054
  TF-IDF(programming) = 1/3 * 0.9163 ≈ 0.3054
  TF-IDF(language) = 1/3 * 0.9163 ≈ 0.3054

# Normalize the scores using L2 norm
TF-IDF(python) = 0.3054/sqrt{ (0.3054^2+0.3054^2+0.3054^2) }
               = 0.3054/0.529
               ≈ 0.577
TF-IDF(programming) ≈ 0.577
TF-IDF(language) ≈ 0.577

# NOTE: the formulas above are the textbook version.
scikit-learn's TfidfVectorizer (default settings) uses:
  TF(word)  = raw count of the word in the document (not divided by the total)
  IDF(word) = log( (1+N) / (1+Number of documents containing word) ) + 1
For document 0:
  IDF(python) = log(6/2) + 1 ≈ 2.0986
  TF-IDF(python) = 1 * 2.0986 = 2.0986
  After L2 norm: 2.0986/sqrt{ (2.0986^2+2.0986^2+2.0986^2) } = 1/sqrt(3) ≈ 0.577
Both versions give 0.577 here because the three words have equal weights,
so the L2 normalization cancels the difference.

# NOTE: TF_IDF = -ve means that the term is very common across the corpus
— it appears in most documents.
This means the word is not helpful for distinguishing between documents.
Some implementations clip negative IDF values to zero to avoid negative weights.
With the textbook formula this happens when (1 + documents containing the word) > N.
scikit-learn's smoothed IDF is always >= 1, so it never produces negative weights.

"""

In [77]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of article vectors:
# similarity_matrix[i][j] compares article i with article j.
similarity_matrix = cosine_similarity(documents)

In [78]:
print(similarity_matrix)
# 5x5 matrix for the 5 articles: entry [i][j] is the similarity of article i to article j.
# The diagonal is 1 (each article compared with itself) and the matrix is symmetric.

[[1.         0.         0.         0.         0.        ]
 [0.         1.         0.22001359 0.10220103 0.        ]
 [0.         0.22001359 1.         0.         0.        ]
 [0.         0.10220103 0.         1.         0.        ]
 [0.         0.         0.         0.         1.        ]]


In [79]:
# Vectors of articles 0 and 1, printed one after the other. They have no non-zero
# column in common, which is why their cosine similarity is 0.
print(documents[0], '\n\n', documents[1])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.57735027 0.         0.         0.         0.         0.
 0.         0.         0.57735027 0.57735027 0.         0.
 0.         0.        ] 

 [0.43429718 0.         0.         0.         0.         0.43429718
 0.35038823 0.         0.         0.         0.         0.
 0.         0.43429718 0.35038823 0.43429718 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]


In [80]:
# Notes on how to read the cosine-similarity matrix.
# The string is reference material only; nothing else in the notebook reads it.
Explanation_for_Similarity_Matrix = """
documents[0] vector = A = [a1,a2,a3,...,an]
documents[1] vector = B = [b1,b2,b3,...,bn]
documents[2] vector = C = [c1,c2,c3,...,cn]

cosθ(0,1) = A⋅B / {∥A∥.∥B∥}
     = 0            (documents 0 and 1 share no words, so A⋅B = 0)
cosθ(1,2) = B⋅C / {∥B∥.∥C∥}
     = 0.22001359   (documents 1 and 2 share the word 'data')

# Values range from 0 to 1 here, because TF-IDF weights are never negative
# (for arbitrary vectors, cosine similarity ranges from -1 to 1)

# Just for remembrance purposes, think of it like a correlation matrix
"""

In [81]:
# Pick an article index
# The upper bound is taken from the DataFrame so the prompt and the range check
# cannot drift apart when articles are added or removed. With the 5 sample
# articles the prompt text is unchanged.
last_document_id = len(df) - 1
document_id = int(input(f"Enter the document index from 0 to {last_document_id} : "))

# Reject out-of-range values here with a clear message. A negative index would
# still select a row of the NumPy similarity matrix (counting from the end) but
# is not a row label of df, so later cells would fail with an unrelated error.
if not 0 <= document_id <= last_document_id:
    raise ValueError(
        f"document index must be between 0 and {last_document_id}, got {document_id}"
    )

Enter the document index from 0 to 4 : 1


In [82]:
# Show the chosen article.
# df.loc[row_label] looks the row up by its index label; the fields are then read by column name.
selected_article = df.loc[document_id]
print("You selected:", selected_article["title"])
print("Article text:", selected_article["text"])

You selected: Machine Learning Intro
Article text: Machine learning allows computers to learn from data.


In [83]:
# Row of the similarity matrix for the selected article: its similarity
# to every article, including itself.
print(similarity_matrix[document_id])

[0.         1.         0.22001359 0.10220103 0.        ]


In [84]:
# Pair every article index with its similarity to the selected article,
# giving a list of (article_index, similarity) tuples.
selected_similarities = similarity_matrix[document_id]
scores = [
    (article_idx, similarity)
    for article_idx, similarity in enumerate(selected_similarities)
]
print(scores)

[(0, np.float64(0.0)), (1, np.float64(1.0000000000000002)), (2, np.float64(0.22001359324034672)), (3, np.float64(0.10220103490347976)), (4, np.float64(0.0))]


In [85]:
# Rank the (index, similarity) pairs from most to least similar.
# A copy is sorted so that `scores` keeps its original order.
sorted_scores = list(scores)
sorted_scores.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_scores)

# The key function picks the second element of each tuple (the similarity).
# Without a key, tuples are compared element by element, so the article index would decide the order.

[(1, np.float64(1.0000000000000002)), (2, np.float64(0.22001359324034672)), (3, np.float64(0.10220103490347976)), (0, np.float64(0.0)), (4, np.float64(0.0))]


In [86]:
# Top 2 most similar articles, excluding the selected article itself.
# The article is excluded by its index rather than by dropping position 0:
# the selected article is not guaranteed to be ranked first (another article can
# tie with it, and an article with an all-zero vector has self-similarity 0).
recommended_indices = [i for i, _ in sorted_scores if i != document_id][:2]

In [87]:
# List the titles of the recommended articles, best match first.
print("Recommended articles:")
# The loop variable is `article_id`, not `id`: a loop variable at notebook scope
# stays bound after the loop and would replace the built-in id() for later cells.
for article_id in recommended_indices:
    print(df.loc[article_id, "title"])

Recommended articles:
Data Science Overview
Deep Learning Guide
