#You need to implement a TF-IDF vectorizer to convert a collection of documents into TF-IDF vectors. You can use the sklearn’s inbuild dataset fetch_20newsgroups.

In [10]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
!pip install nltk
import nltk
nltk.download('stopwords')
!pip install scikit-learn
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def remove_stopwords(news_text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(news_text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)


In [12]:
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
news_texts = newsgroups.data


In [None]:
# Apply remove_stopwords function to each news reading
cleaned_news_texts = [remove_stopwords(news_text) for news_text in news_texts]
cleaned_news_texts

In [15]:
def remove_punctuation(text):
    punctuationfree = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return punctuationfree

cleaned_news_texts = [remove_punctuation(news_text) for news_text in cleaned_news_texts]
print(cleaned_news_texts[:5])  # Display the first 5 elements as an example


['sure bashers Pens fans pretty confused lack kind posts recent Pens massacre Devils  Actually  bit puzzled bit relieved  However  going put end nonPIttsburghers  relief bit praise Pens  Man  killing Devils worse thought  Jagr showed much better regular season stats  also lot fo fun watch playoffs  Bowman let JAgr lot fun next couple games since Pens going beat pulp Jersey anyway  disappointed see Islanders lose final regular season game  PENS RULE   ', 'brother market highperformance video card supports VESA local bus 12MB RAM  anyone suggestionsideas   Diamond Stealth Pro Local Bus  Orchid Farenheit 1280  ATI Graphics Ultra Pro  highperformance VLB card Please post email  Thank   Matt', 'Finally said dream  Mediterranean     new  area  greater  years  like  holocaust  numbers       Ist July USA      Sweden s April still cold  changed calendar                                                     NOTHING MENTIONED TRUE  LET SAY s TRUE  SHALL AZERI WOMEN CHILDREN GOING PAY PRICE         

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
tfidf= TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(cleaned_news_texts)
tfidf_matrix

<18846x164805 sparse matrix of type '<class 'numpy.float64'>'
	with 1345421 stored elements in Compressed Sparse Row format>

#Create a function to calculate the cosine similarity between two TF-IDF vectors.

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    similarity_matrix = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    cosine_similarity_value = similarity_matrix[0, 0]
    return cosine_similarity_value


In [25]:
document1 = "You can think of reshaping as first raveling the array (using the given index order), then inserting the elements from the raveled array into the new array using the same kind of index ordering as was used for the raveling."
document2 = "This document is an example document."

similarity = calculate_cosine_similarity(document1, document2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.0


#Q3. Implement a document similarity search function that takes a document as input and returns a list of documents ranked by their similarity to the input document.

In [26]:
def document_similarity_search(input_document, dataset):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([input_document] + dataset)
    similarity_scores = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:])
    ranked_documents = [(similarity, document) for similarity, document in zip(similarity_scores[0], dataset)]
    ranked_documents.sort(reverse=True, key=lambda x: x[0])

    return ranked_documents

In [27]:
input_document = "GMCH to cut red tape, buy robotic arm, high-tech infra"
result = document_similarity_search(input_document, cleaned_news_texts)

# Display the ranked documents
for similarity, document in result:
    print(f"Similarity: {similarity:.4f}, Document: {document}")


Similarity: 0.1777, Document: looking information infra red based position encoders  idea would bounce infrared source wall device would read distance  preferable would rs232 addressable  leads 
Similarity: 0.1613, Document: Acorn Replay running 25MHz ARM 3 processor  ARM 3 20  slower ARM 6  software  standard CDROM   16 bit colour resolution  computer 8 bit colour support  realtime dithering    3D0O supposed couple DSPs  ARM used housekeeping  25MHz ARM 6xx clock around 20 ARM MIPS  say 18 flat  Depends really surrounding system whether talking ARM6x ARM6xx  latter cache  essential run kind speed slower memory   ll stop saying things cos ll hopefully working ARM graduation  Mike PS nt pay heed reps Philips say  3D0O nt beat pants 3DI ll eat postscript 
Similarity: 0.1555, Document: got one Microsoft tech support 
Similarity: 0.1379, Document: always thought wanted send Police tape ransom demand  send CNN video tape see wanted buy  would place small magnet near takeup spool tape would 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




Similarity: 0.0000, Document: Giday netters  got used Newlife 25 accelerator  FPU  wondering points  Anyone know current driver version   Can handle 16bit grayscale card  get video option Why would hating hard drive   ca nt use accelerator hard drive time   need new driver drive  make  Thanks  Jason
Similarity: 0.0000, Document: 
Similarity: 0.0000, Document: sold 86 Sprint last April 95k  d driven since previous July  putting 20k miles  sensor light used light regularly  starting 5k miles bought  brother rebuilt engine used original equipment  suppose sensor could used replacement  Performance  hah  could call  change  Perhaps emissions increased  much emissions could CAregistered 3 cylinder engine produce  neat car  held engine block easily one hand  anyone ever driven Turbo  variant  curious 
Similarity: 0.0000, Document: stories report eve  Norm s farewell twin cities  earlier post  announced Norm Green given midnight last homegame North Stars cleanup belongings turn keys arena  h

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Similarity: 0.0000, Document: say   invader  nt  want  perhaps neural net design countries involved Lebanon nodes      Cognitive Neural Systems 
Similarity: 0.0000, Document: try answer Dorin s questions  even though addressed specifically  feel bit concerned thread since Southern Lebanese village often receiving end Israel s bombs  first place death three soldiers patrol occupied Lebanese terrritory act terrorism murder  disingeneous compare death athletes Munich act terrorism mrder  exercise aimed solely diverting issue far truth  seems  Dorin   remote ignorant problem ground comments charactrized irrelevant  heavily colored preconceptions misinformation  try paint accurate picture situation really South Lebanon   back home village last summer  information PEOPLE  bunch indiscriminate terrorists  people village regular inhabitants go daily business  work fields  small shops  others older men go coffe shop drink coffee  hard imagine      terrorist camp  Israelis like view villages sma

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Similarity: 0.0000, Document: Hi  anyone medical imaging  good ray tracing background  m interested field  Could point sources  better yet  experience  want talk whats going re working 
Similarity: 0.0000, Document: Well  say  without Morris  mention assumed replacement  alternative Morris letting Cito Gaston softtoss ball underhand opposition every 5 days  course Blue Jays would nt without Morris  alternative replacement level  think would ve close  yes  Morris might ve made difference  alternative Frank Viola  Blue Jays probably would easily Viola  Yes  make argument presence prevented team collapsing August  innings probably helpful  Well  think complete bs  happened time Morris fell behind  team came back rescued  Mostly  s lousy 1st inning pitcher  much better rest way  team climb 40  50 holes consistently win 65 Morris  credit  s team s credit  Furthermore  Morris exceed WL percentage would projected runs allowed run support  nt done previous years  fact  WL record 1991 lot worse