In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.decomposition import TruncatedSVD

# Fetch the Punkt tokenizer data used by nltk.word_tokenize in the
# preprocessing step; the call is a no-op when the package is already cached.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of 'punkt'
# -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hassn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the article dataset from a CSV located in the current working directory
# and keep a handle on its 'article' text column for the steps below.
df = pd.read_csv('cnn_dailymail_test.csv')
articles = df['article']


In [6]:
# preprocessing english articles
def preprocessing(text):
    """Normalise a piece of English text into a space-separated token string.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, the result is tokenised with
    ``nltk.word_tokenize`` and the tokens are joined back with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text, tokens separated by one space each.
    """
    lowered = text.lower()
    # Punctuation and any other non-alphanumeric character becomes whitespace.
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', lowered)
    return ' '.join(nltk.word_tokenize(alnum_only))


# Lift the pandas row limit so long Series/DataFrames are shown in full.
pd.set_option('display.max_rows', None)

# Clean every article with the preprocessing helper.
preprocessed_articles = articles.apply(preprocessing)

# Show the first article before and after cleaning as a sanity check.
first_raw_article = df['article'].iloc[0]
first_clean_article = preprocessed_articles.iloc[0]
print("Before Preprocessing \n", first_raw_article, "\n")
print("After Preprocessing \n", first_clean_article, "\n")

Before Preprocessing 
 Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee. 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues t

In [7]:
# extracting features from the text data using TF-IDF
# (max_features keeps only the 5000 most frequent terms of the corpus)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(preprocessed_articles)

# creating tagged documents and training a Doc2Vec model
# TaggedDocument.words must be a list of tokens. Each preprocessed article is
# a single space-joined string, so it is split back into words here; passing
# the string itself would make gensim iterate over its characters and train a
# character-level vocabulary that does not match the word tokens used when
# inferring a query vector in semantic_search.
# Tags are the positional index of each article, stored as strings.
documents = [
    TaggedDocument(words=doc.split(), tags=[str(i)])
    for i, doc in enumerate(preprocessed_articles)
]
doc2vec_model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, dm=1)

# performing Latent Semantic Analysis (LSA) on the TF-IDF vectors:
# project the sparse TF-IDF matrix onto 100 dense components
# (random_state fixed so the decomposition is reproducible)
lsa = TruncatedSVD(n_components=100, random_state=42)
lsa_features = lsa.fit_transform(tfidf_features)


In [8]:
# semantic search function
def semantic_search(query, method, n_results=5):
    """Rank the indexed articles against a free-text query.

    The query is first passed through ``preprocessing`` so that it is
    normalised exactly like the articles the models were fitted on, then it is
    embedded with the requested method and compared to every article by cosine
    similarity.

    Parameters
    ----------
    query : str
        Free-text search query.
    method : str
        One of ``"TF-IDF"``, ``"Doc2Vec"`` or ``"LSA"`` (case-sensitive).
    n_results : int, default 5
        Number of best matches to return.

    Returns
    -------
    numpy.ndarray or list of int
        Positional indices of the best-matching articles, most similar first.
        The TF-IDF and LSA methods return a NumPy array, Doc2Vec returns a
        list. An unrecognised ``method`` yields an empty list and emits a
        ``UserWarning``.
    """
    import warnings

    if method not in ("TF-IDF", "Doc2Vec", "LSA"):
        # Keep the historical "empty result" behaviour, but do not let a
        # misspelled method name pass unnoticed.
        warnings.warn(
            f"Unknown search method {method!r}; expected 'TF-IDF', 'Doc2Vec' or 'LSA'.",
            stacklevel=2,
        )
        return []

    # Same cleaning as the corpus, whatever the method.
    clean_query = preprocessing(query)

    if method == "TF-IDF":
        query_vector = tfidf_vectorizer.transform([clean_query])
        cosine_similarities = cosine_similarity(query_vector, tfidf_features)
        # argsort is ascending: take the last n_results entries, reversed.
        return cosine_similarities[0].argsort()[:-n_results - 1:-1]

    if method == "Doc2Vec":
        query_vector = doc2vec_model.infer_vector(clean_query.split())
        # One batched call against all document vectors instead of one
        # cosine_similarity call per document.
        scores = cosine_similarity([query_vector], doc2vec_model.dv.vectors)[0]
        # Stable sort on the negated scores: descending similarity, ties kept
        # in document order.
        return (-scores).argsort(kind="stable")[:n_results].tolist()

    # method == "LSA": project the TF-IDF query vector into the LSA space.
    query_vector = tfidf_vectorizer.transform([clean_query])
    query_lsa = lsa.transform(query_vector)
    cosine_similarities = cosine_similarity(query_lsa, lsa_features)
    return cosine_similarities[0].argsort()[:-n_results - 1:-1]


In [9]:
# Remove pandas' truncation limits (columns, cell width, rows) so that the
# article text printed below is shown in full.
for _display_option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(_display_option, None)


In [11]:
query1 = "Football news"

# Rank the corpus against the query once per retrieval method.
similar_article_indices_tfidf = semantic_search(query1, "TF-IDF", n_results=5)
similar_article_indices_doc2vec = semantic_search(query1, "Doc2Vec", n_results=5)
similar_article_indices_lsa = semantic_search(query1, "LSA", n_results=5)

# Pull the matching article texts by position.
similar_articles1 = df['article'].iloc[similar_article_indices_tfidf]
similar_articles2 = df['article'].iloc[similar_article_indices_doc2vec]
similar_articles3 = df['article'].iloc[similar_article_indices_lsa]

# Report indices followed by the article texts, method by method.
print("TF-IDF Results", similar_article_indices_tfidf)
print(similar_articles1)
print("Doc2Vec Results", similar_article_indices_doc2vec)
print(similar_articles2)
print("LSA Results", similar_article_indices_lsa)
print(similar_articles3)

TF-IDF Results [3634 2577 7709 6107 9630]
3634                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          