In [None]:
!kaggle datasets download jacopoferretti/bbc-articles-dataset
!unzip bbc-articles-dataset.zip

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/bbc_text_cls.csv')

In [None]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
def preprocess_text(text):
    # 1. Lowercasing
    text = text.lower()

    # 2. Remove HTML Tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # 3. Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # 4. Remove emojis
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # Emoticons
                               "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
                               "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
                               "\U0001F700-\U0001F77F"  # Alchemical Symbols
                               "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               "\U0001FA00-\U0001FA6F"  # Chess Symbols
                               "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               "\U00002702-\U000027B0"  # Dingbats
                               "\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub('', text)

    # 5. Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)

    # 6. Tokenization
    tokens = word_tokenize(text)

    # 7. Remove stopwords and single character tokens
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 1]

    # 8. Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Joining tokens back into a sentence
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [None]:
df['text'] = df['text'].apply(preprocess_text)

# Using Counter

In [None]:
from collections import Counter
texts_lowercase_tokenized = df['text'].apply(word_tokenize)

token_counters = texts_lowercase_tokenized.apply(Counter).values.tolist()

In [None]:
print({token: n_occ for token, n_occ in token_counters[0].items() if n_occ >= 10})

{'profit': 10}


In [None]:
def get_scores(query_tokens, token_counters):
  scores = []
  for token_counter in token_counters:
    matches = [token_counter[query_token] for query_token in query_tokens]
    total_score = sum(matches)
    scores.append(total_score)
  return scores

In [None]:
import numpy as np

def show_best_results(df_articles, scores, top_n=10):
  best_indexes = np.argsort(scores)[::-1]
  for position, idx in enumerate(best_indexes[:top_n]):
    row = df_articles.iloc[idx]
    title = row["text"]
    score = scores[idx]
    print(f"{position + 1} [score = {score}]: {title}")

In [41]:
query = "japan narrowly escape recession japan economy "

query = preprocess_text(query)

query_tokens = word_tokenize(query)

scores = get_scores(query_tokens, token_counters)
# print(scores)
show_best_results(df, scores)

1 [score = 41]: japan ageing workforce built last twenty battled tuberculosis eight year went run clothing business marrying late thirty yearold torao toshitsune eaten raw fish pretty much every day throughout life mr toshitsune one japan centenarian club growing annually oldest member neat osaka detached house life one sexagenarian daughter mr toshitsune keep regular routine copying buddhist sutra preparing traditional japanese tea ceremony task remarkably active senior citizen reveals next goal well whats important japan number one mr toshitsune want outlive everyone come longevity japan country appears woman expect live men four year longer american european outskirt kyoto yearold yuji shimizu contemplates phenomenon round golf younger friend seventy think food industry environment improved remark average live longer whether diet traditional family structure role clearly defined something gene japan elderly remarkable life may game golf mr shimizu grandchild huge problem ahead japan

# Using Bag Of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer()

In [None]:
text = cv.fit_transform(df['text'])

In [37]:
query = "India"
query = preprocess_text(query)
query_bow = cv.transform([query])

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity
cosine_similarities = cosine_similarity(query_bow, text).flatten()

# Get the indices of the most relevant documents in descending order of similarity
relevant_docs_indices = cosine_similarities.argsort()[::-1]

In [39]:
relevant_docs_indices

array([  17,  129,    7, ..., 1482, 1483,    0])

In [40]:
for index in relevant_docs_indices[:10]:
    print(f"Document: {df['text'][index]}")
    print(f"Similarity Score: {cosine_similarities[index]}\n")

Document: india rupee hit fiveyear high india rupee hit fiveyear high standard poor sp raised country foreign currency rating rupee climbed per u dollar thursday close currency gained almost past three session sp rate borrower creditworthiness lifted india rating one notch bb indian asset seen le gamble cash expected flow market buoying rupee upgrade positive basically people use excuse come back india said bhanu baweja strategist ubs money moved india first two three week january market like korea thailand upgrade lead reversal india foreign currency rating one notch investment grade start bbb increase put level romania egypt el salvador one level russia
Similarity Score: 0.4472135954999579

Document: aviation firm eye booming india india defence minister opened country aero india air show invitation global aerospace firm outsource job nation pranab mukherjee said company could take advantage india highly skilled worker low wage civil military aerospace firm country attending show ana