In [1]:
pip install pandas nltk spacy gensim scikit-learn




In [2]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer


In [5]:
# Assuming BBC_DATA.csv is in the same directory
dataset = pd.read_csv('BBC_DATA.csv')


In [30]:
dataset.head()


Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
sample_article = dataset['Text'].iloc[0]

# Tokenization
words = word_tokenize(sample_article)
sentences = sent_tokenize(sample_article)


In [16]:
print(words)
print(sentences)



In [18]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [19]:
# Stemming
porter_stemmer = PorterStemmer()
stemmed_words = [porter_stemmer.stem(word) for word in words]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]


In [21]:
print(stemmed_words)
print(lemmatized_words)

['worldcom', 'ex-boss', 'launch', 'defenc', 'lawyer', 'defend', 'former', 'worldcom', 'chief', 'berni', 'ebber', 'against', 'a', 'batteri', 'of', 'fraud', 'charg', 'have', 'call', 'a', 'compani', 'whistleblow', 'as', 'their', 'first', 'wit', '.', 'cynthia', 'cooper', 'worldcom', 's', 'ex-head', 'of', 'intern', 'account', 'alert', 'director', 'to', 'irregular', 'account', 'practic', 'at', 'the', 'us', 'telecom', 'giant', 'in', '2002.', 'her', 'warn', 'led', 'to', 'the', 'collaps', 'of', 'the', 'firm', 'follow', 'the', 'discoveri', 'of', 'an', '$', '11bn', '(', '£5.7bn', ')', 'account', 'fraud', '.', 'mr', 'ebber', 'ha', 'plead', 'not', 'guilti', 'to', 'charg', 'of', 'fraud', 'and', 'conspiraci', '.', 'prosecut', 'lawyer', 'have', 'argu', 'that', 'mr', 'ebber', 'orchestr', 'a', 'seri', 'of', 'account', 'trick', 'at', 'worldcom', 'order', 'employe', 'to', 'hide', 'expens', 'and', 'inflat', 'revenu', 'to', 'meet', 'wall', 'street', 'earn', 'estim', '.', 'but', 'ms', 'cooper', 'who', 'now',

In [23]:
# Load SpaCy's pre-trained model
nlp = spacy.load('en_core_web_sm')

# Process the sample article
doc = nlp(sample_article)

# Extract named entities
named_entities = [(ent.text, ent.label_) for ent in doc.ents]
print(named_entities)

[('worldcom ex-boss', 'PERSON'), ('worldcom', 'ORG'), ('bernie', 'PERSON'), ('first', 'ORDINAL'), ('cynthia cooper  worldcom s ex-head', 'PERSON'), ('us', 'GPE'), ('2002', 'DATE'), ('5.7bn', 'MONEY'), ('worldcom', 'ORG'), ('new york', 'GPE'), ('wednesday', 'DATE'), ('arthur andersen', 'PERSON'), ('worldcom', 'ORG'), ('early 2001 and 2002', 'DATE'), ('worldcom', 'ORG'), ('cooper', 'PERSON'), ('worldcom financial', 'ORG'), ('scott sullivan', 'PERSON'), ('sullivan', 'PERSON'), ('2001', 'DATE'), ('85 years', 'DATE'), ('worldcom', 'ORG'), ('2004', 'DATE'), ('mci', 'ORG'), ('last week', 'DATE'), ('mci', 'ORG'), ('6.75bn', 'MONEY')]


In [45]:
# Tokenize the entire dataset
tokenized_data = [word_tokenize(article) for article in dataset['Text']]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_data, vector_size=100, window=5, min_count=1, workers=4)


# Get a word from the vocabulary of the Word2Vec model
sample_word = word2vec_model.wv.index_to_key[0]  # Choose index 0 or any other valid index

# Get vector representation of the chosen word
sample_word_vector = word2vec_model.wv[sample_word]
print(sample_word_vector)
print(sample_word)
print(word2vec_model)

[ 2.7879721e-02  1.3422313e-03  1.6377891e+00  1.0293640e+00
 -1.0290486e+00 -1.3601092e+00  1.2974035e+00  9.9968672e-01
 -8.5291559e-01 -1.7252359e+00  5.7322627e-01 -6.5874749e-01
 -1.5534254e+00  1.3109279e-01 -7.6012686e-03 -1.0684609e-01
  5.5359185e-01  4.6730842e-02 -4.6097383e-01 -1.0784883e+00
  1.2382365e+00 -3.4004995e-01  1.8526311e+00 -8.9855832e-01
  8.0788380e-01 -2.3810017e-01 -7.3202866e-01  8.5141951e-01
 -6.0484940e-01 -1.3663299e-01  1.7441951e-01  4.1195318e-01
  2.2754297e-01 -1.4277290e+00  2.6205432e-01 -2.3459113e-01
  3.2779139e-01 -1.0027425e+00 -2.6524103e-01 -9.7996330e-01
 -5.3939018e-02 -7.3855877e-01 -3.2385218e-01 -4.5562556e-01
  3.3828276e-01 -9.2608529e-01 -7.6590401e-01  5.1656097e-01
  5.9468585e-01  8.1698108e-01 -2.2878036e-01 -5.8444786e-01
 -5.9562641e-01  2.0109849e-01 -4.9469033e-01 -3.5529356e-02
 -7.3464908e-02 -3.1536564e-01 -2.6733142e-01  7.5271177e-01
  2.9261222e-01  5.9244394e-01  3.9192730e-01 -1.2188308e-01
 -4.8659769e-01  9.61735

In [41]:
# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the dataset
tfidf_matrix = tfidf_vectorizer.fit_transform(dataset['Text'])

# Calculate cosine similarity between two news articles
cosine_similarity = (tfidf_matrix * tfidf_matrix.T).toarray()
print(tfidf_matrix)
print(cosine_similarity)

  (0, 1009)	0.05643509689668293
  (0, 23572)	0.05171683198420305
  (0, 6472)	0.02902507640882694
  (0, 5342)	0.044434208471766566
  (0, 23661)	0.05875637443991937
  (0, 4150)	0.0601801944482067
  (0, 1652)	0.034102087269467486
  (0, 24104)	0.02383356638673293
  (0, 13075)	0.017706106467856114
  (0, 14251)	0.10916318514966579
  (0, 12865)	0.03184197913561059
  (0, 407)	0.025019777226527817
  (0, 17579)	0.039549514785291666
  (0, 2875)	0.04289120555750058
  (0, 9615)	0.012344388705610583
  (0, 8036)	0.043630523793597796
  (0, 8748)	0.038172940645020706
  (0, 12224)	0.021277420879685587
  (0, 1792)	0.017252502375521908
  (0, 5810)	0.049059642984184255
  (0, 11448)	0.018347791308608738
  (0, 24617)	0.02055116391848126
  (0, 1076)	0.048608990336853036
  (0, 19913)	0.049533415911568075
  (0, 12316)	0.04415847017280601
  :	:
  (1489, 11033)	0.08625548429839593
  (1489, 15792)	0.020993536539211335
  (1489, 16046)	0.017799215951455238
  (1489, 17416)	0.03244209969602519
  (1489, 4156)	0.0727512