# Data Loading using pandas library

In [None]:
import pandas as pd

In [21]:
df=pd.read_excel('/content/Assignment10.xlsx')

In [22]:
df.head()

Unnamed: 0,Article
0,"Retailers, the makers of foods marketed for we..."
1,"Move over, Ozempic — there’s a new drug in tow..."
2,Sept 14 (Reuters) - Bristol Myers Squibb (BMY....
3,Austin Wolcott was 18 years old and pretty sur...
4,"Cancer, often referred to as the “emperor of a..."


# cleaning the text by removing stopwords, stemming words



In [23]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Ensure NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

def clean_article(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

# Assuming 'text' is the column with the articles
df['cleaned_text'] = df['Article'].apply(clean_article)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
df.rename(columns={'Article': 'Article (Original)'}, inplace=True)

In [24]:
df['cleaned_text'].head()

0    retail  maker food market weight loss  type co...
1    move  ozemp   new drug town  eli lilli  zepbou...
2    sept 14  reuter   bristol myer squibb  bmyn  s...
3    austin wolcott 18 year old pretti sure  surviv...
4    cancer  often refer  emperor maladi   unyield ...
Name: cleaned_text, dtype: object

In [31]:
import pandas as pd

# Assuming 'df' is your DataFrame loaded from the Excel file
# And assuming 'cleaned_text' is the column with the cleaned articles

# Check if 'Article' column exists in the DataFrame
if 'Article (Original)' in df.columns:
    # Create a new column 'Article (Cleaned)' and fill it with the cleaned text
    df['Article (Cleaned)'] = df['cleaned_text']
else:
    print("Column 'Article' does not exist in the DataFrame.")
df.rename(columns={'Article': 'Article (Original)'}, inplace=True)

# Display the modified DataFrame
print (df.head())

                                  Article (Original)  \
0  Retailers, the makers of foods marketed for we...   
1  Move over, Ozempic — there’s a new drug in tow...   
2  Sept 14 (Reuters) - Bristol Myers Squibb (BMY....   
3  Austin Wolcott was 18 years old and pretty sur...   
4  Cancer, often referred to as the “emperor of a...   

                                        cleaned_text  \
0  retail  maker food market weight loss  type co...   
1  move  ozemp   new drug town  eli lilli  zepbou...   
2  sept 14  reuter   bristol myer squibb  bmyn  s...   
3  austin wolcott 18 year old pretti sure  surviv...   
4  cancer  often refer  emperor maladi   unyield ...   

                                   Article (Cleaned)  
0  retail  maker food market weight loss  type co...  
1  move  ozemp   new drug town  eli lilli  zepbou...  
2  sept 14  reuter   bristol myer squibb  bmyn  s...  
3  austin wolcott 18 year old pretti sure  surviv...  
4  cancer  often refer  emperor maladi   unyield ..

# Clean Up Articles
## show the Article (Original) and Article (Cleaned)

In [32]:

display_columns = ['Article (Original)', 'Article (Cleaned)']
print(df[display_columns].head())

                                  Article (Original)  \
0  Retailers, the makers of foods marketed for we...   
1  Move over, Ozempic — there’s a new drug in tow...   
2  Sept 14 (Reuters) - Bristol Myers Squibb (BMY....   
3  Austin Wolcott was 18 years old and pretty sur...   
4  Cancer, often referred to as the “emperor of a...   

                                   Article (Cleaned)  
0  retail  maker food market weight loss  type co...  
1  move  ozemp   new drug town  eli lilli  zepbou...  
2  sept 14  reuter   bristol myer squibb  bmyn  s...  
3  austin wolcott 18 year old pretti sure  surviv...  
4  cancer  often refer  emperor maladi   unyield ...  


In [29]:
df.columns

Index(['Article (Original)', 'cleaned_text', 'Article (Cleaned)'], dtype='object')

In [34]:
df.rename(columns={ 'Article (Original)':'Article'}, inplace=True)

#  Determines the overall mood of each article using TextBlob.

In [35]:
from textblob import TextBlob

def check_mood(text):
    blob = TextBlob(text)
    if blob.sentiment.polarity > 0:
        return "Positive"
    elif blob.sentiment.polarity == 0:
        return "Neutral"
    else:
        return "Negative"

df['mood'] = df['cleaned_text'].apply(check_mood)

In [7]:
df['mood'].head()

0    Positive
1    Negative
2    Positive
3    Positive
4    Positive
Name: mood, dtype: object

# Checking the Mood

In [36]:

display_columns = ['Article', 'mood']
print(df[display_columns].head())

                                             Article      mood
0  Retailers, the makers of foods marketed for we...  Positive
1  Move over, Ozempic — there’s a new drug in tow...  Negative
2  Sept 14 (Reuters) - Bristol Myers Squibb (BMY....  Positive
3  Austin Wolcott was 18 years old and pretty sur...  Positive
4  Cancer, often referred to as the “emperor of a...  Positive


# Common Words Identification: Identifies the most frequently occurring words across all articles.

In [8]:
from collections import Counter
import string

def find_common_words(df, top_n=10):
    all_words = []
    for text in df['cleaned_text']:
        words = text.split()
        all_words.extend(words)

    word_counts = Counter(all_words)
    common_words = word_counts.most_common(top_n)

    return common_words

common_words = find_common_words(df)
print(common_words)

[('nike', 129), ('s', 116), ('firm', 60), ('cancer', 59), ('compani', 56), ('new', 56), ('account', 56), ('therapi', 55), ('market', 52), ('also', 48)]


# Finding Connections

In [9]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(token)
    return result

processed_docs = df['cleaned_text'].map(preprocess)

dictionary = gensim.corpora.Dictionary(processed_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Train LDA model
lda_model = gensim.models.LdaModel(bow_corpus,
                                   id2word=dictionary,
                                   num_topics=10,
                                   random_state=100,
                                   chunksize=200,
                                   passes=20)

# Print the topics found by the LDA model
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic #{idx}: {topic}')

Topic #0: 0.026*"nike" + 0.021*"strava" + 0.017*"cart" + 0.016*"club" + 0.014*"cancer" + 0.012*"tcell" + 0.010*"cell" + 0.010*"therapi" + 0.010*"user" + 0.008*"platform"
Topic #1: 0.031*"nike" + 0.019*"firm" + 0.012*"market" + 0.011*"product" + 0.010*"cancer" + 0.009*"believ" + 0.009*"therapi" + 0.007*"brand" + 0.007*"increas" + 0.006*"cart"
Topic #2: 0.011*"compani" + 0.008*"file" + 0.008*"claim" + 0.006*"food" + 0.006*"said" + 0.006*"lawsuit" + 0.006*"burger" + 0.006*"kelli" + 0.006*"beverag" + 0.006*"fals"
Topic #3: 0.054*"taco" + 0.030*"bell" + 0.017*"order" + 0.014*"nacho" + 0.012*"menu" + 0.011*"sale" + 0.009*"free" + 0.009*"quarter" + 0.009*"estim" + 0.009*"mcdonald"
Topic #4: 0.027*"unilev" + 0.021*"foundat" + 0.012*"chariti" + 0.009*"cancer" + 0.009*"accord" + 0.009*"court" + 0.009*"lawsuit" + 0.006*"patient" + 0.006*"inform" + 0.006*"said"
Topic #5: 0.013*"obes" + 0.011*"drug" + 0.009*"brand" + 0.009*"said" + 0.009*"compani" + 0.008*"manag" + 0.007*"restaur" + 0.007*"nestl" +

In [38]:
pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


In [40]:
import gensim
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel
from pprint import pprint
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample articles (replace this with your actual articles)
articles = [
    "Artificial intelligence is revolutionizing industries...",
    "Cybersecurity threats are increasing rapidly...",
    "Digital transformation is transforming healthcare..."

]

# Preprocessing
stop_words = set(nltk.corpus.stopwords.words('english'))

def preprocess(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Stemming
    stemmer = nltk.stem.PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

# Preprocess each article
corpus = [preprocess(article) for article in articles]

# Creating the term dictionary of our corpus, where every unique word is assigned an index.
dictionary = corpora.Dictionary(corpus)

# Converting list of integers into a bag of words corpus
bow_corpus = [dictionary.doc2bow(text) for text in corpus]

# Training the LDA model
lda_model = gensim.models.ldamodel.LdaModel(bow_corpus,
                                             num_topics=3,  # Number of topics
                                             id2word=dictionary,
                                             passes=50,  # Number of times the algorithm iterates over the corpus
                                             chunksize=200,  # Number of documents to be passed together to the model
                                             alpha='auto',  # Alpha parameter
                                             per_word_topics=False)

# Printing the topics found by the LDA model
pprint(lda_model.print_topics(num_words=4))

  and should_run_async(code)


[(0, '0.083*"..." + 0.083*"transform" + 0.083*"cybersecur" + 0.083*"threat"'),
 (1, '0.167*"..." + 0.167*"transform" + 0.095*"increas" + 0.095*"threat"'),
 (2,
  '0.148*"intellig" + 0.148*"industri" + 0.148*"artifici" + '
  '0.148*"revolution"')]


In [11]:
# Select a subset of documents from each topic for manual review
num_documents_per_review = 5  # Adjust based on your needs
selected_articles = []

for topic, count in topic_counts.items():
    selected_articles.extend(lda_model.show_topic(topic, topn=num_documents_per_review))

# Print the selected articles
for article in selected_articles:
    print(article)

('nike', 0.025845312)
('strava', 0.020839963)
('cart', 0.01725603)
('club', 0.015825346)
('cancer', 0.013674422)
('nike', 0.031256136)
('firm', 0.019167475)
('market', 0.012454602)
('product', 0.01077593)
('cancer', 0.009818687)


In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [15]:
from textblob import TextBlob

def analyze_aspect_sentiment(text, aspect):
    # Tokenize sentences
    sentences = sent_tokenize(text)
    # Filter sentences containing the aspect
    relevant_sentences = [sentence for sentence in sentences if aspect in sentence.lower()]
    # Combine relevant sentences
    combined_text = ' '.join(relevant_sentences)
    # Annotate sentiment
    analysis = TextBlob(combined_text)
    return analysis.sentiment.polarity

In [21]:
# Assuming selected_articles is a list of tuples from show_topic
# Extract text from tuples and store them in a separate list
texts_for_analysis = [article[0] for article in selected_articles]

# Now, analyze sentiment of extracted texts
for text in texts_for_analysis:
    sentiment = analyze_sentiment(text)

Sentiment: Neutral
Sentiment: Neutral
Sentiment: Neutral
Sentiment: Neutral
Sentiment: Neutral
Sentiment: Neutral
Sentiment: Neutral
Sentiment: Neutral
Sentiment: Neutral
Sentiment: Neutral


# Aspect Analysis (Optional)

In [43]:
import spacy
from textblob import TextBlob

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

# Assuming 'df' is your DataFrame and it has a column named 'Article'
# Function to analyze sentiment towards specific aspects
def analyze_sentiment_for_aspects(text):
    doc = nlp(text)
    aspects = []
    for ent in doc.ents:
        # Filter entities based on your criteria (e.g., label, category)
        if ent.label_ == "ORG" or ent.label_ == "GPE":  # Example criteria
            aspects.append(ent.text)

    results = {}
    for aspect in aspects:
        blob = TextBlob(aspect)
        sentiment = blob.sentiment.polarity
        results[aspect] = sentiment

    return results

# Initialize a new column in the DataFrame for aspect analysis results
df['Aspect Analysis'] = ""

# Perform aspect analysis for each article and update the new column
for index, row in df.iterrows():
    article = row['Article']
    aspect_analysis = analyze_sentiment_for_aspects(article)
    # Convert the aspect analysis dictionary to a string for easy appending
    aspect_analysis_str = str(aspect_analysis)
    df.at[index, 'Aspect Analysis'] = aspect_analysis_str

# Display the modified DataFrame
print(df)

  and should_run_async(code)


                                              Article  \
0   Retailers, the makers of foods marketed for we...   
1   Move over, Ozempic — there’s a new drug in tow...   
2   Sept 14 (Reuters) - Bristol Myers Squibb (BMY....   
3   Austin Wolcott was 18 years old and pretty sur...   
4   Cancer, often referred to as the “emperor of a...   
5   Nov 28 (Reuters) - The U.S. Food and Drug Admi...   
6   Nov 21 (Reuters) - BeiGene (6160.HK) said on T...   
7   Sept 19 (Reuters) - Drugmaker BeiGene (6160.HK...   
8   BRUKINSA is the first and only BTK inhibitor a...   
9   Whether you're looking for a quick bite to eat...   
10  A federal judge in New York has dismissed a la...   
11  The future of fast food delivery is here.\n\nD...   
12  Yum Brands topped Wall Street estimates for th...   
13  If you fancy Taco Bell's Nacho Fries, the fast...   
14  Taco Bell is serving up its new Toasted Breakf...   
15  Oct 30 (Reuters) - McDonald's (MCD.N) beat Wal...   
16  Whether you dip it, drizzle