In [None]:
# Install necessary libraries
!pip install newsapi-python pandas openpyxl requests beautifulsoup4 python-docx vaderSentiment

from newsapi import NewsApiClient
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from google.colab import files
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re

# Download necessary nltk data
# NOTE(review): nothing else in this cell appears to use these corpora; confirm
# whether the downloads are still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize NewsApiClient.
# The key is taken from the NEWSAPI_KEY environment variable, or prompted for
# when that is not set, so the credential is never stored in the notebook.
# A key that was previously committed in source should be revoked and reissued.
my_api_key = os.environ.get("NEWSAPI_KEY")
if not my_api_key:
    from getpass import getpass
    my_api_key = getpass("Enter your NewsAPI key: ")
newsapi = NewsApiClient(api_key=my_api_key)

# Function to fetch article content
def fetch_article_content(url, timeout=10):
    """Download a page and return the text of its <p> elements.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which stalls the whole ``apply`` over the URL column.

    Returns:
        The paragraph texts joined with newlines, or the sentinel string
        'removed' when the download/parsing fails or no paragraph text exists.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        full_content = '\n'.join(para.get_text() for para in soup.find_all('p'))
    except Exception:
        # Deliberately best-effort: one unreachable or malformed page must not
        # abort the processing of every other article.
        return 'removed'
    return full_content if full_content.strip() else 'removed'

# Function to get working URLs
def get_working_url(urls, timeout=10):
    """Split URLs into those answering HTTP 200 and everything else.

    Args:
        urls: Iterable of URL strings to probe.
        timeout: Seconds to wait for each server; prevents a single stalled
            connection from blocking the whole loop.

    Returns:
        A ``(working_urls, bad_requests)`` tuple of lists. A URL lands in
        ``bad_requests`` when the request raises or the status is not 200.
    """
    working_urls = []
    bad_requests = []
    for url in urls:
        try:
            is_ok = requests.get(url, timeout=timeout).status_code == 200
        except Exception:
            # Best-effort probe: any failure simply marks the URL as bad.
            is_ok = False
        (working_urls if is_ok else bad_requests).append(url)
    return working_urls, bad_requests

# Fetch articles
print("Fetching articles...")
data = newsapi.get_everything(q='indian elections 2024', language='en', page_size=100)

if data['status'] != 'ok':
    raise Exception("Failed to fetch data from News API")

articles = data['articles']
df = pd.DataFrame(articles)
print("Articles fetched successfully.")

# Fetch article content and remove bad URLs
print("Fetching article content...")
df['full_content'] = df['url'].apply(fetch_article_content)
# NOTE: this requests every URL a second time, only to record its status code.
working_urls, bad_requests = get_working_url(df['url'])

# Filter out rows with 'removed' content.
# .copy() makes df_filtered an independent frame, so the columns added to it
# later do not trigger pandas' SettingWithCopyWarning.
df_filtered = df[df['full_content'] != 'removed'].copy()
print(f"Filtered DataFrame has {len(df_filtered)} rows.")

# Initialize Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Political parties to identify in the articles.
# Maps the label written to the 'party' column -> the exact entity strings
# (abbreviation and full names) that count as a mention of that party.
party_keywords = {
    'BJP': ['BJP', 'Bharatiya Janata Party'],
    'INC': ['Congress', 'Indian National Congress', 'INC'],
    'AAP': ['AAP', 'Aam Aadmi Party'],
    'CPI': ['CPI', 'Communist Party of India'],
    'CPM': ['CPM', 'Communist Party of India (Marxist)', 'CPI(M)'],
    'NCP': ['NCP', 'Nationalist Congress Party'],
    'BSP': ['BSP', 'Bahujan Samaj Party'],
    'SP': ['SP', 'Samajwadi Party'],
    'RJD': ['RJD', 'Rashtriya Janata Dal'],
    'JD(U)': ['JD(U)', 'Janata Dal (United)'],
    'TMC': ['TMC', 'All India Trinamool Congress', 'Trinamool Congress'],
    'AIADMK': ['AIADMK', 'All India Anna Dravida Munnetra Kazhagam'],
    'DMK': ['DMK', 'Dravida Munnetra Kazhagam'],
    # The alias used to be listed twice; one entry is sufficient.
    'Shiv Sena': ['Shiv Sena'],
    'TRS': ['TRS', 'Telangana Rashtra Samithi'],
    'YSRCP': ['YSRCP', 'Yuvajana Sramika Rythu Congress Party'],
    'TDP': ['TDP', 'Telugu Desam Party'],
    'LJP': ['LJP', 'Lok Janshakti Party'],
    'RLD': ['RLD', 'Rashtriya Lok Dal'],
    'AIMIM': ['AIMIM', 'All India Majlis-e-Ittehadul Muslimeen'],
    'JD(S)': ['JD(S)', 'Janata Dal (Secular)'],
    'INLD': ['INLD', 'Indian National Lok Dal'],
    'JMM': ['JMM', 'Jharkhand Mukti Morcha'],
    'SAD': ['SAD', 'Shiromani Akali Dal'],
    'RSP': ['RSP', 'Revolutionary Socialist Party'],
    'AGP': ['AGP', 'Asom Gana Parishad'],
    'BPF': ['BPF', 'Bodoland People\'s Front'],
    'SDF': ['SDF', 'Sikkim Democratic Front'],
    # Mizo National Front is abbreviated MNF; the former 'MNDF' entry never
    # matched the party and is the acronym of an unrelated organisation.
    'MNF': ['MNF', 'Mizo National Front'],
    'UDP': ['UDP', 'United Democratic Party (Meghalaya)'],
    'NPF': ['NPF', 'Naga People\'s Front'],
    'ZPM': ['ZPM', 'Zoram People\'s Movement'],
    'KC(M)': ['KC(M)', 'Kerala Congress (M)'],
    'PDP': ['PDP', 'Peoples Democratic Party'],
    'NC': ['NC', 'National Conference'],
}

# Apply sentiment analysis
def analyze_sentiment(text):
    """Return the 'compound' entry of the module-level analyzer's polarity scores for *text*."""
    return analyzer.polarity_scores(text)['compound']

def get_sentiment_label(score, threshold=0.05):
    """Turn a compound sentiment score into a text label.

    Args:
        score: Compound score, as produced by ``analyze_sentiment``.
        threshold: Absolute value at or beyond which a score stops being
            neutral. Defaults to 0.05, the cut-off the VADER authors
            recommend; the previous hard-coded 0.5 labelled nearly every
            article 'Neutral'.

    Returns:
        'Positive' when ``score >= threshold``, 'Negative' when
        ``score <= -threshold``, otherwise 'Neutral'.
    """
    if score >= threshold:
        return 'Positive'
    if score <= -threshold:
        return 'Negative'
    return 'Neutral'

import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract party names from text
def extract_parties(text):
    """List the party labels mentioned in *text*.

    Runs the spaCy pipeline, keeps the entities labelled ORG and returns, in
    ``party_keywords`` order, each party for which at least one alias equals
    the text of such an entity exactly.
    """
    org_names = {ent.text for ent in nlp(text).ents if ent.label_ == "ORG"}
    return [
        party
        for party, aliases in party_keywords.items()
        if not org_names.isdisjoint(aliases)
    ]

# Function to get sentiment of a text
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Return the 'compound' polarity score the module-level analyzer gives *text*."""
    return analyzer.polarity_scores(text)['compound']

# Function to determine the main party based on weighted sentiment
def determine_main_party_weighted(text):
    """Choose one party label for an article.

    Each sentence is scored with ``get_sentiment`` and credited to every party
    ``extract_parties`` finds in it. The party with the highest mean sentence
    score wins; when that best mean is exactly 0 the most frequently mentioned
    party is returned instead, and 'Unknown' when no party was mentioned.

    Note: parties that are never mentioned take part with a mean of 0, so a
    party whose mean is negative can only be selected via the mention-count
    fallback.
    """
    totals = dict.fromkeys(party_keywords, 0)
    mentions = dict.fromkeys(party_keywords, 0)

    for sent in nlp(text).sents:
        score = get_sentiment(sent.text)
        for name in extract_parties(sent.text):
            totals[name] += score
            mentions[name] += 1

    averages = {}
    for name in party_keywords:
        if mentions[name] > 0:
            averages[name] = totals[name] / mentions[name]
        else:
            averages[name] = 0

    best_by_average = max(averages, key=averages.get)
    if averages[best_by_average] != 0:
        return best_by_average

    best_by_count = max(mentions, key=mentions.get)
    if mentions[best_by_count] != 0:
        return best_by_count
    return 'Unknown'



# Ensure all rows are subjected to sentiment analysis.
# df_filtered comes from boolean indexing; take an explicit copy before adding
# columns so pandas does not raise SettingWithCopyWarning.
df_filtered = df_filtered.copy()
df_filtered['sentiment_score'] = df_filtered['full_content'].apply(analyze_sentiment)
df_filtered['party'] = df_filtered['full_content'].apply(determine_main_party_weighted)
df_filtered['sentiment_label'] = df_filtered['sentiment_score'].apply(get_sentiment_label)

# Filter out rows where the party is 'Unknown'
df_final = df_filtered[df_filtered['party'] != 'Unknown']

# Check the number of rows processed
processed_rows = len(df_final)
total_rows = len(df_filtered)
print(f"Sentiment analysis completed for {processed_rows} out of {total_rows} rows with identified parties.")

# Save filtered DataFrame to Excel
file_path = '/content/output.xlsx'
df_final.to_excel(file_path, sheet_name='Sheet1', index=False)

if os.path.exists(file_path):
    print("Excel file found. Attempting to download...")
    files.download(file_path)
else:
    print("Excel file not found.")

# Save filtered DataFrame to CSV
csv_file_path = '/content/output_full_content.csv'
df_final.to_csv(csv_file_path, index=False)

# Verify the content of the saved CSV
if os.path.exists(csv_file_path):
    print("CSV file with full content found.")
    df_check = pd.read_csv(csv_file_path)
    print("Preview of saved CSV:")
    print(df_check.head())  # Display the first few rows for verification
    files.download(csv_file_path)
else:
    print("CSV file not found.")

# Save bad request URLs to Excel
bad_requests_df = pd.DataFrame({'Bad URLs': bad_requests})
bad_requests_file_path = '/content/badrequests.xlsx'
bad_requests_df.to_excel(bad_requests_file_path, index=False)

if os.path.exists(bad_requests_file_path):
    print("Bad request URLs saved to 'badrequests.xlsx'")
else:
    print("Failed to save bad request URLs.")

print("Script executed successfully")


Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 244.3/244.3 kB 3.5 MB/s eta 0:00:00
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 126.0/126.0 kB 6.2 MB/s eta 0:00:00
Installing collected packages: python-docx, vaderSentiment, newsapi-python
Successfully installed newsapi-python-0.2.7 python-docx-1.1.2 vaderSentiment-3.3.2


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Fetching articles...
Articles fetched successfully.
Fetching article content...


KeyboardInterrupt: 

In [None]:
# Install necessary libraries
!pip install newsapi-python pandas openpyxl requests beautifulsoup4 python-docx vaderSentiment

from newsapi import NewsApiClient
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from google.colab import files
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re

# Download necessary nltk data
# NOTE(review): nothing else in this cell appears to use these corpora; confirm
# whether the downloads are still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize NewsApiClient.
# Read the key from the NEWSAPI_KEY environment variable and fall back to an
# interactive prompt, so the credential is not stored in the notebook. A key
# that was previously committed in source should be revoked and reissued.
my_api_key = os.environ.get("NEWSAPI_KEY")
if not my_api_key:
    from getpass import getpass
    my_api_key = getpass("Enter your NewsAPI key: ")
newsapi = NewsApiClient(api_key=my_api_key)

# Function to fetch article content
def fetch_article_content(url, timeout=10):
    """Download a page and return the text of its <p> elements.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which stalls the whole ``apply`` over the URL column.

    Returns:
        The paragraph texts joined with newlines, or the sentinel string
        'removed' when the download/parsing fails or no paragraph text exists.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        full_content = '\n'.join(para.get_text() for para in soup.find_all('p'))
    except Exception:
        # Deliberately best-effort: one unreachable or malformed page must not
        # abort the processing of every other article.
        return 'removed'
    return full_content if full_content.strip() else 'removed'

# Function to get working URLs
def get_working_url(urls, timeout=10):
    """Split URLs into those answering HTTP 200 and everything else.

    Args:
        urls: Iterable of URL strings to probe.
        timeout: Seconds to wait for each server; prevents a single stalled
            connection from blocking the whole loop.

    Returns:
        A ``(working_urls, bad_requests)`` tuple of lists. A URL lands in
        ``bad_requests`` when the request raises or the status is not 200.
    """
    working_urls = []
    bad_requests = []
    for url in urls:
        try:
            is_ok = requests.get(url, timeout=timeout).status_code == 200
        except Exception:
            # Best-effort probe: any failure simply marks the URL as bad.
            is_ok = False
        (working_urls if is_ok else bad_requests).append(url)
    return working_urls, bad_requests

# Fetch articles with a different query
print("Fetching articles...")
data = newsapi.get_everything(q='Samajwadi Party OR SP OR Bahujan Samaj Party OR BSP OR Communist Party of India OR CPI', language='en', page_size=100)

if data['status'] != 'ok':
    raise Exception("Failed to fetch data from News API")

articles = data['articles']
df = pd.DataFrame(articles)
print("Articles fetched successfully.")

# Fetch article content and remove bad URLs
print("Fetching article content...")
df['full_content'] = df['url'].apply(fetch_article_content)
# NOTE: this requests every URL a second time, only to record its status code.
working_urls, bad_requests = get_working_url(df['url'])

# Filter out rows with 'removed' content.
# .copy() makes df_filtered an independent frame, so the columns added to it
# later do not trigger pandas' SettingWithCopyWarning.
df_filtered = df[df['full_content'] != 'removed'].copy()
print(f"Filtered DataFrame has {len(df_filtered)} rows.")

# Initialize Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Political parties to identify in the articles.
# Maps the label written to the 'party' column -> the exact entity strings
# (abbreviation and full names) that count as a mention of that party.
party_keywords = {
    'BJP': ['BJP', 'Bharatiya Janata Party'],
    'INC': ['Congress', 'Indian National Congress', 'INC'],
    'AAP': ['AAP', 'Aam Aadmi Party'],
    'CPI': ['CPI', 'Communist Party of India'],
    'CPM': ['CPM', 'Communist Party of India (Marxist)', 'CPI(M)'],
    'NCP': ['NCP', 'Nationalist Congress Party'],
    'BSP': ['BSP', 'Bahujan Samaj Party'],
    'SP': ['SP', 'Samajwadi Party'],
    'RJD': ['RJD', 'Rashtriya Janata Dal'],
    'JD(U)': ['JD(U)', 'Janata Dal (United)'],
    'TMC': ['TMC', 'All India Trinamool Congress', 'Trinamool Congress'],
    'AIADMK': ['AIADMK', 'All India Anna Dravida Munnetra Kazhagam'],
    'DMK': ['DMK', 'Dravida Munnetra Kazhagam'],
    # The alias used to be listed twice; one entry is sufficient.
    'Shiv Sena': ['Shiv Sena'],
    'TRS': ['TRS', 'Telangana Rashtra Samithi'],
    'YSRCP': ['YSRCP', 'Yuvajana Sramika Rythu Congress Party'],
    'TDP': ['TDP', 'Telugu Desam Party'],
    'LJP': ['LJP', 'Lok Janshakti Party'],
    'RLD': ['RLD', 'Rashtriya Lok Dal'],
    'AIMIM': ['AIMIM', 'All India Majlis-e-Ittehadul Muslimeen'],
    'JD(S)': ['JD(S)', 'Janata Dal (Secular)'],
    'INLD': ['INLD', 'Indian National Lok Dal'],
    'JMM': ['JMM', 'Jharkhand Mukti Morcha'],
    'SAD': ['SAD', 'Shiromani Akali Dal'],
    'RSP': ['RSP', 'Revolutionary Socialist Party'],
    'AGP': ['AGP', 'Asom Gana Parishad'],
    'BPF': ['BPF', 'Bodoland People\'s Front'],
    'SDF': ['SDF', 'Sikkim Democratic Front'],
    # Mizo National Front is abbreviated MNF; the former 'MNDF' entry never
    # matched the party and is the acronym of an unrelated organisation.
    'MNF': ['MNF', 'Mizo National Front'],
    'UDP': ['UDP', 'United Democratic Party (Meghalaya)'],
    'NPF': ['NPF', 'Naga People\'s Front'],
    'ZPM': ['ZPM', 'Zoram People\'s Movement'],
    'KC(M)': ['KC(M)', 'Kerala Congress (M)'],
    'PDP': ['PDP', 'Peoples Democratic Party'],
    'NC': ['NC', 'National Conference'],
}

# Apply sentiment analysis
def analyze_sentiment(text):
    """Return the 'compound' entry of the module-level analyzer's polarity scores for *text*."""
    return analyzer.polarity_scores(text)['compound']

def get_sentiment_label(score):
    """Turn a compound sentiment score into a text label.

    Scores of 0.05 and above are 'Positive', scores of -0.05 and below are
    'Negative', and anything in between is 'Neutral'.
    """
    if score >= 0.05:
        return 'Positive'
    if score <= -0.05:
        return 'Negative'
    return 'Neutral'

import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Function to extract party names from text
def extract_parties(text):
    """List the party labels mentioned in *text*.

    Runs the spaCy pipeline, keeps the entities labelled ORG and returns, in
    ``party_keywords`` order, each party for which at least one alias equals
    the text of such an entity exactly.
    """
    org_names = {ent.text for ent in nlp(text).ents if ent.label_ == "ORG"}
    return [
        party
        for party, aliases in party_keywords.items()
        if not org_names.isdisjoint(aliases)
    ]

# Function to get sentiment of a text
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Return the 'compound' polarity score the module-level analyzer gives *text*."""
    return analyzer.polarity_scores(text)['compound']

# Function to determine the main party based on weighted sentiment
def determine_main_party_weighted(text):
    """Choose one party label for an article.

    Each sentence is scored with ``get_sentiment`` and credited to every party
    ``extract_parties`` finds in it. The party with the highest mean sentence
    score wins; when that best mean is exactly 0 the most frequently mentioned
    party is returned instead, and 'Unknown' when no party was mentioned.

    Note: parties that are never mentioned take part with a mean of 0, so a
    party whose mean is negative can only be selected via the mention-count
    fallback.
    """
    totals = dict.fromkeys(party_keywords, 0)
    mentions = dict.fromkeys(party_keywords, 0)

    for sent in nlp(text).sents:
        score = get_sentiment(sent.text)
        for name in extract_parties(sent.text):
            totals[name] += score
            mentions[name] += 1

    averages = {}
    for name in party_keywords:
        if mentions[name] > 0:
            averages[name] = totals[name] / mentions[name]
        else:
            averages[name] = 0

    best_by_average = max(averages, key=averages.get)
    if averages[best_by_average] != 0:
        return best_by_average

    best_by_count = max(mentions, key=mentions.get)
    if mentions[best_by_count] != 0:
        return best_by_count
    return 'Unknown'



# Ensure all rows are subjected to sentiment analysis.
# df_filtered comes from boolean indexing; take an explicit copy before adding
# columns so pandas does not raise SettingWithCopyWarning.
df_filtered = df_filtered.copy()
df_filtered['sentiment_score'] = df_filtered['full_content'].apply(analyze_sentiment)
df_filtered['party'] = df_filtered['full_content'].apply(determine_main_party_weighted)
df_filtered['sentiment_label'] = df_filtered['sentiment_score'].apply(get_sentiment_label)

# Filter out rows where the party is 'Unknown'
df_final = df_filtered[df_filtered['party'] != 'Unknown']

# Check the number of rows processed
processed_rows = len(df_final)
total_rows = len(df_filtered)
print(f"Sentiment analysis completed for {processed_rows} out of {total_rows} rows with identified parties.")

# Save filtered DataFrame to Excel
file_path = '/content/output_new.xlsx'
df_final.to_excel(file_path, sheet_name='Sheet1', index=False)

if os.path.exists(file_path):
    print("Excel file found. Attempting to download...")
    files.download(file_path)
else:
    print("Excel file not found.")

# Save filtered DataFrame to CSV
csv_file_path = '/content/output_new_full_content.csv'
df_final.to_csv(csv_file_path, index=False)

# Verify the content of the saved CSV
if os.path.exists(csv_file_path):
    print("CSV file with full content found.")
    df_check = pd.read_csv(csv_file_path)
    print("Preview of saved CSV:")
    print(df_check.head())  # Display the first few rows for verification
    files.download(csv_file_path)
else:
    print("CSV file not found.")

# Save bad request URLs to Excel
bad_requests_df = pd.DataFrame({'Bad URLs': bad_requests})
bad_requests_file_path = '/content/badrequests_new.xlsx'
bad_requests_df.to_excel(bad_requests_file_path, index=False)

if os.path.exists(bad_requests_file_path):
    print("Bad request URLs saved to 'badrequests_new.xlsx'")
else:
    print("Failed to save bad request URLs.")

print("Script executed successfully")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fetching articles...
Articles fetched successfully.
Fetching article content...
Filtered DataFrame has 97 rows.
                                        article_text party  sentiment
0  The BJP has launched its new campaign for the ...  None    -0.2263
1   AAP's manifesto focuses on health and education.  None     0.0000
Sentiment analysis completed for 72 out of 97 rows with identified parties.
Excel file found. Attempting to download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

CSV file with full content found.
Preview of saved CSV:
                                              source            author  \
0  {'id': 'al-jazeera-english', 'name': 'Al Jazee...            AJLabs   
1  {'id': 'al-jazeera-english', 'name': 'Al Jazee...  Al Jazeera Staff   
2  {'id': 'al-jazeera-english', 'name': 'Al Jazee...       Saif Khalid   
3  {'id': 'al-jazeera-english', 'name': 'Al Jazee...  Al Jazeera Staff   
4  {'id': 'al-jazeera-english', 'name': 'Al Jazee...  Al Jazeera Staff   

                                               title  \
0     Mapping the results of the India election 2024   
1  India election results: Big wins, losses and s...   
2  India election results: Did ‘secular’ parties ...   
3  India Lok Sabha election 2024 Phase 7: Who vot...   
4  India Lok Sabha election 2024 Phase 6: Who vot...   

                                         description  \
0  The Bharatiya Janata Party, together with its ...   
1  A tight Varanasi race and BJP's Maharashtra do.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Bad request URLs saved to 'badrequests_new.xlsx'
Script executed successfully


In [None]:
!pip install newsapi-python pandas openpyxl requests beautifulsoup4 python-docx vaderSentiment spacy

from newsapi import NewsApiClient
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from google.colab import files
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy

# Download necessary nltk data
# NOTE(review): nothing else in this cell appears to use these corpora; confirm
# whether the downloads are still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Initialize NewsApiClient.
# Read the key from the NEWSAPI_KEY environment variable and fall back to an
# interactive prompt, so the credential is not stored in the notebook. A key
# that was previously committed in source should be revoked and reissued.
my_api_key = os.environ.get("NEWSAPI_KEY")
if not my_api_key:
    from getpass import getpass
    my_api_key = getpass("Enter your NewsAPI key: ")
newsapi = NewsApiClient(api_key=my_api_key)

# Function to fetch article content
def fetch_article_content(url, timeout=10):
    """Download a page and return the text of its <p> elements.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which stalls the whole ``apply`` over the URL column.

    Returns:
        The paragraph texts joined with newlines, or the sentinel string
        'removed' when the download/parsing fails or no paragraph text exists.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')
        full_content = '\n'.join(para.get_text() for para in soup.find_all('p'))
    except Exception:
        # Deliberately best-effort: one unreachable or malformed page must not
        # abort the processing of every other article.
        return 'removed'
    return full_content if full_content.strip() else 'removed'

# Function to get working URLs
def get_working_url(urls, timeout=10):
    """Split URLs into those answering HTTP 200 and everything else.

    Args:
        urls: Iterable of URL strings to probe.
        timeout: Seconds to wait for each server; prevents a single stalled
            connection from blocking the whole loop.

    Returns:
        A ``(working_urls, bad_requests)`` tuple of lists. A URL lands in
        ``bad_requests`` when the request raises or the status is not 200.
    """
    working_urls = []
    bad_requests = []
    for url in urls:
        try:
            is_ok = requests.get(url, timeout=timeout).status_code == 200
        except Exception:
            # Best-effort probe: any failure simply marks the URL as bad.
            is_ok = False
        (working_urls if is_ok else bad_requests).append(url)
    return working_urls, bad_requests

# Fetch articles
print("Fetching articles...")
data = newsapi.get_everything(q='indian elections 2024', language='en')

if data['status'] != 'ok':
    raise Exception("Failed to fetch data from News API")

articles = data['articles']
df = pd.DataFrame(articles)
print("Articles fetched successfully.")

# Fetch article content and remove bad URLs
print("Fetching article content...")
df['full_content'] = df['url'].apply(fetch_article_content)
# NOTE: this requests every URL a second time, only to record its status code.
working_urls, bad_requests = get_working_url(df['url'])

# Filter out rows with 'removed' content.
# .copy() makes df_filtered an independent frame, so the columns added to it
# later do not trigger pandas' SettingWithCopyWarning.
df_filtered = df[df['full_content'] != 'removed'].copy()
print(f"Filtered DataFrame has {len(df_filtered)} rows.")

# Initialize Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Political parties to identify in the articles.
# Maps the label written to the 'party' column -> the exact entity strings
# (abbreviation and full names) that count as a mention of that party.
party_keywords = {
    'BJP': ['BJP', 'Bharatiya Janata Party'],
    'INC': ['Congress', 'Indian National Congress', 'INC'],
    'AAP': ['AAP', 'Aam Aadmi Party'],
    'CPI': ['CPI', 'Communist Party of India'],
    'CPM': ['CPM', 'Communist Party of India (Marxist)', 'CPI(M)'],
    'NCP': ['NCP', 'Nationalist Congress Party'],
    'BSP': ['BSP', 'Bahujan Samaj Party'],
    'SP': ['SP', 'Samajwadi Party'],
    'RJD': ['RJD', 'Rashtriya Janata Dal'],
    'JD(U)': ['JD(U)', 'Janata Dal (United)'],
    'TMC': ['TMC', 'All India Trinamool Congress', 'Trinamool Congress'],
    'AIADMK': ['AIADMK', 'All India Anna Dravida Munnetra Kazhagam'],
    'DMK': ['DMK', 'Dravida Munnetra Kazhagam'],
    # The alias used to be listed twice; one entry is sufficient.
    'Shiv Sena': ['Shiv Sena'],
    'TRS': ['TRS', 'Telangana Rashtra Samithi'],
    'YSRCP': ['YSRCP', 'Yuvajana Sramika Rythu Congress Party'],
    'TDP': ['TDP', 'Telugu Desam Party'],
    'LJP': ['LJP', 'Lok Janshakti Party'],
    'RLD': ['RLD', 'Rashtriya Lok Dal'],
    'AIMIM': ['AIMIM', 'All India Majlis-e-Ittehadul Muslimeen'],
    'JD(S)': ['JD(S)', 'Janata Dal (Secular)'],
    'INLD': ['INLD', 'Indian National Lok Dal'],
    'JMM': ['JMM', 'Jharkhand Mukti Morcha'],
    'SAD': ['SAD', 'Shiromani Akali Dal'],
    'RSP': ['RSP', 'Revolutionary Socialist Party'],
    'AGP': ['AGP', 'Asom Gana Parishad'],
    'BPF': ['BPF', 'Bodoland People\'s Front'],
    'SDF': ['SDF', 'Sikkim Democratic Front'],
    # Mizo National Front is abbreviated MNF; the former 'MNDF' entry never
    # matched the party and is the acronym of an unrelated organisation.
    'MNF': ['MNF', 'Mizo National Front'],
    'UDP': ['UDP', 'United Democratic Party (Meghalaya)'],
    'NPF': ['NPF', 'Naga People\'s Front'],
    'ZPM': ['ZPM', 'Zoram People\'s Movement'],
    'KC(M)': ['KC(M)', 'Kerala Congress (M)'],
    'PDP': ['PDP', 'Peoples Democratic Party'],
    'NC': ['NC', 'National Conference'],
}

# Function to extract party names from text
def extract_parties(text):
    """List the party labels mentioned in *text*.

    Runs the spaCy pipeline, keeps the entities labelled ORG and returns, in
    ``party_keywords`` order, each party for which at least one alias equals
    the text of such an entity exactly.
    """
    org_names = {ent.text for ent in nlp(text).ents if ent.label_ == "ORG"}
    return [
        party
        for party, aliases in party_keywords.items()
        if not org_names.isdisjoint(aliases)
    ]

# Function to get sentiment of a text
def get_sentiment(text):
    """Return the 'compound' polarity score the module-level analyzer gives *text*."""
    return analyzer.polarity_scores(text)['compound']

# Function to determine the main party based on weighted sentiment
def determine_main_party_weighted(text):
    """Choose one party label for an article.

    Each sentence is scored with ``get_sentiment`` and credited to every party
    ``extract_parties`` finds in it. The party with the highest mean sentence
    score wins; when that best mean is exactly 0 the most frequently mentioned
    party is returned instead, and 'Unknown' when no party was mentioned.

    Note: parties that are never mentioned take part with a mean of 0, so a
    party whose mean is negative can only be selected via the mention-count
    fallback.
    """
    totals = dict.fromkeys(party_keywords, 0)
    mentions = dict.fromkeys(party_keywords, 0)

    for sent in nlp(text).sents:
        score = get_sentiment(sent.text)
        for name in extract_parties(sent.text):
            totals[name] += score
            mentions[name] += 1

    averages = {}
    for name in party_keywords:
        if mentions[name] > 0:
            averages[name] = totals[name] / mentions[name]
        else:
            averages[name] = 0

    best_by_average = max(averages, key=averages.get)
    if averages[best_by_average] != 0:
        return best_by_average

    best_by_count = max(mentions, key=mentions.get)
    if mentions[best_by_count] != 0:
        return best_by_count
    return 'Unknown'

# Apply sentiment analysis.
# This cell does not define analyze_sentiment or get_sentiment_label, so on a
# fresh kernel the original lines raised NameError. The score now comes from
# get_sentiment (defined above, returns the same VADER compound value) and the
# labelling helper is defined here.
def get_sentiment_label(score, threshold=0.05):
    """Label a compound score 'Positive', 'Negative' or 'Neutral'.

    Scores at or above ``threshold`` are positive, at or below ``-threshold``
    negative; 0.05 is the cut-off recommended for VADER compound scores.
    """
    if score >= threshold:
        return 'Positive'
    if score <= -threshold:
        return 'Negative'
    return 'Neutral'


# df_filtered comes from boolean indexing; take an explicit copy before adding
# columns so pandas does not raise SettingWithCopyWarning.
df_filtered = df_filtered.copy()
df_filtered['sentiment_score'] = df_filtered['full_content'].apply(get_sentiment)
df_filtered['party'] = df_filtered['full_content'].apply(determine_main_party_weighted)
df_filtered['sentiment_label'] = df_filtered['sentiment_score'].apply(get_sentiment_label)

# Filter out rows where the party is 'Unknown'
df_final = df_filtered[df_filtered['party'] != 'Unknown']

# Check the number of rows processed
processed_rows = len(df_final)
total_rows = len(df_filtered)
print(f"Sentiment analysis completed for {processed_rows} out of {total_rows} rows with identified parties.")

# Save filtered DataFrame to Excel
file_path = '/content/output.xlsx'
df_final.to_excel(file_path, sheet_name='Sheet1', index=False)

if os.path.exists(file_path):
    print("Excel file found. Attempting to download...")
    files.download(file_path)
else:
    print("Excel file not found.")

# Save filtered DataFrame to CSV
csv_file_path = '/content/output_full_content.csv'
df_final.to_csv(csv_file_path, index=False)

# Verify the content of the saved CSV
if os.path.exists(csv_file_path):
    print("CSV file with full content found.")
    df_check = pd.read_csv(csv_file_path)
    print("Preview of saved CSV:")
    print(df_check.head())  # Display the first few rows for verification
    files.download(csv_file_path)
else:
    print("CSV file not found.")

# Save bad request URLs to Excel
bad_requests_df = pd.DataFrame({'Bad URLs': bad_requests})
bad_requests_file_path = '/content/badrequests.xlsx'
bad_requests_df.to_excel(bad_requests_file_path, index=False)

if os.path.exists(bad_requests_file_path):
    print("Bad request URLs saved to 'badrequests.xlsx'")
else:
    print("Failed to save bad request URLs.")

print("Script executed successfully")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fetching articles...
Articles fetched successfully.
Fetching article content...
Filtered DataFrame has 85 rows.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['sentiment_score'] = df_filtered['full_content'].apply(analyze_sentiment)


Sentiment analysis completed for 58 out of 85 rows with identified parties.
Excel file found. Attempting to download...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['party'] = df_filtered['full_content'].apply(determine_main_party_weighted)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['sentiment_label'] = df_filtered['sentiment_score'].apply(get_sentiment_label)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

CSV file with full content found.
Preview of saved CSV:
                                              source  \
0  {'id': 'business-insider', 'name': 'Business I...   
1                     {'id': 'time', 'name': 'Time'}   
2  {'id': 'business-insider', 'name': 'Business I...   
3  {'id': 'business-insider', 'name': 'Business I...   
4        {'id': None, 'name': 'Yahoo Entertainment'}   

                                              author  \
0                                     Rebecca Rommen   
1                                    Astha Rajvanshi   
2                                        Matthew Loh   
3                                         Tom Porter   
4  Vandinika Shukla, Harvard Kennedy School and B...   

                                               title  \
0  Indian authorities seize over $1 billion worth...   
1  The Controversy Over a New Population Study Fr...   
2  It's been 3 days since Modi won, and we're alr...   
3  The competition between India and China is 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Bad request URLs saved to 'badrequests.xlsx'
Script executed successfully
