In [22]:
import pandas as pd

In [24]:
df = pd.read_csv('news_data.csv')

In [25]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,0,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,Why You Need to Stop Clicking Sponsored Google...,"We all google, so we’re all used to Google’s q...",https://lifehacker.com/why-you-need-to-stop-cl...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T23:30:00Z,"We all google, so were all used to Googles qui..."
1,1,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,This YouTube Video Could Crash Your Pixel,Smartphones these days are powerful. Some can ...,https://lifehacker.com/this-youtube-video-coul...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T18:30:00Z,Smartphones these days are powerful. Some can ...
2,2,"{'id': None, 'name': 'Lifehacker.com'}",Daniel Oropeza,You Can Get a Lifetime Subscription to Scriven...,Scrivener 3 is giving their lifetime subscript...,https://lifehacker.com/you-can-get-a-lifetime-...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-24T18:30:00Z,Scrivener 3 is giving their lifetime subscript...
3,3,"{'id': 'engadget', 'name': 'Engadget'}",Mariella Moon,Google Glass is set to disappear (again),Google will no longer sell its Glass AR smart ...,https://www.engadget.com/google-retires-glass-...,https://s.yimg.com/uu/api/res/1.2/HB77RtCiVNln...,2023-03-16T08:25:12Z,Google will no longer sell its Glass AR smart ...
4,4,"{'id': 'engadget', 'name': 'Engadget'}",Igor Bonifacic,Google Pixel 7 phones are cheaper than ever ri...,If you’re in the market for a new Android phon...,https://www.engadget.com/google-pixel-7-phones...,https://s.yimg.com/uu/api/res/1.2/COiwL90z9LaM...,2023-03-19T14:52:46Z,All products recommended by Engadget are selec...


In [26]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def fine_grained_sentimental_analysis(content):   
    tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")   
    tokens = tokenizer.encode(content, return_tensors='pt', truncation=True, padding=True)
    result = model(tokens)
    result.logits
    sentiment_score = int(torch.argmax(result.logits))+1
    return sentiment_score

In [27]:
import spacy
import contextlib
import re
# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 


#use this utility function to get the preprocessed text data
def preprocess(text):
    with contextlib.suppress(Exception):
        # remove special characters except full stop and apostrophe
        text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)

        # text = text.lower()  # convert text to lowercase
        text = text.strip()  # remove leading and trailing whitespaces
        text = text.encode('ascii', 'ignore').decode('ascii')  # remove non-ascii characters

        # split text into words without messing up the punctuation
        text = re.findall(r"[\w']+|[.,!?;]", text)
    
        text= ' '.join(text)
        return text.replace(' .', '.')

In [28]:
df['preprocessed_title'] = df['title'].apply(preprocess)

In [29]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,preprocessed_title
0,0,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,Why You Need to Stop Clicking Sponsored Google...,"We all google, so we’re all used to Google’s q...",https://lifehacker.com/why-you-need-to-stop-cl...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T23:30:00Z,"We all google, so were all used to Googles qui...",Why You Need to Stop Clicking Sponsored Google...
1,1,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,This YouTube Video Could Crash Your Pixel,Smartphones these days are powerful. Some can ...,https://lifehacker.com/this-youtube-video-coul...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T18:30:00Z,Smartphones these days are powerful. Some can ...,This YouTube Video Could Crash Your Pixel
2,2,"{'id': None, 'name': 'Lifehacker.com'}",Daniel Oropeza,You Can Get a Lifetime Subscription to Scriven...,Scrivener 3 is giving their lifetime subscript...,https://lifehacker.com/you-can-get-a-lifetime-...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-24T18:30:00Z,Scrivener 3 is giving their lifetime subscript...,You Can Get a Lifetime Subscription to Scriven...
3,3,"{'id': 'engadget', 'name': 'Engadget'}",Mariella Moon,Google Glass is set to disappear (again),Google will no longer sell its Glass AR smart ...,https://www.engadget.com/google-retires-glass-...,https://s.yimg.com/uu/api/res/1.2/HB77RtCiVNln...,2023-03-16T08:25:12Z,Google will no longer sell its Glass AR smart ...,Google Glass is set to disappear again
4,4,"{'id': 'engadget', 'name': 'Engadget'}",Igor Bonifacic,Google Pixel 7 phones are cheaper than ever ri...,If you’re in the market for a new Android phon...,https://www.engadget.com/google-pixel-7-phones...,https://s.yimg.com/uu/api/res/1.2/COiwL90z9LaM...,2023-03-19T14:52:46Z,All products recommended by Engadget are selec...,Google Pixel 7 phones are cheaper than ever ri...


In [30]:
df['score'] = df['preprocessed_title'].apply(fine_grained_sentimental_analysis)

In [31]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,preprocessed_title,score
0,0,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,Why You Need to Stop Clicking Sponsored Google...,"We all google, so we’re all used to Google’s q...",https://lifehacker.com/why-you-need-to-stop-cl...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T23:30:00Z,"We all google, so were all used to Googles qui...",Why You Need to Stop Clicking Sponsored Google...,1
1,1,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,This YouTube Video Could Crash Your Pixel,Smartphones these days are powerful. Some can ...,https://lifehacker.com/this-youtube-video-coul...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T18:30:00Z,Smartphones these days are powerful. Some can ...,This YouTube Video Could Crash Your Pixel,1
2,2,"{'id': None, 'name': 'Lifehacker.com'}",Daniel Oropeza,You Can Get a Lifetime Subscription to Scriven...,Scrivener 3 is giving their lifetime subscript...,https://lifehacker.com/you-can-get-a-lifetime-...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-24T18:30:00Z,Scrivener 3 is giving their lifetime subscript...,You Can Get a Lifetime Subscription to Scriven...,5
3,3,"{'id': 'engadget', 'name': 'Engadget'}",Mariella Moon,Google Glass is set to disappear (again),Google will no longer sell its Glass AR smart ...,https://www.engadget.com/google-retires-glass-...,https://s.yimg.com/uu/api/res/1.2/HB77RtCiVNln...,2023-03-16T08:25:12Z,Google will no longer sell its Glass AR smart ...,Google Glass is set to disappear again,2
4,4,"{'id': 'engadget', 'name': 'Engadget'}",Igor Bonifacic,Google Pixel 7 phones are cheaper than ever ri...,If you’re in the market for a new Android phon...,https://www.engadget.com/google-pixel-7-phones...,https://s.yimg.com/uu/api/res/1.2/COiwL90z9LaM...,2023-03-19T14:52:46Z,All products recommended by Engadget are selec...,Google Pixel 7 phones are cheaper than ever ri...,5
5,5,"{'id': 'engadget', 'name': 'Engadget'}",Kris Holt,Google One's VPN will soon be available to all...,Google One\r\n is expanding its security featu...,https://www.engadget.com/google-ones-vpn-will-...,https://s.yimg.com/uu/api/res/1.2/VV6PFM6_u0Lp...,2023-03-08T17:00:06Z,Google One\r\n is expanding its security featu...,Google Ones VPN will soon be available to all ...,5
6,6,"{'id': 'engadget', 'name': 'Engadget'}",Kris Holt,Magic Eraser is coming to older Pixel phones a...,Google\r\n is bringing photo features that wer...,https://www.engadget.com/magic-eraser-is-comin...,https://s.yimg.com/uu/api/res/1.2/kzVwfXgqwh7L...,2023-02-23T17:00:19Z,Google is bringing photo features that were on...,Magic Eraser is coming to older Pixel phones a...,5
7,7,"{'id': 'engadget', 'name': 'Engadget'}",Mariella Moon,Google workers in Japan have joined a labor un...,Dozens of Google Japan employees have organize...,https://www.engadget.com/google-workers-in-jap...,https://s.yimg.com/uu/api/res/1.2/h_6UREc5G4Mp...,2023-03-02T06:44:17Z,Dozens of Google Japan employees have organize...,Google workers in Japan have joined a labor un...,1
8,8,"{'id': 'engadget', 'name': 'Engadget'}",Jon Fingas,Google TV's new family page helps you find kid...,Google TV may now be easier to use when you're...,https://www.engadget.com/google-tvs-new-family...,https://s.yimg.com/uu/api/res/1.2/NWenytPjZ.DP...,2023-02-23T17:00:34Z,Google TV may now be easier to use when you're...,Google TVs new family page helps you find kidf...,5
9,9,"{'id': 'engadget', 'name': 'Engadget'}",Andrew Tarantola,"Google is shoving generative AI into Gmail, Do...",Google has been scrambling to catch up to to O...,https://www.engadget.com/google-is-shoving-gen...,https://s.yimg.com/uu/api/res/1.2/iCNq2RcP3wmj...,2023-03-14T16:17:11Z,Google has been scrambling to catch up to to O...,Google is shoving generative AI into Gmail Doc...,5


In [32]:
df['score'].value_counts()

5    49
1    36
2     6
3     5
4     4
Name: score, dtype: int64

In [33]:
df['sentiment'] = df['score'].map({
    5: "Very Positive",
    4: "Positive",
    3: "Neutral",
    2: "Negative",
    1: "Very Negative"
})

In [34]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,preprocessed_title,score,sentiment
0,0,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,Why You Need to Stop Clicking Sponsored Google...,"We all google, so we’re all used to Google’s q...",https://lifehacker.com/why-you-need-to-stop-cl...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T23:30:00Z,"We all google, so were all used to Googles qui...",Why You Need to Stop Clicking Sponsored Google...,1,Very Negative
1,1,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,This YouTube Video Could Crash Your Pixel,Smartphones these days are powerful. Some can ...,https://lifehacker.com/this-youtube-video-coul...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T18:30:00Z,Smartphones these days are powerful. Some can ...,This YouTube Video Could Crash Your Pixel,1,Very Negative
2,2,"{'id': None, 'name': 'Lifehacker.com'}",Daniel Oropeza,You Can Get a Lifetime Subscription to Scriven...,Scrivener 3 is giving their lifetime subscript...,https://lifehacker.com/you-can-get-a-lifetime-...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-24T18:30:00Z,Scrivener 3 is giving their lifetime subscript...,You Can Get a Lifetime Subscription to Scriven...,5,Very Positive
3,3,"{'id': 'engadget', 'name': 'Engadget'}",Mariella Moon,Google Glass is set to disappear (again),Google will no longer sell its Glass AR smart ...,https://www.engadget.com/google-retires-glass-...,https://s.yimg.com/uu/api/res/1.2/HB77RtCiVNln...,2023-03-16T08:25:12Z,Google will no longer sell its Glass AR smart ...,Google Glass is set to disappear again,2,Negative
4,4,"{'id': 'engadget', 'name': 'Engadget'}",Igor Bonifacic,Google Pixel 7 phones are cheaper than ever ri...,If you’re in the market for a new Android phon...,https://www.engadget.com/google-pixel-7-phones...,https://s.yimg.com/uu/api/res/1.2/COiwL90z9LaM...,2023-03-19T14:52:46Z,All products recommended by Engadget are selec...,Google Pixel 7 phones are cheaper than ever ri...,5,Very Positive


In [35]:
df.to_csv('file.csv')

In [36]:
df_new = pd.read_csv('file.csv')

In [37]:
df_new.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,preprocessed_title,score,sentiment
0,0,0,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,Why You Need to Stop Clicking Sponsored Google...,"We all google, so we’re all used to Google’s q...",https://lifehacker.com/why-you-need-to-stop-cl...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T23:30:00Z,"We all google, so were all used to Googles qui...",Why You Need to Stop Clicking Sponsored Google...,1,Very Negative
1,1,1,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,This YouTube Video Could Crash Your Pixel,Smartphones these days are powerful. Some can ...,https://lifehacker.com/this-youtube-video-coul...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T18:30:00Z,Smartphones these days are powerful. Some can ...,This YouTube Video Could Crash Your Pixel,1,Very Negative
2,2,2,"{'id': None, 'name': 'Lifehacker.com'}",Daniel Oropeza,You Can Get a Lifetime Subscription to Scriven...,Scrivener 3 is giving their lifetime subscript...,https://lifehacker.com/you-can-get-a-lifetime-...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-24T18:30:00Z,Scrivener 3 is giving their lifetime subscript...,You Can Get a Lifetime Subscription to Scriven...,5,Very Positive
3,3,3,"{'id': 'engadget', 'name': 'Engadget'}",Mariella Moon,Google Glass is set to disappear (again),Google will no longer sell its Glass AR smart ...,https://www.engadget.com/google-retires-glass-...,https://s.yimg.com/uu/api/res/1.2/HB77RtCiVNln...,2023-03-16T08:25:12Z,Google will no longer sell its Glass AR smart ...,Google Glass is set to disappear again,2,Negative
4,4,4,"{'id': 'engadget', 'name': 'Engadget'}",Igor Bonifacic,Google Pixel 7 phones are cheaper than ever ri...,If you’re in the market for a new Android phon...,https://www.engadget.com/google-pixel-7-phones...,https://s.yimg.com/uu/api/res/1.2/COiwL90z9LaM...,2023-03-19T14:52:46Z,All products recommended by Engadget are selec...,Google Pixel 7 phones are cheaper than ever ri...,5,Very Positive


In [38]:
df_new['preprocessed_content'] = df_new['content'].apply(preprocess)

In [39]:
df_new.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,preprocessed_title,score,sentiment,preprocessed_content
0,0,0,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,Why You Need to Stop Clicking Sponsored Google...,"We all google, so we’re all used to Google’s q...",https://lifehacker.com/why-you-need-to-stop-cl...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T23:30:00Z,"We all google, so were all used to Googles qui...",Why You Need to Stop Clicking Sponsored Google...,1,Very Negative,We all google so were all used to Googles quir...
1,1,1,"{'id': None, 'name': 'Lifehacker.com'}",Jake Peterson,This YouTube Video Could Crash Your Pixel,Smartphones these days are powerful. Some can ...,https://lifehacker.com/this-youtube-video-coul...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-27T18:30:00Z,Smartphones these days are powerful. Some can ...,This YouTube Video Could Crash Your Pixel,1,Very Negative,Smartphones these days are powerful. Some can ...
2,2,2,"{'id': None, 'name': 'Lifehacker.com'}",Daniel Oropeza,You Can Get a Lifetime Subscription to Scriven...,Scrivener 3 is giving their lifetime subscript...,https://lifehacker.com/you-can-get-a-lifetime-...,https://i.kinja-img.com/gawker-media/image/upl...,2023-02-24T18:30:00Z,Scrivener 3 is giving their lifetime subscript...,You Can Get a Lifetime Subscription to Scriven...,5,Very Positive,Scrivener 3 is giving their lifetime subscript...
3,3,3,"{'id': 'engadget', 'name': 'Engadget'}",Mariella Moon,Google Glass is set to disappear (again),Google will no longer sell its Glass AR smart ...,https://www.engadget.com/google-retires-glass-...,https://s.yimg.com/uu/api/res/1.2/HB77RtCiVNln...,2023-03-16T08:25:12Z,Google will no longer sell its Glass AR smart ...,Google Glass is set to disappear again,2,Negative,Google will no longer sell its Glass AR smart ...
4,4,4,"{'id': 'engadget', 'name': 'Engadget'}",Igor Bonifacic,Google Pixel 7 phones are cheaper than ever ri...,If you’re in the market for a new Android phon...,https://www.engadget.com/google-pixel-7-phones...,https://s.yimg.com/uu/api/res/1.2/COiwL90z9LaM...,2023-03-19T14:52:46Z,All products recommended by Engadget are selec...,Google Pixel 7 phones are cheaper than ever ri...,5,Very Positive,All products recommended by Engadget are selec...


In [40]:
def giveBaseScore(text):
    if(text == "Very Negative"):
        return 200
    
    elif(text == "Negative"):
        return 100
    
    return 0

In [43]:
import csv
def process_csv(filename):  

    with open ('assets/negative-words.txt', 'r', encoding='utf-8') as file:
        negative_words_list = file.read().splitlines()

    with open ('assets/bad-words.txt', 'r', encoding='utf-8') as file:
        bad_words = file.read().splitlines()

    with open ('assets/countries.txt', 'r', encoding='utf-8') as file:
        countries = file.read().splitlines()

    with open('assets/lawsuit.txt', 'r', encoding='utf-8') as file:
        lawsuits = file.read().splitlines()

    with open('assets/harassement.txt', 'r', encoding='utf-8') as file:
        harassment = file.read().splitlines()



# ========================#
# Creating Final csv      #
# ========================#
    #definig charset
    with open('COMMON-PROCESSED.csv', 'w', encoding='utf-8', newline='') as summary:
        
        # read first row from Uber.csv
        with open(filename, 'r', encoding='utf-8') as file:
            df_new = pd.read_csv(filename)
            df_new['preprocessed_content'] = df_new['description'].apply(preprocess)
            try:
                reader = csv.reader(file)
                next(reader)

                # write to csv
                writer = csv.writer(summary)

                # do for every news article
                writer.writerows([[ "Index","Source","Author","Title", "Description", "Content", "Headline Sentiment", "Offense Rating", "Negative Words", "Offensive Words", "Tags"]])

                for idx, row in enumerate(reader, start=1):
                    raw_text = df_new['preprocessed_content'][idx]

                    headline = df_new['title'][idx]
                    headline_sentiment = df_new['sentiment'][idx]
                    offense_rating = giveBaseScore(df_new['sentiment'][idx])

                    negative_words=[]
                    offensive_words=[]
                    tags=[]

                    # tag as negative

                    nlp_text= nlp(raw_text)


                    # add custom entities
                    for word in nlp_text:
                        # if it is a negative word
                        if word.text.lower() in negative_words_list:
                            offense_rating+=10
                            negative_words.append(word.text)


                        # if it is a highly offensive word 
                        if word.text.lower() in bad_words:
                            offense_rating+=50
                            offensive_words.append(word.text)


                        # if the article is talks about lawsuits
                        if word.text.lower() in lawsuits:
                            offense_rating+=30
                            tags.append(word.text)

                        # if the article is about harassment
                        if word.text.lower() in harassment:
                            offense_rating+=50
                            tags.append(word.text)

                        # does article mention a country?
                        if word.text.lower() in countries:
                            tags.append(word.text)    

                        # does article mention a person
                        if word.ent_type_ == "PERSON":
                            tags.append(word.text) 
                        
                        if word.ent_type_ == "ORG":
                            tags.append(word.text)
                        
                        if word.ent_type_ == "GPE":
                            tags.append(word.text)


                    if offense_rating>20:
                        offense_rating-=10


                    # Write each row
                    writer.writerow(
                        [
                            idx,
                            df_new['source'][idx],
                            df_new['author'][idx],
                            headline,
                            df_new['description'][idx],
                            df_new['content'][idx],
                            headline_sentiment,
                            offense_rating,
                            list(set(negative_words)),
                            list(set(offensive_words)),
                            list(set(tags)),
                        ]
                    )
                    print(f"Article {idx} written to csv")

            except Exception as e:
                print(e)
                print(e.__class__)
                print(e.__doc__)
                print(e.__traceback__)

In [44]:
process_csv('file.csv')

Article 1 written to csv
Article 2 written to csv
Article 3 written to csv
Article 4 written to csv
Article 5 written to csv
Article 6 written to csv
Article 7 written to csv
Article 8 written to csv
Article 9 written to csv
Article 10 written to csv
Article 11 written to csv
Article 12 written to csv
Article 13 written to csv
Article 14 written to csv
Article 15 written to csv
Article 16 written to csv
Article 17 written to csv
Article 18 written to csv
Article 19 written to csv
Article 20 written to csv
Article 21 written to csv
Article 22 written to csv
Article 23 written to csv
Article 24 written to csv
Article 25 written to csv
Article 26 written to csv
Article 27 written to csv
Article 28 written to csv
Article 29 written to csv
Article 30 written to csv
Article 31 written to csv
Article 32 written to csv
Article 33 written to csv
Article 34 written to csv
Article 35 written to csv
Article 36 written to csv
Article 37 written to csv
Article 38 written to csv
Article 39 written to