In [5]:
import pandas as pd

In [None]:
df = pd.read_csv('data.csv')

In [None]:
df.head(5)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def fine_grained_sentimental_analysis(content):   
    tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")   
    tokens = tokenizer.encode(content, return_tensors='pt', truncation=True, padding=True)
    result = model(tokens)
    result.logits
    sentiment_score = int(torch.argmax(result.logits))+1
    return sentiment_score

In [7]:
import spacy
import contextlib
import re
# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 


#use this utility function to get the preprocessed text data
def preprocess(text):
    with contextlib.suppress(Exception):
        # remove special characters except full stop and apostrophe
        text = re.sub(r'[^a-zA-Z0-9\s.]', '', text)

        # text = text.lower()  # convert text to lowercase
        text = text.strip()  # remove leading and trailing whitespaces
        text = text.encode('ascii', 'ignore').decode('ascii')  # remove non-ascii characters

        # split text into words without messing up the punctuation
        text = re.findall(r"[\w']+|[.,!?;]", text)
    
        text= ' '.join(text)
        return text.replace(' .', '.')

In [None]:
df['preprocessed_title'] = df['title'].apply(preprocess)

In [None]:
df.head(5)

In [None]:
df['score'] = df['preprocessed_title'].apply(fine_grained_sentimental_analysis)

In [None]:
df.head(10)

In [None]:
df['score'].value_counts()

In [None]:
df['sentiment'] = df['score'].map({
    5: "Very Positive",
    4: "Positive",
    3: "Neutral",
    2: "Negative",
    1: "Very Negative"
})

In [None]:
df.head(5)

In [None]:
df.to_csv('file.csv')

In [None]:
df_new = pd.read_csv('file.csv')

In [None]:
df_new.head(5)

In [48]:
df_new['preprocessed_content'] = df_new['content'].apply(preprocess)

In [49]:
df_new.head(5)

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,preprocessed_title,score,sentiment,preprocessed_content
0,"{'id': 'reuters', 'name': 'Reuters'}",,IBM reports highest annual revenue growth in a...,"IBM Corp <a href=""https://www.reuters.com/comp...",https://www.reuters.com/technology/ibm-reports...,https://www.reuters.com/resizer/FjOhONjPelMPMz...,2023-01-25T21:12:00Z,Jan 25 (Reuters) - IBM Corp (IBM.N) on Wednesd...,IBM report high annual revenue growth decade r...,5,Very Positive,Jan 25 Reuters IBM Corp IBM. N on Wednesday re...
1,"{'id': 'business-insider', 'name': 'Business I...",insider@insider.com (Yanet Borrego),I was a career coach on the side for years bef...,Yanet Borrego spent nine years as an engineer ...,https://www.businessinsider.com/quit-6-figure-...,https://i.insider.com/63c98aeceee94d001a78fbc5...,2023-01-23T18:55:16Z,I spent seven years at ExxonMobil and two year...,career coach year quit 6 figure consulting job...,5,Very Positive,I spent seven years at ExxonMobil and two year...
2,"{'id': 'business-insider', 'name': 'Business I...",amok@insider.com (Aaron Mok),How people are making money on Canva and turni...,Workers are leaving their full-time jobs to st...,https://www.businessinsider.com/workers-leavin...,https://i.insider.com/63612637ade71a00193dc7db...,2023-01-28T14:08:25Z,After a tech startup fired 31-year-old Shruti ...,people make money Canva turn graphic design hu...,5,Very Positive,After a tech startup fired 31yearold Shruti Pa...
3,"{'id': None, 'name': 'VentureBeat'}",Dean Takahashi,Consumer and business interest in the metavers...,Consumers and businesses are getting more inte...,https://venturebeat.com/games/consumer-and-bus...,https://venturebeat.com/wp-content/uploads/202...,2023-01-04T13:00:00Z,Connect with gaming and metaverse leaders onli...,consumer business interest metaverse grow | ac...,5,Very Positive,Connect with gaming and metaverse leaders onli...
4,"{'id': None, 'name': 'VentureBeat'}","Michael Biltz, Accenture Labs, Marc Carrel-Bil...",Plan now for the internetâ€™s transformation b...,How Web3 and the metaverse will transform the ...,https://venturebeat.com/virtual/plan-now-for-t...,https://venturebeat.com/wp-content/uploads/202...,2023-01-29T16:20:00Z,Check out all the on-demand sessions from the ...,plan internetâ€ ™ s transformation metaverse Web3,4,Positive,Check out all the ondemand sessions from the I...


In [9]:
def giveBaseScore(text):
    if(text == "Very Negative"):
        return 200
    
    elif(text == "Negative"):
        return 100
    
    return 0

In [11]:
import csv
def process_csv(filename):  

    with open ('assets/negative-words.txt', 'r', encoding='utf-8') as file:
        negative_words_list = file.read().splitlines()

    with open ('assets/bad-words.txt', 'r', encoding='utf-8') as file:
        bad_words = file.read().splitlines()

    with open ('assets/countries.txt', 'r', encoding='utf-8') as file:
        countries = file.read().splitlines()

    with open('assets/lawsuit.txt', 'r', encoding='utf-8') as file:
        lawsuits = file.read().splitlines()

    with open('assets/harassement.txt', 'r', encoding='utf-8') as file:
        harassment = file.read().splitlines()



# ========================#
# Creating Final csv      #
# ========================#
    #definig charset
    with open('COMMON-PROCESSED.csv', 'w', encoding='utf-8', newline='') as summary:
        
        # read first row from Uber.csv
        with open(filename, 'r', encoding='utf-8') as file:
            df_new = pd.read_csv(filename)
            df_new['preprocessed_content'] = df_new['content'].apply(preprocess)
            try:
                reader = csv.reader(file)
                next(reader)

                # write to csv
                writer = csv.writer(summary)

                # do for every news article
                writer.writerows([["Index", "Source", "Author", "Title", "Description", "Content", "Headline Sentiment", "Offense Rating", "Negative Words", "Offensive Words", "Tags"]])

                for idx, row in enumerate(reader, start=1):
                    raw_text = df_new['preprocessed_content'][idx]

                    headline = df_new['title'][idx]
                    headline_sentiment = df_new['sentiment'][idx]
                    offense_rating = giveBaseScore(df_new['sentiment'][idx])

                    negative_words=[]
                    offensive_words=[]
                    tags=[]

                    # tag as negative

                    nlp_text= nlp(raw_text)


                    # add custom entities
                    for word in nlp_text:
                        # if it is a negative word
                        if word.text.lower() in negative_words_list:
                            offense_rating+=10
                            negative_words.append(word.text)


                        # if it is a highly offensive word 
                        if word.text.lower() in bad_words:
                            offense_rating+=50
                            offensive_words.append(word.text)


                        # if the article is talks about lawsuits
                        if word.text.lower() in lawsuits:
                            offense_rating+=30
                            tags.append(word.text)

                        # if the article is about harassment
                        if word.text.lower() in harassment:
                            offense_rating+=50
                            tags.append(word.text)

                        # does article mention a country?
                        if word.text.lower() in countries:
                            tags.append(word.text)    

                        # does article mention a person
                        if word.ent_type_ == "PERSON":
                            tags.append(word.text) 
                        
                        if word.ent_type_ == "ORG":
                            tags.append(word.text)
                        
                        if word.ent_type_ == "GPE":
                            tags.append(word.text)


                    if offense_rating>20:
                        offense_rating-=10


                    # Write each row
                    writer.writerow(
                        [
                            idx,
                            df_new['source'][idx],
                            df_new['author'][idx],
                            headline,
                            df_new['description'][idx],
                            df_new['content'][idx],
                            headline_sentiment,
                            offense_rating,
                            list(set(negative_words)),
                            list(set(offensive_words)),
                            list(set(tags)),
                        ]
                    )
                    print(f"Article {idx} written to csv")

            except Exception as e:
                print(e)
                print(e.__class__)
                print(e.__doc__)
                print(e.__traceback__)

In [12]:
process_csv('file.csv')

Article 1 written to csv
Article 2 written to csv
Article 3 written to csv
Article 4 written to csv
Article 5 written to csv
Article 6 written to csv
Article 7 written to csv
Article 8 written to csv
Article 9 written to csv
Article 10 written to csv
Article 11 written to csv
Article 12 written to csv
Article 13 written to csv
Article 14 written to csv
Article 15 written to csv
Article 16 written to csv
Article 17 written to csv
Article 18 written to csv
Article 19 written to csv
Article 20 written to csv
Article 21 written to csv
Article 22 written to csv
Article 23 written to csv
Article 24 written to csv
Article 25 written to csv
Article 26 written to csv
Article 27 written to csv
Article 28 written to csv
Article 29 written to csv
Article 30 written to csv
Article 31 written to csv
Article 32 written to csv
Article 33 written to csv
Article 34 written to csv
Article 35 written to csv
Article 36 written to csv
Article 37 written to csv
Article 38 written to csv
Article 39 written to