# News Articles Seniment Analysis 

# Importing Librarires

In [1]:
#importing the required libraries

# linear algebra
import numpy as np

# data Processing
import pandas as pd

# data visualisation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import pyplot as plt
from matplotlib import style



# Loading Data into Pandas DataFrame

In [2]:
import pandas as pd

# Create a DataFrame from the JSON data
df = pd.read_csv(r"news_articles.csv")

# Display the DataFrame
df.head()

Unnamed: 0,Content,Description,Published At,Title,Source ID,Source Name
0,When Emma Vaughan left for work after turning ...,It is the third local house fire in two weeks ...,2024-03-09T12:38:26Z,Bride-to-be 'devastated' after tumble dryer fire,bbc-news,BBC News
1,An AI tool tested by an NHS hospital trust suc...,An AI tool called Mia found missed breast canc...,2024-03-21T01:16:08Z,NHS AI test spots tiny cancers missed by doctors,bbc-news,BBC News
2,Join Fox News for access to this content\r\nPl...,Apple's Journal app may make some of your pers...,2024-03-06T11:00:15Z,The iPhone privacy setting you need to turn off,fox-news,Fox News
3,Join Fox News for access to this content\r\nPl...,A stealthy technology known as EM Eye allows e...,2024-03-21T14:00:21Z,Creepy tool lets criminal hackers access your ...,fox-news,Fox News
4,Join Fox News for access to this content\r\nPl...,"Tech guru Kurt ""CyberGuy"" Knutsson reveals the...",2024-03-20T14:00:15Z,The 4 best secret note-taking apps that can ch...,fox-news,Fox News


# Missing Values

In [3]:
missing_values = df.isnull().sum()

# Print the count of missing values

print("Missing Values:")

print(missing_values)

# Calculate the missing value ratio

missing_ratio = df.isnull().mean()

# Print the missing value ratio

print("Missing Value Ratio:")

print(missing_ratio)

Missing Values:
Content          0
Description      0
Published At     0
Title            0
Source ID       59
Source Name      0
dtype: int64
Missing Value Ratio:
Content         0.000000
Description     0.000000
Published At    0.000000
Title           0.000000
Source ID       0.241803
Source Name     0.000000
dtype: float64


In [4]:
# Dropping Raws which have null values
# Did this step because the rest columns with missing values are the review related text and replacement of the text will not be accurate.
df = df.dropna()

#remove duplicates
df = df.drop_duplicates()

In [5]:
# Calculate the missing value ratio

missing_ratio = df.isnull().mean()

# Print the missing value ratio

print("Missing Value Ratio:")

print(missing_ratio)

Missing Value Ratio:
Content         0.0
Description     0.0
Published At    0.0
Title           0.0
Source ID       0.0
Source Name     0.0
dtype: float64



# Data Exploration

In [6]:
import pandas as pd

#examine the structure of the data
print("Data Structure:")
print(df.info())
print("-------")
print("\n")

print("Data Variables:")
print(df.columns)
print("-------")
print("\n")

Data Structure:
<class 'pandas.core.frame.DataFrame'>
Index: 149 entries, 0 to 230
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Content       149 non-null    object
 1   Description   149 non-null    object
 2   Published At  149 non-null    object
 3   Title         149 non-null    object
 4   Source ID     149 non-null    object
 5   Source Name   149 non-null    object
dtypes: object(6)
memory usage: 8.1+ KB
None
-------


Data Variables:
Index(['Content', 'Description', 'Published At', 'Title', 'Source ID',
       'Source Name'],
      dtype='object')
-------




# Descriptive Statistics

In [7]:
# Print data distributions
print("\nData Distributions:")
print(df.describe())

# Print data distributions for object columns
print("\nData Objects:")
print(df.describe(include=['object']))



Data Distributions:
                                                  Content  \
count                                                 149   
unique                                                148   
top     Join Fox News for access to this content\r\nPl...   
freq                                                    2   

                                              Description  \
count                                                 149   
unique                                                144   
top     Stay up to date on the latest AI technology ad...   
freq                                                    6   

                Published At  \
count                    149   
unique                   149   
top     2024-03-09T12:38:26Z   
freq                       1   

                                                   Title Source ID Source Name  
count                                                149       149         149  
unique                                      

# Text Preprocessing

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

def preprocess_text(text):
    # Lowercasing
    
    text = text.lower()

    # Removing special characters and punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)

    # Tokenization
    tokens = text.split()

    # Removing stop words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]

    # Removing numerical values
    tokens = [word for word in tokens if not word.isdigit()]

    # Removing extra whitespaces
    tokens = [word.strip() for word in tokens if word.strip()]

    # Joining tokens back to a single string
    processed_text = " ".join(tokens)
    return processed_text

# Applying text preprocessing to the 'Review' column of the DataFrame
df['Content'] = df['Content'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\patel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\patel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\patel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Sentiment Analysis 

In [9]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

def analyze_sentiment(text):
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    sentiment = sentiment_scores['compound']

    
    tokens = nltk.word_tokenize(text)
    word_scores = []
    for token in tokens:
        word_score = sid.polarity_scores(token)
        word_scores.append((token, word_score))
    #print("word scores:", word_scores)
    
    if sentiment >= 0.05:
        return 'Positive'
    elif sentiment <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

    
# Applying sentiment analysis to the 'Review' column of the DataFrame
df['Sentiment_token'] = df['Content'].apply(analyze_sentiment)


In [10]:
df.head()

Unnamed: 0,Content,Description,Published At,Title,Source ID,Source Name,Sentiment_token
0,emma vaughan left work turning tumble dryer id...,It is the third local house fire in two weeks ...,2024-03-09T12:38:26Z,Bride-to-be 'devastated' after tumble dryer fire,bbc-news,BBC News,Negative
1,ai tool tested nhs hospital trust successfully...,An AI tool called Mia found missed breast canc...,2024-03-21T01:16:08Z,NHS AI test spots tiny cancers missed by doctors,bbc-news,BBC News,Negative
2,join fox news access content plus special acce...,Apple's Journal app may make some of your pers...,2024-03-06T11:00:15Z,The iPhone privacy setting you need to turn off,fox-news,Fox News,Positive
3,join fox news access content plus special acce...,A stealthy technology known as EM Eye allows e...,2024-03-21T14:00:21Z,Creepy tool lets criminal hackers access your ...,fox-news,Fox News,Positive
4,join fox news access content plus special acce...,"Tech guru Kurt ""CyberGuy"" Knutsson reveals the...",2024-03-20T14:00:15Z,The 4 best secret note-taking apps that can ch...,fox-news,Fox News,Positive


In [11]:
# Save the updated DataFrame to a new CSV file
df.to_excel('Sentiment_Articles.xlsx', index=False)

# Machine Learning Models

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [13]:
# Split the data into features (X) and target (y)
X = df['Content']
y = df['Sentiment_token']


#convert Text data into Numerical feature using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Logistic Regression

In [15]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

lr_predictions = lr_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, lr_predictions)
print(f"Logistic Regression Accuracy: {accuracy_lr:.2f}")

Logistic Regression Accuracy: 0.67


### Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

nb_predictions = nb_model.predict(X_test)
accuracy_nb = accuracy_score(y_test, nb_predictions)
print(f"Naive Bayes Accuracy: {accuracy_nb:.2f}")

Naive Bayes Accuracy: 0.43


### Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

rf_predictions = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")

Random Forest Accuracy: 0.67
