In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from google.colab import drive
from nltk.corpus import stopwords
import string
import nltk
import chardet
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from tqdm import tqdm_notebook as tqdm
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import statsmodels.api as sm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Datasets Reference:

Dataset 1 (Pre-labelled):
https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news

Dataset 2: We manually labelled a subset of
https://www.kaggle.com/datasets/therohk/million-headlines

In [None]:
# Mount Google Drive at /content/drive (google.colab helper) so the CSV paths under MyDrive used below resolve
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Objective 1: Classification of News Headlines Based on Positive, Negative or Neutral Sentiment**

In [None]:
# Loading and cleaning Dataset #1

# Location of the pre-labelled CSV on the mounted Google Drive
file_path = '/content/drive/MyDrive/NLP/HS/New_Data/newdata.csv'

# Sniff the character encoding from the raw bytes instead of assuming one
with open(file_path, 'rb') as f:
    result = chardet.detect(f.read())

# Encoding name reported by chardet
encoding = result['encoding']

# The file has no header row: with pandas' default header handling the first
# record ('neutral', 'According to Gran , ...') was consumed as column names and
# silently dropped from the data. Read every line as data and name the two
# columns explicitly so that no labelled example is lost.
df_total = pd.read_csv(
    file_path,
    encoding=encoding,
    header=None,
    names=['Sentiment', 'News Headline'],
)

In [None]:
# First Look at the dataset #1
df_total.head()

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [None]:
# Label the two columns 'Sentiment' and 'News Headline'.
# rename() ignores keys that are not present, so this is safe to re-run.
df_total = df_total.rename(
    columns={
        'neutral': 'Sentiment',
        'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .': 'News Headline',
    }
)

# Show the relabelled frame
df_total

Unnamed: 0,Sentiment,News Headline
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


In [None]:
# Loading Dataset #2 - the manually labelled headlines (columns: headline_text, label)
df2 = pd.read_csv('/content/drive/MyDrive/NLP/HS/New_Data/labeled_headlines.csv')

### **Classification # 1: Positive vs Rest**

In [None]:
# Work on a copy of Dataset #1 so df_total stays untouched for the other scenarios
df_pos = df_total.copy()

# Positive-vs-rest target: positive -> 1, negative / neutral -> 0
sentiment_mapping = {'positive': 1, 'negative': 0, 'neutral': 0}

# Series.map builds the numeric column directly. Replacing strings with ints via
# .replace() depends on an object -> int downcast that newer pandas deprecates,
# which would leave an object-dtype label column. Any label missing from the
# mapping becomes NaN, so the integer cast fails loudly instead of leaking
# a stray string into the target.
df_pos['Sentiment'] = df_pos['Sentiment'].map(sentiment_mapping).astype('int64')

# Display the updated DataFrame
df_pos.head()

Unnamed: 0,Sentiment,News Headline
0,0,Technopolis plans to develop in stages an area...
1,0,The international electronic industry company ...
2,1,With the new production plant the company woul...
3,1,According to the company 's updated strategy f...
4,1,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [None]:
# Match Dataset #2's schema: same column names, text first and label second
df_pos = (
    df_pos
    .rename(columns={'News Headline': 'headline_text', 'Sentiment': 'label'})
    .loc[:, ['headline_text', 'label']]
)

# Show the reshaped frame
df_pos

Unnamed: 0,headline_text,label
0,Technopolis plans to develop in stages an area...,0
1,The international electronic industry company ...,0
2,With the new production plant the company woul...,1
3,According to the company 's updated strategy f...,1
4,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,1
...,...,...
4840,LONDON MarketWatch -- Share prices ended lower...,0
4841,Rinkuskiai 's beer sales fell by 6.5 per cent ...,0
4842,Operating profit fell to EUR 35.4 mn from EUR ...,0
4843,Net sales of the Paper segment decreased to EU...,0


In [None]:
df2.head()

Unnamed: 0,headline_text,label
0,melbourne and adelaide battered by winds,-1
1,woman dies after assault in cloverdale home,-1
2,police: no hope for missing eucumbene fisherman,-1
3,excited fans wait for paul mccartney at the ca...,1
4,serena through to us open quarter final,1


In [None]:
# Positive-vs-rest recoding for Dataset #2: 1 stays 1, -1 and 0 collapse to 0
sentiment_mapping = {1: 1, -1: 0, 0: 0}

# assign() returns a new frame, so df2 itself keeps its original three-way labels
df2_pos = df2.assign(label=df2['label'].replace(sentiment_mapping))

In [None]:
# Examining correctly labelled 'Positive Vs Rest' component of Dataset #2
df2_pos.head()

Unnamed: 0,headline_text,label
0,melbourne and adelaide battered by winds,0
1,woman dies after assault in cloverdale home,0
2,police: no hope for missing eucumbene fisherman,0
3,excited fans wait for paul mccartney at the ca...,1
4,serena through to us open quarter final,1


In [None]:
# Stack both datasets, then shuffle the rows with a fixed seed and renumber them
df1_pos = (
    pd.concat([df_pos, df2_pos], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the combined, shuffled frame
df1_pos.head()

Unnamed: 0,headline_text,label
0,EUR 220 million of the transaction considerati...,0
1,islam awareness workshops teach bundaberg comm...,1
2,latham hopeful after latest diagnosis,1
3,`` The number of collection errors fell consid...,1
4,The impact of this acquisition to Teleste 's n...,0


### Pre-Processing Data

In [None]:
def tokenize_text(text, **kwargs):
    """Lower-case `text` and split it into tokens with NLTK's word_tokenize.

    Any keyword arguments are forwarded to word_tokenize unchanged.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered, **kwargs)
    return tokens

# Tokenize every headline in the 'headline_text' column
df1_pos['tokenized_news'] = df1_pos['headline_text'].map(tokenize_text)

In [None]:
def lemmatize_text(tokens):
    """Return a list holding the WordNet lemma of each token, in input order.

    Each token is passed to WordNetLemmatizer.lemmatize with its default arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

# Lemmatize the token lists stored in 'tokenized_news'
df1_pos['lemmatized_news'] = df1_pos['tokenized_news'].map(lemmatize_text)

In [None]:
# Build the English stop-word set once; the previous version queried the corpus
# and rebuilt the set for every single row.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the tokens that are not NLTK English stop words, order preserved.

    Matching is exact and case-sensitive against NLTK's 'english' list.
    """
    return [token for token in tokens if token not in _ENGLISH_STOP_WORDS]

# Filter the token lists stored in 'lemmatized_news'.
# NOTE(review): tokens arrive here already lemmatized, so any stop word the
# lemmatizer altered will no longer match the list - confirm this order is intended.
df1_pos['text_no_stopwords'] = df1_pos['lemmatized_news'].apply(remove_stopwords)

In [None]:
# Removal of punctuation function
def remove_punctuation(tokens):
    """Return `tokens` without the entries made up solely of ASCII punctuation.

    A token is dropped when every one of its characters is in string.punctuation
    (empty tokens are dropped too). The previous `token not in string.punctuation`
    was a substring test against the punctuation string, so multi-character
    tokens such as '``', "''", '--' or '...' slipped through.
    """
    punctuation_chars = frozenset(string.punctuation)
    return [token for token in tokens if not punctuation_chars.issuperset(token)]

# Strip punctuation tokens from the stop-word-filtered lists
df1_pos['text_no_punctuation'] = df1_pos['text_no_stopwords'].map(remove_punctuation)

# Glue each token list back into one space-separated string
df1_pos['processed_text'] = df1_pos['text_no_punctuation'].map(' '.join)

In [None]:
# Vectorization using TF-IDF
def text_representation(data):
    """Fit a TfidfVectorizer on data['processed_text'] and return the dense matrix.

    Side effects: data['processed_text'] is overwritten in place with each
    document reduced to its distinct whitespace-separated tokens, and the
    matrix shape and feature names are printed.

    Returns:
        (X_tfidf, vectorizer): a DataFrame with one column per vocabulary term,
        and the fitted TfidfVectorizer.

    NOTE(review): the vectorizer is fitted on every row of `data`; if the caller
    splits train/test afterwards, the vocabulary and IDF weights have seen the
    test rows.
    """
    tfidf_vect_pos = TfidfVectorizer()

    # Keep one copy of each token per document. dict.fromkeys preserves first-seen
    # order, so the stored text is identical on every run (set() order varies with
    # string hash randomisation). str() also covers non-string entries such as NaN,
    # which makes a separate float/int conversion pass unnecessary.
    data['processed_text'] = data['processed_text'].apply(
        lambda text: " ".join(dict.fromkeys(str(text).split()))
    )
    X_tfidf = tfidf_vect_pos.fit_transform(data['processed_text'])

    feature_names = tfidf_vect_pos.get_feature_names_out()
    print(X_tfidf.shape)
    print(feature_names)

    # Densify the sparse matrix into a DataFrame with one column per term
    X_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)

    return X_tfidf, tfidf_vect_pos

# Apply the text_representation function
X_tfidf, tfidf_vect_pos = text_representation(df1_pos)

(9733, 14287)
['00' '000' '000063' ... 'ætehuolto' 'ðl' 'ˆeur']


### Fitting Data Models

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    df1_pos.loc[:, 'label'],
    test_size=0.3,
    random_state=42,
)

#### Logistic Regression

In [None]:
# Binary logistic regression: label 1 = positive headline, 0 = everything else.
# `multi_class='ovr'` is dropped: the argument is deprecated in recent scikit-learn
# and makes no difference for a two-class target.
log_r_classifier_pos = LogisticRegression()

# Fit the classifier on the training data
log_r_classifier_pos.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_r_classifier_pos.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.75
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.97      0.84      1997
           1       0.80      0.26      0.40       923

    accuracy                           0.75      2920
   macro avg       0.77      0.62      0.62      2920
weighted avg       0.76      0.75      0.70      2920



#### Multinomial Naive Bayes classifier

In [None]:
# Multinomial Naive Bayes; fit() returns the estimator, so build and train in one step
classifier_nb_pos = MultinomialNB().fit(X_train, y_train)

# Score the held-out rows
y_pred = classifier_nb_pos.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.72
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.98      0.83      1997
           1       0.79      0.14      0.24       923

    accuracy                           0.72      2920
   macro avg       0.75      0.56      0.53      2920
weighted avg       0.74      0.72      0.64      2920



#### Random Forest

In [None]:
# Random forest of 100 trees with a fixed seed; build and train in one step
classifier_rf_pos = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = classifier_rf_pos.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.76
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.94      0.84      1997
           1       0.73      0.36      0.48       923

    accuracy                           0.76      2920
   macro avg       0.74      0.65      0.66      2920
weighted avg       0.75      0.76      0.73      2920



#### Gradient Boosting Classifier

In [None]:
# Gradient boosting: 100 depth-3 trees, learning rate 0.1, fixed seed
classifier_gc_pos = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows
y_pred = classifier_gc_pos.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

#### Support Vector Machine Classifier

In [None]:
# Support vector classifier with scikit-learn's default settings
classifier_svc_pos = SVC().fit(X_train, y_train)

# Score the held-out rows
y_pred = classifier_svc_pos.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))



### **Classification # 2. Negative vs Rest**


In [None]:
# Work on a copy of Dataset #1 so df_total stays untouched for the other scenarios
df_neg = df_total.copy()

# Negative-vs-rest target: negative -> 1, positive / neutral -> 0
sentiment_mapping = {'positive': 0, 'negative': 1, 'neutral': 0}

# Series.map builds the numeric column directly. Replacing strings with ints via
# .replace() depends on an object -> int downcast that newer pandas deprecates.
# Labels missing from the mapping become NaN, so the integer cast fails loudly.
df_neg['Sentiment'] = df_neg['Sentiment'].map(sentiment_mapping).astype('int64')

# Display the updated DataFrame
df_neg.head()

Unnamed: 0,Sentiment,News Headline
0,0,Technopolis plans to develop in stages an area...
1,1,The international electronic industry company ...
2,0,With the new production plant the company woul...
3,0,According to the company 's updated strategy f...
4,0,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [None]:
# Match Dataset #2's schema: same column names, text first and label second
df_neg = (
    df_neg
    .rename(columns={'News Headline': 'headline_text', 'Sentiment': 'label'})
    .loc[:, ['headline_text', 'label']]
)

# Show the reshaped frame
df_neg

Unnamed: 0,headline_text,label
0,Technopolis plans to develop in stages an area...,0
1,The international electronic industry company ...,1
2,With the new production plant the company woul...,0
3,According to the company 's updated strategy f...,0
4,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,0
...,...,...
4840,LONDON MarketWatch -- Share prices ended lower...,1
4841,Rinkuskiai 's beer sales fell by 6.5 per cent ...,0
4842,Operating profit fell to EUR 35.4 mn from EUR ...,1
4843,Net sales of the Paper segment decreased to EU...,1


In [None]:
# Negative-vs-rest recoding for Dataset #2: -1 becomes 1, while 1 and 0 become 0
sentiment_mapping = {1: 0, -1: 1, 0: 0}

# assign() returns a new frame, so df2 itself keeps its original three-way labels
df2_neg = df2.assign(label=df2['label'].replace(sentiment_mapping))

In [None]:
# Examining df2
df2_neg.head()

Unnamed: 0,headline_text,label
0,melbourne and adelaide battered by winds,1
1,woman dies after assault in cloverdale home,1
2,police: no hope for missing eucumbene fisherman,1
3,excited fans wait for paul mccartney at the ca...,0
4,serena through to us open quarter final,0


In [None]:
# Stack both datasets, then shuffle the rows with a fixed seed and renumber them
df1_neg = (
    pd.concat([df_neg, df2_neg], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the combined, shuffled frame
df1_neg.head()

Unnamed: 0,headline_text,label
0,EUR 220 million of the transaction considerati...,0
1,islam awareness workshops teach bundaberg comm...,0
2,latham hopeful after latest diagnosis,0
3,`` The number of collection errors fell consid...,0
4,The impact of this acquisition to Teleste 's n...,0


#### Pre-Processing Negative Vs Rest

In [None]:
def tokenize_text(text, **kwargs):
    """Lower-case `text` and split it into tokens with NLTK's word_tokenize.

    Any keyword arguments are forwarded to word_tokenize unchanged.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered, **kwargs)
    return tokens

# Tokenize every headline in the 'headline_text' column
df1_neg['tokenized_news'] = df1_neg['headline_text'].map(tokenize_text)

In [None]:
def lemmatize_text(tokens):
    """Return a list holding the WordNet lemma of each token, in input order.

    Each token is passed to WordNetLemmatizer.lemmatize with its default arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

# Lemmatize the token lists stored in 'tokenized_news'
df1_neg['lemmatized_news'] = df1_neg['tokenized_news'].map(lemmatize_text)

In [None]:
# Build the English stop-word set once; the previous version queried the corpus
# and rebuilt the set for every single row.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the tokens that are not NLTK English stop words, order preserved.

    Matching is exact and case-sensitive against NLTK's 'english' list.
    """
    return [token for token in tokens if token not in _ENGLISH_STOP_WORDS]

# Filter the token lists stored in 'lemmatized_news'.
# NOTE(review): tokens arrive here already lemmatized, so any stop word the
# lemmatizer altered will no longer match the list - confirm this order is intended.
df1_neg['text_no_stopwords'] = df1_neg['lemmatized_news'].apply(remove_stopwords)

In [None]:
# Removal of punctuation function
def remove_punctuation(tokens):
    """Return `tokens` without the entries made up solely of ASCII punctuation.

    A token is dropped when every one of its characters is in string.punctuation
    (empty tokens are dropped too). The previous `token not in string.punctuation`
    was a substring test against the punctuation string, so multi-character
    tokens such as '``', "''", '--' or '...' slipped through.
    """
    punctuation_chars = frozenset(string.punctuation)
    return [token for token in tokens if not punctuation_chars.issuperset(token)]

# Strip punctuation tokens from the stop-word-filtered lists
df1_neg['text_no_punctuation'] = df1_neg['text_no_stopwords'].map(remove_punctuation)

# Glue each token list back into one space-separated string
df1_neg['processed_text'] = df1_neg['text_no_punctuation'].map(' '.join)

In [None]:
# TFIDF Vectorization
def text_representation(data):
    """Fit a TfidfVectorizer on data['processed_text'] and return the dense matrix.

    Side effects: data['processed_text'] is overwritten in place with each
    document reduced to its distinct whitespace-separated tokens, and the
    matrix shape and feature names are printed.

    Returns:
        (X_tfidf, vectorizer): a DataFrame with one column per vocabulary term,
        and the fitted TfidfVectorizer.

    NOTE(review): the vectorizer is fitted on every row of `data`; if the caller
    splits train/test afterwards, the vocabulary and IDF weights have seen the
    test rows.
    """
    tfidf_vect_neg = TfidfVectorizer()

    # Keep one copy of each token per document. dict.fromkeys preserves first-seen
    # order, so the stored text is identical on every run (set() order varies with
    # string hash randomisation). str() also covers non-string entries such as NaN,
    # which makes a separate float/int conversion pass unnecessary.
    data['processed_text'] = data['processed_text'].apply(
        lambda text: " ".join(dict.fromkeys(str(text).split()))
    )
    X_tfidf = tfidf_vect_neg.fit_transform(data['processed_text'])

    feature_names = tfidf_vect_neg.get_feature_names_out()
    print(X_tfidf.shape)
    print(feature_names)

    # Densify the sparse matrix into a DataFrame with one column per term
    X_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)

    return X_tfidf, tfidf_vect_neg

# Apply the text_representation function
X_tfidf, tfidf_vect_neg = text_representation(df1_neg)

(9733, 14287)
['00' '000' '000063' ... 'ætehuolto' 'ðl' 'ˆeur']


#### **Fitting Models (Neg Vs Rest) Scenario**

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    df1_neg.loc[:, 'label'],
    test_size=0.3,
    random_state=42,
)

#### Logistic Regression

In [None]:
# Binary logistic regression: label 1 = negative headline, 0 = everything else.
# `multi_class='ovr'` is dropped: the argument is deprecated in recent scikit-learn
# and makes no difference for a two-class target.
log_r_classifier_neg = LogisticRegression()

# Fit the classifier on the training data
log_r_classifier_neg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_r_classifier_neg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.81
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.96      0.88      2065
           1       0.83      0.43      0.57       855

    accuracy                           0.81      2920
   macro avg       0.82      0.70      0.72      2920
weighted avg       0.81      0.81      0.79      2920



#### Multinomial Naive Bayes classifier

In [None]:
# Multinomial Naive Bayes; fit() returns the estimator, so build and train in one step
classifier_nb_neg = MultinomialNB().fit(X_train, y_train)

# Score the held-out rows
y_pred = classifier_nb_neg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

#### Random Forest

In [None]:
# Random forest of 100 trees with a fixed seed; build and train in one step
classifier_rf_neg = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = classifier_rf_neg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

#### Gradient Boosting Classifier

In [None]:
# Gradient boosting: 100 depth-3 trees, learning rate 0.1, fixed seed
classifier_gb_neg = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows
y_pred = classifier_gb_neg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

#### Support Vector Machine Classifier

In [None]:
# Support vector classifier with scikit-learn's default settings
classifier_svc_neg = SVC().fit(X_train, y_train)

# Score the held-out rows
y_pred = classifier_svc_neg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


### **Scenario # 3. Neutral vs Rest**

In [None]:
# Work on a copy of Dataset #1 so df_total stays untouched for the other scenarios
df_neu = df_total.copy()

# Neutral-vs-rest target: neutral -> 1, positive / negative -> 0
sentiment_mapping = {'positive': 0, 'negative': 0, 'neutral': 1}

# Series.map builds the numeric column directly. Replacing strings with ints via
# .replace() depends on an object -> int downcast that newer pandas deprecates.
# Labels missing from the mapping become NaN, so the integer cast fails loudly.
df_neu['Sentiment'] = df_neu['Sentiment'].map(sentiment_mapping).astype('int64')

# Display the updated DataFrame
df_neu.head()

Unnamed: 0,Sentiment,News Headline
0,1,Technopolis plans to develop in stages an area...
1,0,The international electronic industry company ...
2,0,With the new production plant the company woul...
3,0,According to the company 's updated strategy f...
4,0,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [None]:
# Match Dataset #2's schema: same column names, text first and label second
df_neu = (
    df_neu
    .rename(columns={'News Headline': 'headline_text', 'Sentiment': 'label'})
    .loc[:, ['headline_text', 'label']]
)

# Show the reshaped frame
df_neu

Unnamed: 0,headline_text,label
0,Technopolis plans to develop in stages an area...,1
1,The international electronic industry company ...,0
2,With the new production plant the company woul...,0
3,According to the company 's updated strategy f...,0
4,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,0
...,...,...
4840,LONDON MarketWatch -- Share prices ended lower...,0
4841,Rinkuskiai 's beer sales fell by 6.5 per cent ...,1
4842,Operating profit fell to EUR 35.4 mn from EUR ...,0
4843,Net sales of the Paper segment decreased to EU...,0


In [None]:
# Neutral-vs-rest recoding for Dataset #2: 0 becomes 1, while 1 and -1 become 0
sentiment_mapping = {1: 0, -1: 0, 0: 1}

# assign() returns a new frame, so df2 itself keeps its original three-way labels
df2_neu = df2.assign(label=df2['label'].replace(sentiment_mapping))

In [None]:
# Stack both datasets, then shuffle the rows with a fixed seed and renumber them
df1_neu = (
    pd.concat([df_neu, df2_neu], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the combined, shuffled frame
df1_neu.head()

Unnamed: 0,headline_text,label
0,EUR 220 million of the transaction considerati...,1
1,islam awareness workshops teach bundaberg comm...,0
2,latham hopeful after latest diagnosis,0
3,`` The number of collection errors fell consid...,0
4,The impact of this acquisition to Teleste 's n...,1


#### Pre-Processing for Scenario 3

In [None]:
def tokenize_text(text, **kwargs):
    """Lower-case `text` and split it into tokens with NLTK's word_tokenize.

    Any keyword arguments are forwarded to word_tokenize unchanged.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered, **kwargs)
    return tokens

# Tokenize every headline in the 'headline_text' column
df1_neu['tokenized_news'] = df1_neu['headline_text'].map(tokenize_text)

In [None]:
def lemmatize_text(tokens):
    """Return a list holding the WordNet lemma of each token, in input order.

    Each token is passed to WordNetLemmatizer.lemmatize with its default arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

# Lemmatize the token lists stored in 'tokenized_news'
df1_neu['lemmatized_news'] = df1_neu['tokenized_news'].map(lemmatize_text)

In [None]:
# Build the English stop-word set once; the previous version queried the corpus
# and rebuilt the set for every single row.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the tokens that are not NLTK English stop words, order preserved.

    Matching is exact and case-sensitive against NLTK's 'english' list.
    """
    return [token for token in tokens if token not in _ENGLISH_STOP_WORDS]

# Filter the token lists stored in 'lemmatized_news'.
# NOTE(review): tokens arrive here already lemmatized, so any stop word the
# lemmatizer altered will no longer match the list - confirm this order is intended.
df1_neu['text_no_stopwords'] = df1_neu['lemmatized_news'].apply(remove_stopwords)

In [None]:
# Removal of punctuation function
def remove_punctuation(tokens):
    """Return `tokens` without the entries made up solely of ASCII punctuation.

    A token is dropped when every one of its characters is in string.punctuation
    (empty tokens are dropped too). The previous `token not in string.punctuation`
    was a substring test against the punctuation string, so multi-character
    tokens such as '``', "''", '--' or '...' slipped through.
    """
    punctuation_chars = frozenset(string.punctuation)
    return [token for token in tokens if not punctuation_chars.issuperset(token)]

# Strip punctuation tokens from the stop-word-filtered lists
df1_neu['text_no_punctuation'] = df1_neu['text_no_stopwords'].map(remove_punctuation)

# Glue each token list back into one space-separated string
df1_neu['processed_text'] = df1_neu['text_no_punctuation'].map(' '.join)

In [None]:
# TFIDF Vectorization
def text_representation(data):
    """Fit a TfidfVectorizer on data['processed_text'] and return the dense matrix.

    Side effects: data['processed_text'] is overwritten in place with each
    document reduced to its distinct whitespace-separated tokens, and the
    matrix shape and feature names are printed.

    Returns:
        (X_tfidf, vectorizer): a DataFrame with one column per vocabulary term,
        and the fitted TfidfVectorizer.

    NOTE(review): the vectorizer is fitted on every row of `data`; if the caller
    splits train/test afterwards, the vocabulary and IDF weights have seen the
    test rows.
    """
    tfidf_vect = TfidfVectorizer()

    # Keep one copy of each token per document. dict.fromkeys preserves first-seen
    # order, so the stored text is identical on every run (set() order varies with
    # string hash randomisation). str() also covers non-string entries such as NaN,
    # which makes a separate float/int conversion pass unnecessary.
    data['processed_text'] = data['processed_text'].apply(
        lambda text: " ".join(dict.fromkeys(str(text).split()))
    )
    X_tfidf = tfidf_vect.fit_transform(data['processed_text'])

    feature_names = tfidf_vect.get_feature_names_out()
    print(X_tfidf.shape)
    print(feature_names)

    # Densify the sparse matrix into a DataFrame with one column per term
    X_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)

    return X_tfidf, tfidf_vect

# Apply the text_representation function
X_tfidf, tfidf_vect = text_representation(df1_neu)

(9733, 14287)
['00' '000' '000063' ... 'ætehuolto' 'ðl' 'ˆeur']


### Model Fitting: Scenario 3

In [None]:
# Split the TF-IDF features and labels into training (70%) and testing (30%) sets;
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df1_neu['label'], test_size=0.3, random_state=42)

#### Logistic Regression

In [None]:
# Initialize a Logistic Regression classifier.
# The target here is binary (0/1), so no multiclass strategy is required; the
# `multi_class` argument is deprecated in recent scikit-learn releases.
log_r_classifier_neu = LogisticRegression()

# Fit the classifier on the training data
log_r_classifier_neu.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_r_classifier_neu.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.75
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.91      0.82      1778
           1       0.79      0.50      0.61      1142

    accuracy                           0.75      2920
   macro avg       0.76      0.71      0.72      2920
weighted avg       0.76      0.75      0.74      2920



#### Multinomial Naive Bayes classifier

In [None]:
# Train a Multinomial Naive Bayes classifier (fit() returns the estimator itself)
classifier_nb_neu = MultinomialNB().fit(X_train, y_train)

# Score the held-out split
y_pred = classifier_nb_neu.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

#### Random Forest classifier

In [None]:
# Train a 100-tree Random Forest with a fixed seed (fit() returns the estimator itself)
classifier_rf_neu = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split
y_pred = classifier_rf_neu.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

#### Gradient Boosting Classifier

In [None]:
# Train a Gradient Boosting classifier (fit() returns the estimator itself)
classifier_gb_neu = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split
y_pred = classifier_gb_neu.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

#### Support Vector Machine Classifier

In [None]:
# Train a Support Vector classifier with default settings (fit() returns the estimator itself)
classifier_svc_neu = SVC().fit(X_train, y_train)

# Score the held-out split
y_pred = classifier_svc_neu.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy, then per-class precision / recall / F1
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

## Objective 2: Sentiments Based Stocks Classification

In [None]:
# Loading the S&P 500 dataframe
# NOTE(review): absolute Google Drive path — it only resolves after the
# drive.mount('/content/drive') cell at the top of the notebook has run.
df_new = pd.read_csv('/content/drive/MyDrive/NLP/HS/New_Data/s&p_1.csv')

In [None]:
# Processing DF_NEW

In [None]:
# Run the headlines through the same cleaning stages, keeping one column per stage:
# tokenize -> lemmatize -> drop stop words -> drop punctuation -> re-join into a string
df_new['tokenized_news'] = df_new['headline_text'].map(tokenize_text)
df_new['lemmatized_news'] = df_new['tokenized_news'].map(lemmatize_text)
df_new['text_no_stopwords'] = df_new['lemmatized_news'].map(remove_stopwords)
df_new['text_no_punctuation'] = df_new['text_no_stopwords'].map(remove_punctuation)
df_new['processed_text'] = df_new['text_no_punctuation'].map(' '.join)

In [None]:
# Price change for the row: close minus open
df_new['Difference'] = df_new['Close'].sub(df_new['Open'])

# MinMaxScaler expects a 2-D array, so reshape the column to (n_rows, 1)
difference_values = df_new['Difference'].to_numpy().reshape(-1, 1)

# Rescale the differences linearly into the range [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_difference_values = scaler.fit_transform(difference_values)

# Store the rescaled values as a flat column
df_new['Scaled_Difference'] = scaled_difference_values.ravel()

# Preview the result
df_new.head()

Unnamed: 0,publish_date,headline_text,Open,High,Low,Close,intraday_movement,tokenized_news,lemmatized_news,text_no_stopwords,text_no_punctuation,processed_text,Difference,Scaled_Difference
0,2019-01-02,adelaide man arrested over police officer hit ...,2476.959961,2519.48999,2467.469971,2510.030029,1,"[adelaide, man, arrested, over, police, office...","[adelaide, man, arrested, over, police, office...","[adelaide, man, arrested, police, officer, hit...","[adelaide, man, arrested, police, officer, hit...",adelaide man arrested police officer hit run,33.070068,0.258645
1,2019-01-02,aged care watchdog formed promises improvement...,2476.959961,2519.48999,2467.469971,2510.030029,1,"[aged, care, watchdog, formed, promises, impro...","[aged, care, watchdog, formed, promise, improv...","[aged, care, watchdog, formed, promise, improv...","[aged, care, watchdog, formed, promise, improv...",aged care watchdog formed promise improvement ...,33.070068,0.258645
2,2019-01-02,american detained in russia on spying charge i...,2476.959961,2519.48999,2467.469971,2510.030029,1,"[american, detained, in, russia, on, spying, c...","[american, detained, in, russia, on, spying, c...","[american, detained, russia, spying, charge, i...","[american, detained, russia, spying, charge, i...",american detained russia spying charge innocen...,33.070068,0.258645
3,2019-01-02,australia must be ready for cave rescues exper...,2476.959961,2519.48999,2467.469971,2510.030029,1,"[australia, must, be, ready, for, cave, rescue...","[australia, must, be, ready, for, cave, rescue...","[australia, must, ready, cave, rescue, expert,...","[australia, must, ready, cave, rescue, expert,...",australia must ready cave rescue expert warn,33.070068,0.258645
4,2019-01-02,australian housing prices fall 4.8pc weakest s...,2476.959961,2519.48999,2467.469971,2510.030029,1,"[australian, housing, prices, fall, 4.8pc, wea...","[australian, housing, price, fall, 4.8pc, weak...","[australian, housing, price, fall, 4.8pc, weak...","[australian, housing, price, fall, 4.8pc, weak...",australian housing price fall 4.8pc weakest si...,33.070068,0.258645


### Building the final prediction model

In [None]:
# Each classifier must receive features from the vectorizer it was trained with.
# NOTE(review): `tfidf_vect_pos`, `classifier_rf_pos` and `log_r_classifier_neg` are
# not defined in the cells shown around here, and this cell raised NameError on its
# last run — confirm the Scenario 1/2 cells define them, and that the negative model
# really shares the positive scenario's vectorizer.
X_tfidf_new = tfidf_vect_pos.transform(df_new['processed_text'])

# Per-headline predictions: Random Forest (positive), Logistic Regression (negative)
df_new['headline_predict_pos'] = classifier_rf_pos.predict(X_tfidf_new)
df_new['headline_predict_neg'] = log_r_classifier_neg.predict(X_tfidf_new)

# The neutral Naive Bayes model was fitted on features produced by `tfidf_vect`
# (Scenario 3), so it must be given features from that same vectorizer.
X_tfidf_new_neu = tfidf_vect.transform(df_new['processed_text'])
df_new['headline_predict_neu'] = classifier_nb_neu.predict(X_tfidf_new_neu)

NameError: ignored

In [None]:
# Examining the Pre-Processed Stocks Data (first rows only, to keep the output small)
df_new.head()

### PREDICTING INTRA-DAY MOVEMENT

In [None]:
# Selecting the desired columns: the date, the three per-headline sentiment
# predictions, and the two market targets
df2 = df_new[['publish_date', 'headline_predict_pos','headline_predict_neu','headline_predict_neg','intraday_movement','Scaled_Difference']]

# Displaying the new DataFrame
df2.head()

In [None]:
# Model 1: Positive Vs Rest against Intraday Movement
# .copy() yields an independent frame, so later column assignments on it do not
# trigger pandas' SettingWithCopyWarning.
df2_pos = df2[['publish_date', 'headline_predict_pos', 'intraday_movement', 'Scaled_Difference']].copy()
df2_pos.head()

In [None]:
# Model 2: Negative Vs Rest against Intraday Movement
# .copy() yields an independent frame, so later column assignments on it do not
# trigger pandas' SettingWithCopyWarning.
df2_neg = df2[['publish_date', 'headline_predict_neg', 'intraday_movement', 'Scaled_Difference']].copy()
df2_neg.head()

In [None]:
# Model 3: Neutral Vs Rest against Intraday Movement
# .copy() yields an independent frame, so later column assignments on it do not
# trigger pandas' SettingWithCopyWarning.
df2_neu = df2[['publish_date', 'headline_predict_neu', 'intraday_movement', 'Scaled_Difference']].copy()
df2_neu.head()

### **Aggregating Sentiment Per Day (Positive vs Rest Scenario)**

In [None]:
# Parse the dates into a new frame rather than assigning into a frame sliced
# from df2 (avoids SettingWithCopyWarning and keeps the cell re-runnable).
df2_pos = df2_pos.assign(publish_date=pd.to_datetime(df2_pos['publish_date']))

# Grouping by 'publish_date' and averaging each column per day
aggregated_data_pos = df2_pos.groupby('publish_date').agg({
    'headline_predict_pos': 'mean',
    'intraday_movement': 'mean',
    'Scaled_Difference': 'mean',
}).reset_index()

In [None]:
# Examining the daily aggregated DataFrame
aggregated_data_pos.head()

In [None]:
# Split the data into features (X) and target variable (y):
# daily mean positive prediction -> daily intraday_movement
X = aggregated_data_pos[['headline_predict_pos']]
y = aggregated_data_pos['intraday_movement']

# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### **Model 1: Logistic Regression**

In [None]:
# Fit a logistic regression on the training split (fit() returns the estimator itself)
model_lr = LogisticRegression().fit(X_train, y_train)

# Predict the held-out split
y_pred = model_lr.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')

# Confusion matrix of actual vs predicted labels
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.91      0.82      1778
           1       0.79      0.50      0.61      1142

    accuracy                           0.75      2920
   macro avg       0.76      0.71      0.72      2920
weighted avg       0.76      0.75      0.74      2920

Confusion Matrix:
[[1621  157]
 [ 568  574]]


### **Model 2: Random Forest Classifier**

In [None]:
# Fit a 100-tree Random Forest with a fixed seed (fit() returns the estimator itself)
model_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split
y_pred_rf = model_rf.predict(X_test)

# Overall accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy (Random Forest): {accuracy_rf:.2f}')

# Per-class precision / recall / F1
print('Classification Report (Random Forest):', classification_report(y_test, y_pred_rf), sep='\n')

# Confusion matrix of actual vs predicted labels
print('Confusion Matrix (Random Forest):', confusion_matrix(y_test, y_pred_rf), sep='\n')

Accuracy (Random Forest): 0.74
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.77      0.80      0.79      1778
           1       0.67      0.64      0.65      1142

    accuracy                           0.74      2920
   macro avg       0.72      0.72      0.72      2920
weighted avg       0.73      0.74      0.73      2920

Confusion Matrix (Random Forest):
[[1422  356]
 [ 414  728]]


### **Model 3: Gradient Boosting Classifier**

In [None]:
# Fit a Gradient Boosting classifier with a fixed seed (fit() returns the estimator itself)
model_gb = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split
y_pred_gb = model_gb.predict(X_test)

# Overall accuracy
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f'Accuracy (Gradient Boosting): {accuracy_gb:.2f}')

# Per-class precision / recall / F1
print('Classification Report (Gradient Boosting):', classification_report(y_test, y_pred_gb), sep='\n')

# Confusion matrix of actual vs predicted labels
print('Confusion Matrix (Gradient Boosting):', confusion_matrix(y_test, y_pred_gb), sep='\n')

Accuracy (Gradient Boosting): 0.69
Classification Report (Gradient Boosting):
              precision    recall  f1-score   support

           0       0.68      0.94      0.79      1778
           1       0.77      0.31      0.44      1142

    accuracy                           0.69      2920
   macro avg       0.73      0.63      0.62      2920
weighted avg       0.72      0.69      0.65      2920

Confusion Matrix (Gradient Boosting):
[[1672  106]
 [ 787  355]]


### **Linear Regression & checking the statistical significance of variables**

In [None]:
# Extract features and target variable
X = aggregated_data_pos[['headline_predict_pos']]
y = aggregated_data_pos['Scaled_Difference']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (statistics are learned from the training split only).
# Wrapping the arrays back into DataFrames keeps the column name, so the OLS
# summary reports 'headline_predict_pos' instead of an anonymous 'x1'.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Add a constant to the features (for intercept term). has_constant='add' forces
# the column: the default 'skip' omits it when a column already happens to be
# constant, which would leave train and test with different widths.
X_train_scaled = sm.add_constant(X_train_scaled, has_constant='add')
X_test_scaled = sm.add_constant(X_test_scaled, has_constant='add')

# Fit the linear regression model using statsmodels
model = sm.OLS(y_train, X_train_scaled)
results = model.fit()

# Print summary statistics
print(results.summary())

# Make predictions on the test set
y_pred = results.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

NameError: ignored

Conclusion:
headline_predict_pos: the p-value for the coefficient of headline_predict_pos is 0.084. A p-value greater than 0.05 indicates that this coefficient is not statistically significant at the 0.05 significance level.

### **Model 2: Negative Vs Rest**

In [None]:
# Parse the dates into a new frame rather than assigning into a frame sliced
# from df2 (avoids SettingWithCopyWarning and keeps the cell re-runnable).
df2_neg = df2_neg.assign(publish_date=pd.to_datetime(df2_neg['publish_date']))

# Grouping by 'publish_date' and averaging each column per day
aggregated_data_neg = df2_neg.groupby('publish_date').agg({
    'headline_predict_neg': 'mean',
    'intraday_movement': 'mean',
    'Scaled_Difference': 'mean',
}).reset_index()

In [None]:
# Examining the daily aggregated DataFrame
aggregated_data_neg.head()

In [None]:
# Split the data into features (X) and target variable (y):
# daily mean negative prediction -> daily intraday_movement
X = aggregated_data_neg[['headline_predict_neg']]
y = aggregated_data_neg['intraday_movement']

# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Model 1: Logistic Regression

In [None]:
# Fit a logistic regression on the training split (fit() returns the estimator itself)
model_lr = LogisticRegression().fit(X_train, y_train)

# Predict the held-out split
y_pred = model_lr.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')

# Confusion matrix of actual vs predicted labels
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

### Model 2: Random Forest Classifier

In [None]:
# Fit a 100-tree Random Forest with a fixed seed (fit() returns the estimator itself)
model_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split
y_pred_rf = model_rf.predict(X_test)

# Overall accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy (Random Forest): {accuracy_rf:.2f}')

# Per-class precision / recall / F1
print('Classification Report (Random Forest):', classification_report(y_test, y_pred_rf), sep='\n')

# Confusion matrix of actual vs predicted labels
print('Confusion Matrix (Random Forest):', confusion_matrix(y_test, y_pred_rf), sep='\n')

### **Model 3: Gradient Boosting**

In [None]:
# Fit a Gradient Boosting classifier with a fixed seed (fit() returns the estimator itself)
model_gb = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split
y_pred_gb = model_gb.predict(X_test)

# Overall accuracy
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f'Accuracy (Gradient Boosting): {accuracy_gb:.2f}')

# Per-class precision / recall / F1
print('Classification Report (Gradient Boosting):', classification_report(y_test, y_pred_gb), sep='\n')

# Confusion matrix of actual vs predicted labels
print('Confusion Matrix (Gradient Boosting):', confusion_matrix(y_test, y_pred_gb), sep='\n')

### **Linear Regression & checking the significance of variables**

In [None]:
# Extract features and target variable
X = aggregated_data_neg[['headline_predict_neg']]
y = aggregated_data_neg['Scaled_Difference']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (statistics are learned from the training split only).
# Wrapping the arrays back into DataFrames keeps the column name, so the OLS
# summary reports 'headline_predict_neg' instead of an anonymous 'x1'.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Add a constant to the features (for intercept term). has_constant='add' forces
# the column: the default 'skip' omits it when a column already happens to be
# constant, which would leave train and test with different widths.
X_train_scaled = sm.add_constant(X_train_scaled, has_constant='add')
X_test_scaled = sm.add_constant(X_test_scaled, has_constant='add')

# Fit the linear regression model using statsmodels
model = sm.OLS(y_train, X_train_scaled)
results = model.fit()

# Print summary statistics
print(results.summary())

# Make predictions on the test set
y_pred = results.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Conclusion:

headline_predict_neg: 0.508 is the p-value for the coefficient of headline_predict_neg. This p-value is greater than 0.05, suggesting that this coefficient is not statistically significant.


### **Model 3: Neutral Vs Rest**

In [None]:
# Convert to DateTime on a new frame rather than assigning into a frame sliced
# from df2 (avoids SettingWithCopyWarning and keeps the cell re-runnable).
df2_neu = df2_neu.assign(publish_date=pd.to_datetime(df2_neu['publish_date']))

# Grouping by 'publish_date' and averaging each column per day
aggregated_data_neu = df2_neu.groupby('publish_date').agg({
    'headline_predict_neu': 'mean',
    'intraday_movement': 'mean',
    'Scaled_Difference': 'mean',
}).reset_index()

In [None]:
# Examining the daily aggregated DataFrame
aggregated_data_neu.head()

In [None]:
# Split the data into features (X) and target variable (y):
# daily mean neutral prediction -> daily intraday_movement
X = aggregated_data_neu[['headline_predict_neu']]
y = aggregated_data_neu['intraday_movement']

# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### **Model 1: Logistic Regression**

In [None]:
# Fit a logistic regression on the training split (fit() returns the estimator itself)
model_lr = LogisticRegression().fit(X_train, y_train)

# Predict the held-out split
y_pred = model_lr.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')

# Confusion matrix of actual vs predicted labels
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

### **Model 2: Random Forest Classifier**

In [None]:
# Fit a 100-tree Random Forest with a fixed seed (fit() returns the estimator itself)
model_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split
y_pred_rf = model_rf.predict(X_test)

# Overall accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Accuracy (Random Forest): {accuracy_rf:.2f}')

# Per-class precision / recall / F1
print('Classification Report (Random Forest):', classification_report(y_test, y_pred_rf), sep='\n')

# Confusion matrix of actual vs predicted labels
print('Confusion Matrix (Random Forest):', confusion_matrix(y_test, y_pred_rf), sep='\n')

### **Model 3: Gradient Boosting**

In [None]:
# Fit a Gradient Boosting classifier with a fixed seed (fit() returns the estimator itself)
model_gb = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split
y_pred_gb = model_gb.predict(X_test)

# Overall accuracy
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f'Accuracy (Gradient Boosting): {accuracy_gb:.2f}')

# Per-class precision / recall / F1
print('Classification Report (Gradient Boosting):', classification_report(y_test, y_pred_gb), sep='\n')

# Confusion matrix of actual vs predicted labels
print('Confusion Matrix (Gradient Boosting):', confusion_matrix(y_test, y_pred_gb), sep='\n')

### **Linear Regression & checking the significance of variables**

In [None]:
# Extract features and target variable
X = aggregated_data_neu[['headline_predict_neu']]
y = aggregated_data_neu['Scaled_Difference']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (statistics are learned from the training split only).
# Wrapping the arrays back into DataFrames keeps the column name, so the OLS
# summary reports 'headline_predict_neu' instead of an anonymous 'x1'.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Add a constant to the features (for intercept term). has_constant='add' forces
# the column: the default 'skip' omits it when a column already happens to be
# constant, which would leave train and test with different widths.
X_train_scaled = sm.add_constant(X_train_scaled, has_constant='add')
X_test_scaled = sm.add_constant(X_test_scaled, has_constant='add')

# Fit the linear regression model using statsmodels
model = sm.OLS(y_train, X_train_scaled)
results = model.fit()

# Print summary statistics
print(results.summary())

# Make predictions on the test set
y_pred = results.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Conclusion:

headline_predict_neu: 0.399 is the p-value for the coefficient of headline_predict_neu. This p-value is also greater than 0.05, indicating that this coefficient is not statistically significant.

## **Approach # 2: Measuring Stock Sentiment Strength using NLTK Vader**

In [None]:
# Create a new dataset by selecting specific columns; .copy() makes df_vad
# independent of df_new so columns can be added to it safely
selected_columns = ['publish_date', 'headline_text', 'intraday_movement', 'processed_text', 'Scaled_Difference']
df_vad = df_new[selected_columns].copy()

# Display the new DataFrame
df_vad.head()

Unnamed: 0,publish_date,headline_text,intraday_movement,processed_text,Scaled_Difference
0,2019-01-02,adelaide man arrested over police officer hit ...,1,adelaide man arrested police officer hit run,0.258645
1,2019-01-02,aged care watchdog formed promises improvement...,1,aged care watchdog formed promise improvement ...,0.258645
2,2019-01-02,american detained in russia on spying charge i...,1,american detained russia spying charge innocen...,0.258645
3,2019-01-02,australia must be ready for cave rescues exper...,1,australia must ready cave rescue expert warn,0.258645
4,2019-01-02,australian housing prices fall 4.8pc weakest s...,1,australian housing price fall 4.8pc weakest si...,0.258645


In [None]:
# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Score the original headline rather than 'processed_text': stop-word removal
# strips negators such as "not" and "no", which VADER's rules use to flip
# polarity. str() guards against missing (NaN) headlines.
df_vad['compound'] = df_vad['headline_text'].apply(lambda x: sid.polarity_scores(str(x))['compound'])

# Labeling based on compound score: 1 when the score is non-negative
# (this includes exactly-neutral 0.0), 0 when it is negative
df_vad['sentiment_label'] = (df_vad['compound'] >= 0).astype(int)

# Display the updated DataFrame
df_vad.head()

Unnamed: 0,publish_date,headline_text,intraday_movement,processed_text,Scaled_Difference,compound,sentiment_label
0,2019-01-02,adelaide man arrested over police officer hit ...,1,adelaide man arrested police officer hit run,0.258645,-0.4767,0
1,2019-01-02,aged care watchdog formed promises improvement...,1,aged care watchdog formed promise improvement ...,0.258645,0.8176,1
2,2019-01-02,american detained in russia on spying charge i...,1,american detained russia spying charge innocen...,0.258645,-0.0772,0
3,2019-01-02,australia must be ready for cave rescues exper...,1,australia must ready cave rescue expert warn,0.258645,0.6597,1
4,2019-01-02,australian housing prices fall 4.8pc weakest s...,1,australian housing price fall 4.8pc weakest si...,0.258645,-0.5106,0


In [None]:
### Aggregating Sentiment Per Day

# One row per publish_date holding the daily mean of each sentiment / market column
agg_df = (
    df_vad
    .groupby('publish_date')[
        ['compound', 'sentiment_label', 'intraday_movement', 'Scaled_Difference']
    ]
    .mean()
    .reset_index()
)

In [None]:
# Convert the daily mean 'sentiment_label' to a binary variable: 1 when more than
# 65% of the day's headlines were labelled 1, otherwise 0.
# NOTE(review): 0.65 is a hard-coded cut-off with no derivation shown here —
# confirm how it was chosen or move it to a named constant.
agg_df['binary_sentiment_label'] = (agg_df['sentiment_label'] > 0.65).astype(int)

# Display the aggregated DataFrame
agg_df.head()

Unnamed: 0,publish_date,compound,sentiment_label,intraday_movement,Scaled_Difference,binary_sentiment_label
0,2019-01-02,-0.066678,0.626866,1.0,0.258645,0
1,2019-01-03,-0.266311,0.438356,0.0,-0.270798,0
2,2019-01-04,-0.113891,0.620253,1.0,0.427158,0
3,2019-01-07,-0.040275,0.641791,1.0,0.12824,0
4,2019-01-08,-0.129704,0.595506,1.0,0.074815,0


### **Running a logistic regression model**

In [None]:
# Target: daily intraday_movement; feature: thresholded daily sentiment plus an intercept column
y = agg_df['intraday_movement']
X = sm.add_constant(agg_df['binary_sentiment_label'])

# Logistic regression via statsmodels, so coefficient p-values are available
model = sm.Logit(y, X)
result = model.fit()

# Coefficients, standard errors and p-values
print(result.summary())

Interpretation: The logistic regression results indicate that the binary_sentiment_label variable (the thresholded daily mean of sentiment_label) does not have a statistically significant impact on the odds of intraday_movement. The coefficient for binary_sentiment_label is positive, suggesting a positive association, but the associated p-value is not significant, indicating that this association might be due to chance.

### **Running a Linear Regression Model**

In [None]:
# Target: daily Scaled_Difference; feature: daily mean sentiment_label plus an intercept column
y = agg_df['Scaled_Difference']
X = sm.add_constant(agg_df['sentiment_label'])

# Ordinary least squares via statsmodels, so coefficient p-values are available
model = sm.OLS(y, X)
result = model.fit()

# Coefficients, standard errors and p-values
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:      Scaled_Difference   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     2.688
Date:                Wed, 13 Dec 2023   Prob (F-statistic):              0.102
Time:                        03:37:54   Log-Likelihood:                 116.72
No. Observations:                 755   AIC:                            -229.4
Df Residuals:                     753   BIC:                            -220.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -0.0648      0.065     

Interpretation:
The linear regression results indicate that sentiment_label does not have a statistically significant impact on Scaled_Difference. The coefficient for sentiment_label is positive, suggesting a positive association, but the associated p-value is not significant, indicating that this association might be due to chance.