In [122]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [123]:
df = pd.read_csv('Cleaned_data/combined_df.csv')
df.head()

Unnamed: 0,Sentence,Sentiment
0,"Mid-cap funds can deliver more, stay put: Experts",1
1,Mid caps now turn into market darlings,1
2,Hudco raises Rs 279 cr via tax-free bonds,1
3,"EXL beats profit estimates, cuts sales outlook",1
4,"Would stick to banking: Girish Pai, Centrum Br...",1


In [124]:
df.shape

(15962, 2)

In [125]:
# There are approx. 3500 more positive than negative articles which is a problem
df['Sentiment'].value_counts()

1    9761
0    6201
Name: Sentiment, dtype: int64

In [126]:
# removing non-alphanumeric characters since not needed

non_alphanum = [',','.','/','"',':',';','!','@','#','$','%',"'","*","(",")","&","--"]
for char in non_alphanum:
  df['Sentence'] = df['Sentence'].str.replace(char,"")

df['Sentence'] = df['Sentence'].str.replace(" s "," ")
df['Sentence'] = df['Sentence'].str.replace(" '","'")
df['Sentence'] = df['Sentence'].str.replace("  "," ")
df['Sentence'] = df['Sentence'].str.replace("   "," ")
df['Sentence'] = df['Sentence'].str.lower()

  df['Sentence'] = df['Sentence'].str.replace(char,"")


In [127]:
# splitting into training and testing

from sklearn.model_selection import train_test_split

X = df['Sentence'].to_numpy().reshape(-1, 1)
y = df['Sentiment'].to_numpy().reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.15, stratify=y)

In [128]:
# putting each sentence and sentiment from the training and testing dataframes into lists

training_sentences=[]
testing_sentences=[]
training_labels=[]
testing_labels=[]

for i in X_train:
  training_sentences.append(i[0])
for i in y_train:
  training_labels.append(i[0])
for i in X_test:
  testing_sentences.append(i[0])
for i in y_test:
  testing_labels.append(i[0])


In [129]:
# Use the TfidfVectorizer tool to convert sentences into numerical vectors.
vectorizer = TfidfVectorizer(
    max_df=0.8,
    min_df=3,
)

# Train the vectorizer on the training data / convert that data into vectors.
X_train_vec = vectorizer.fit_transform(training_sentences)

# Use the trained vectorizer to convert the testing data into vectors
X_test_vec = vectorizer.transform(testing_sentences)

In [130]:
# Initialise and train the logistic regression model
model = LogisticRegression(max_iter=2000)
model.fit(X_train_vec, training_labels)

In [131]:
# Predict on training data to compare both train and test accuracy 
y_train_pred = model.predict(X_train_vec)
train_accuracy = accuracy_score(training_labels, y_train_pred)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")

# Evaluate the model
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(testing_labels, y_pred)
print(f"Val_Accuracy: {accuracy * 100:.2f}%")

Training Accuracy: 89.19%
Val_Accuracy: 85.89%


### Testing new headlines...

In [132]:
# Function to preprocess new test headlines
def preprocess_text(text):
    non_alphanum = [',','.','/','"',':',';','!','@','#','$','%',"'","*","(",")","&","--"]
    for char in non_alphanum:
        text = text.replace(char, "")
    text = text.replace(" s ", " ")
    text = text.replace(" '", "'")
    text = text.replace("  ", " ")
    text = text.replace("   ", " ")
    text = text.lower()
    return text

# New headlines to test
test_headlines = ["tesla stock decreases", 
                  "energy costs skyrocket",
                  "",
                  "Fortescue makes slow start on push for fifth iron ore export record",
                  "Microsoft embroiled in privacy concerns",
                  "Webcentral to divest assets for $165m; MacCap on ticket",
                  "Crown’s multibillion-dollar expense bill wipes casino profits",
                  "Pilbara Minerals shelves special dividend as lithium price dips",
                  "Critical minerals projects ‘at risk from Labor environment plan’",
                  "Lithium hopeful strikes $1.63b sale to Chile’s SQM"]

# Preprocess the new test  headlines
test_headlines_processed = [preprocess_text(headline) for headline in test_headlines]

# Vectorize the preprocessed headlines
test_headlines_vec = vectorizer.transform(test_headlines_processed)

# Predict using the logistic regression model
test_predictions = model.predict(test_headlines_vec)

# Display the predictions
for headline, prediction in zip(test_headlines, test_predictions):
    print(f"Headline: '{headline}' -> Prediction: {prediction}")


Headline: 'tesla stock decreases' -> Prediction: 1
Headline: 'energy costs skyrocket' -> Prediction: 0
Headline: '' -> Prediction: 1
Headline: 'Fortescue makes slow start on push for fifth iron ore export record' -> Prediction: 1
Headline: 'Microsoft embroiled in privacy concerns' -> Prediction: 0
Headline: 'Webcentral to divest assets for $165m; MacCap on ticket' -> Prediction: 0
Headline: 'Crown’s multibillion-dollar expense bill wipes casino profits' -> Prediction: 0
Headline: 'Pilbara Minerals shelves special dividend as lithium price dips' -> Prediction: 0
Headline: 'Critical minerals projects ‘at risk from Labor environment plan’' -> Prediction: 1
Headline: 'Lithium hopeful strikes $1.63b sale to Chile’s SQM' -> Prediction: 1
