In [1]:
import pandas as pd
df=pd.read_csv('data.csv')

In [2]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nandiniupadhyay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nandiniupadhyay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nandiniupadhyay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
titles=df['title'].tolist()
cleaned_titles=[]
# Define the lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Preprocessing function
for text in titles:
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
      
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords and lemmatize each word
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Join tokens back into a single string
    preprocessed_text = " ".join(tokens)
    
    cleaned_titles.append(preprocessed_text)

# Apply preprocessing to each title in the list

# Add the cleaned titles back to the DataFrame
df['cleaned_title'] = cleaned_titles

# Display the DataFrame
df.head()

Unnamed: 0,ticker,date,time,title,cleaned_title
0,AMZN,Oct-29-24,12:44PM,Ray Wang on Amazon.com Inc (NASDAQ:AMZN): Stro...,ray wang amazoncom inc nasdaqamzn strong funda...
1,AMZN,Oct-29-24,12:06PM,Duck Capital calls for 'significant' capital r...,duck capital call significant capital return a...
2,AMZN,Oct-29-24,12:00PM,Is an earnings beat enough for Big Tech invest...,earnings beat enough big tech investor
3,AMZN,Oct-29-24,11:37AM,Amazon pilots 'Rufus' generative AI shopping a...,amazon pilot rufus generative ai shopping assi...
4,AMZN,Oct-29-24,11:16AM,"Do Amazon, Alphabet, and Apple Have an AI Spen...",amazon alphabet apple ai spending problem mean...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Apply TF-IDF to the 'cleaned_title' column
tfidf_matrix = vectorizer.fit_transform(df['cleaned_title'])

# Convert the TF-IDF matrix to a DataFrame for easier readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display the TF-IDF DataFrame
print("TF-IDF Scores:\n", tfidf_df)

TF-IDF Scores:
      aapl  aapls  accelerating  access  according  account  accusing  action  \
0     0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   
1     0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   
2     0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   
3     0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   
4     0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   
..    ...    ...           ...     ...        ...      ...       ...     ...   
495   0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   
496   0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   
497   0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   
498   0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   
499   0.0    0.0           0.0     0.0        0.0      0.0       0.0     0.0   

     actually   ad  ...

In [4]:

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

# Function to assign sentiment labels
def assign_sentiment(text):
    score = sia.polarity_scores(text)['compound']
    if score >= 0:
        return 'positive'
    else:
        return 'negative'
    

# Apply the function to the 'cleaned_title' column
df['sentiment_label'] = df['cleaned_title'].apply(assign_sentiment)

# Display the DataFrame with sentiment labels
print(df[['cleaned_title', 'sentiment_label']])


                                         cleaned_title sentiment_label
0    ray wang amazoncom inc nasdaqamzn strong funda...        positive
1    duck capital call significant capital return a...        positive
2               earnings beat enough big tech investor        positive
3    amazon pilot rufus generative ai shopping assi...        positive
4    amazon alphabet apple ai spending problem mean...        negative
..                                                 ...             ...
495  microsoft corporation msft gave back first hal...        positive
496  tesla stock tap brake still rising magnificent...        positive
497  betting bitcoin microsofts shareholder decide ...        positive
498     colgatepalmolive centene microsoft stock focus        positive
499  microsoft ceo satya nadella asked pay cut stil...        negative

[500 rows x 2 columns]


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/nandiniupadhyay/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['title'])  # Features (TF-IDF vectors)
y = df['sentiment_label']  # Target labels (positive, negative)

# Step 2: Split Data into Train and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the SVM Model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Step 4: Predict and Evaluate
y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.93


In [6]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

    negative       1.00      0.30      0.46        10
    positive       0.93      1.00      0.96        90

    accuracy                           0.93       100
   macro avg       0.96      0.65      0.71       100
weighted avg       0.94      0.93      0.91       100



In [7]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 4: Predict and Evaluate
y_pred = dt_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.94


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

label_mapping = {'positive': 1, 'negative': 0}
df['label'] = df['sentiment_label'].map(label_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(df['title'], df['label'], test_size=0.2, random_state=42)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences of tokens
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure consistent input size
max_length = 50  # max length for sequences
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Define RNN model
rnn_model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_length),
    SimpleRNN(64),
    Dense(2, activation='softmax')  # Output layer for 2 classes: positive, negative
])

# Compile the model
rnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
rnn_model.fit(X_train_padded, y_train, epochs=10, batch_size=16, validation_data=(X_test_padded, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2860a25d0>

In [18]:
# Evaluate RNN
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test_padded, y_test)
print("RNN Test Accuracy:", rnn_accuracy)


RNN Test Accuracy: 0.8899999856948853


In [16]:
from tensorflow.keras.layers import LSTM

# Define LSTM model
lstm_model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=max_length),
    LSTM(64),
    Dense(2, activation='softmax')  # Output layer for 2 classes
])

# Compile the model
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
lstm_model.fit(X_train_padded, y_train, epochs=7, batch_size=16, validation_data=(X_test_padded, y_test))


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.src.callbacks.History at 0x28c8aded0>

In [17]:
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_padded, y_test)
print("LSTM Test Accuracy:", lstm_accuracy)

LSTM Test Accuracy: 0.9100000262260437
