In [None]:
import os
import re
import json
import spacy
import nltk
from nltk.corpus import stopwords
from string import punctuation


# spaCy English pipeline; preprocess_text() below uses it to tokenise and lemmatise.
nlp = spacy.load("en_core_web_sm")


# Make sure the NLTK stopword corpus is available, then keep the English list
# as a set so the per-token membership test in preprocess_text() is cheap.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


# Label tag embedded in each input line -> numeric sentiment written to the output.
SENTIMENT_MAP = {
    "@positive": 1,
    "@neutral": 0,
    "@negative": -1
}


def preprocess_text(text):
    """Turn one labelled line into lemmatised tokens plus a numeric sentiment.

    The first tag from ``SENTIMENT_MAP`` that occurs in ``text`` sets the
    sentiment and is removed from the text. When no tag occurs the sentiment
    stays at 0. The remaining text is lower-cased, passed through the spaCy
    pipeline and reduced to the lemmas of tokens that are not stopwords,
    punctuation, whitespace or digits.

    Args:
        text: One raw input line (sentence text containing a label tag).

    Returns:
        dict with keys ``"sentence"`` (list of lemma strings) and
        ``"sentiment"`` (value from ``SENTIMENT_MAP``, or 0 if no tag matched).
    """
    sentiment = 0
    for label, value in SENTIMENT_MAP.items():
        if label in text:
            sentiment = value
            text = text.replace(label, '')
            break

    doc = nlp(text.lower())

    tokens = [
        token.lemma_ for token in doc
        # `punctuation` is a plain string, so `token.text not in punctuation`
        # is a substring test: multi-character tokens such as "..." or "--"
        # pass it. `token.is_punct` closes that gap.
        if token.text not in stop_words
        and token.text not in punctuation
        and not token.is_punct
        and not token.is_space
        and not token.is_digit
    ]

    return {"sentence": tokens, "sentiment": sentiment}


# Labelled sentence files; every line is passed through preprocess_text().
# NOTE(review): the four agreement-level files may contain overlapping
# sentences; any duplicates are kept here - confirm that this is intended.
file_paths = [
    "Sentences_50Agree.txt",
    "Sentences_66Agree.txt",
    "Sentences_75Agree.txt",
    "Sentences_AllAgree.txt"
]


all_processed_data = []
for file_path in file_paths:
    with open(file_path, "r", encoding="ISO-8859-1") as file:
        # Skip blank lines: they carry neither text nor a label and would
        # otherwise become empty samples with the default sentiment 0.
        sentences = [line.strip() for line in file if line.strip()]

    processed_data = [preprocess_text(sentence) for sentence in sentences]

    all_processed_data.extend(processed_data)


output_json = "bank.json"
with open(output_json, "w", encoding="utf-8") as json_file:
    json.dump(all_processed_data, json_file, indent=4, ensure_ascii=False)

# Report the path that was actually written instead of a hard-coded copy of it.
print(f"Concatenated JSON file saved: {output_json}")


import json
from collections import Counter

# Load the processed sentences written just above ("bank.json").
# The previous file name, "processed_sentences.json", is not produced anywhere
# in this notebook, so this step failed on a fresh run.
with open("bank.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Flatten the per-sentence token lists into one list of tokens
all_tokens = [token for entry in data for token in entry["sentence"]]

# Count token frequencies
token_counts = Counter(all_tokens)

# Get the N least common tokens
N = 20  # Change this to the number of least common tokens you need
# most_common() is sorted by descending count; the reversed slice takes the
# last N entries, rarest first.
least_common_tokens = token_counts.most_common()[:-N-1:-1]

# Print results
print(f"The {N} least common tokens:")
for token, count in least_common_tokens:
    print(f"{token}: {count}")


In [None]:
import pandas as pd
import re
import json
import spacy
import nltk
from nltk.corpus import stopwords
from string import punctuation


# spaCy English pipeline; preprocess_text() below uses it to tokenise and lemmatise.
nlp = spacy.load("en_core_web_sm")


# Make sure the NLTK stopword corpus is available, then keep the English list
# as a set so the per-token membership test in preprocess_text() is cheap.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


# Raw CSV label -> numeric sentiment.
# The `label` column of sent_train.csv / sent_valid.csv holds the integers
# 0/1/2 (the per-file count cell in this notebook maps them with integer keys),
# so the lookup has to be keyed by int. With only the "LABEL_n" string keys no
# row ever matched and every tweet silently fell back to the neutral default.
# The string keys are kept as aliases in case labels arrive in that form.
LABEL_MAP = {
    0: -1,          # bearish
    1: 1,           # bullish
    2: 0,           # neutral
    "LABEL_0": -1,
    "LABEL_1": 1,
    "LABEL_2": 0,
}


def clean_text(text):
    """Drop ``$``-prefixed word tokens and http(s) URLs, then trim the ends.

    Args:
        text: Raw text.

    Returns:
        The text with both patterns removed and outer whitespace stripped
        (inner whitespace is left as-is).
    """
    # Order matters only in that it mirrors the original: cashtags first, URLs second.
    for pattern in (r'\$\w+', r'https?://\S+'):
        text = re.sub(pattern, '', text)
    return text.strip()


def preprocess_text(text, label):
    """Clean and lemmatise one text and pair it with its sentiment label.

    The text is passed through ``clean_text``, lower-cased, run through the
    spaCy pipeline and reduced to the lemmas of tokens that are not stopwords,
    punctuation, whitespace or digits.

    Args:
        text: Raw text.
        label: Sentiment value to store; it is written to the result unchanged.

    Returns:
        dict with keys ``"sentence"`` (list of lemma strings) and
        ``"sentiment"`` (the ``label`` argument).
    """
    text = clean_text(text)
    doc = nlp(text.lower())
    tokens = [
        token.lemma_ for token in doc
        # `punctuation` is a plain string, so `token.text not in punctuation`
        # is a substring test: multi-character tokens such as "..." or "--"
        # pass it. `token.is_punct` closes that gap.
        if token.text not in stop_words
        and token.text not in punctuation
        and not token.is_punct
        and not token.is_space
        and not token.is_digit
    ]
    return {"sentence": tokens, "sentiment": label}


# CSV files to process; each is read for its "text" and "label" columns.
file_paths = [
    "sent_train.csv",
    "sent_valid.csv",

]

output_path = "tweet.json"

all_processed_data = []
# Labels missing from LABEL_MAP still default to neutral (0), but they are
# counted so that a mapping mismatch is reported instead of passing silently.
unknown_label_count = 0

for path in file_paths:
    df = pd.read_csv(path)
    processed = []
    # Iterating the two columns directly avoids building a Series per row.
    for text, raw_label in zip(df["text"], df["label"]):
        if raw_label not in LABEL_MAP:
            unknown_label_count += 1
        processed.append(preprocess_text(text, LABEL_MAP.get(raw_label, 0)))
    all_processed_data.extend(processed)


with open(output_path, "w", encoding="utf-8") as f:
    json.dump(all_processed_data, f, indent=4, ensure_ascii=False)

if unknown_label_count:
    print(f"Warning: {unknown_label_count} rows had a label missing from LABEL_MAP and were stored as neutral (0)")

# The message previously named 'merged_processed_data.json', which is not the file written.
print(f" Merged processed data saved to '{output_path}'")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Merged processed data saved to 'merged_processed_data.json'


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pandas as pd
import re
import json
import spacy
import nltk
from nltk.corpus import stopwords
from string import punctuation
from datasets import load_dataset


# spaCy English pipeline; preprocess_text() below uses it to tokenise and lemmatise.
nlp = spacy.load("en_core_web_sm")


# Make sure the NLTK stopword corpus is available, then keep the English list
# as a set so the per-token membership test in preprocess_text() is cheap.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


# Raw dataset label -> numeric sentiment stored in the output.
# NOTE(review): the class meanings in the comments below are not verified for
# the "TimKoornstra/financial-tweets-sentiment" dataset loaded further down.
# Its dataset card may define them differently (e.g. 0 = neutral, 2 = bearish) -
# TODO confirm against the card before trusting these labels.
LABEL_MAP = {
    0: -1,  # 0 = Bearish
    1: 1,   # 1 = Bullish
    2: 0    # 2 = Neutral
}



def clean_text(text):
    """Remove ``$``-prefixed word tokens and http(s) URLs, then strip the ends.

    Args:
        text: Raw text.

    Returns:
        The cleaned text; whitespace inside the string is not collapsed.
    """
    without_cashtags = re.sub(r'\$\w+', '', text)
    without_urls = re.sub(r'https?://\S+', '', without_cashtags)
    return without_urls.strip()


def preprocess_text(text, label):
    """Clean and lemmatise one text and map its raw label to a sentiment.

    The text is passed through ``clean_text``, lower-cased, run through the
    spaCy pipeline and reduced to the lemmas of tokens that are not stopwords,
    punctuation, whitespace or digits.

    Args:
        text: Raw text.
        label: Raw dataset label; looked up in ``LABEL_MAP``.

    Returns:
        dict with keys ``"sentence"`` (list of lemma strings) and
        ``"sentiment"`` (``LABEL_MAP[label]``, or 0 when the label is unknown).
    """
    text = clean_text(text)
    doc = nlp(text.lower())
    tokens = [
        token.lemma_ for token in doc
        # `punctuation` is a plain string, so `token.text not in punctuation`
        # is a substring test: multi-character tokens such as "..." or "--"
        # pass it. `token.is_punct` closes that gap.
        if token.text not in stop_words
        and token.text not in punctuation
        and not token.is_punct
        and not token.is_space
        and not token.is_digit
    ]
    return {"sentence": tokens, "sentiment": LABEL_MAP.get(label, 0)}  # default Neutral

# Fetch the "train" split from the Hugging Face Hub and convert it to a DataFrame.
dataset = load_dataset("TimKoornstra/financial-tweets-sentiment", split="train")
df = dataset.to_pandas()
# Show which raw label values occur in the data.
print(df["sentiment"].unique())

# Preprocess every (tweet, raw label) pair, in dataset order.
processed_data = []
for tweet, raw_label in zip(df["tweet"], df["sentiment"]):
    processed_data.append(preprocess_text(tweet, raw_label))


with open("hugging.json", "w", encoding="utf-8") as f:
    json.dump(processed_data, f, indent=4, ensure_ascii=False)

print(" Processed Hugging Face data saved to 'hugging.json'")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/38091 [00:00<?, ? examples/s]

[2 1 0]
✅ Processed Hugging Face data saved to 'hugging.json'


In [None]:
import json

# Read the three preprocessed corpora in the order they are concatenated below.
loaded = []
for source_name in ("tweet.json", "bank.json", "hugging.json"):
    with open(source_name, "r", encoding="utf-8") as f:
        loaded.append(json.load(f))

# tweet.json -> tweet_data, bank.json -> stock_data, hugging.json -> hugging_data
tweet_data, stock_data, hugging_data = loaded

# Concatenate all three lists: tweets first, then bank sentences, then the Hugging Face data.
merged_data = [*tweet_data, *stock_data, *hugging_data]

# Write the combined list to a new JSON file.
with open("final.json", "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=4, ensure_ascii=False)

print("final.json")

final.json


In [None]:
# Read the merged corpus ("final.json") written by the merge step.
# This previously opened "final(dupless).json", which is this step's own
# output, so on a fresh run there was nothing to read.
with open("final.json", "r", encoding="utf-8") as f:
    data = json.load(f)


print(f"Original count: {len(data)}")

# Remove duplicates based on both sentence and sentiment; the first occurrence
# of each (tokens, sentiment) pair is kept and the original order is preserved.
seen = set()
unique_data = []
for item in data:
    key = (tuple(item["sentence"]), item["sentiment"])  # lists are unhashable, tuples are not
    if key not in seen:
        seen.add(key)
        unique_data.append(item)

print(f"Unique count: {len(unique_data)}")

with open("final(dupless).json", "w", encoding="utf-8") as f:
    json.dump(unique_data, f, indent=4, ensure_ascii=False)


SyntaxError: incomplete input (<ipython-input-1-515fb3140b2a>, line 1)

In [None]:
# Colab-only step: calls google.colab's files.upload() and keeps its result in `uploaded`.
# NOTE(review): presumably used to bring "final(dupless).json" into a fresh
# session for the cells below - confirm; `uploaded` is not read anywhere in the visible notebook.
from google.colab import files
uploaded = files.upload()



In [None]:
import json
import matplotlib.pyplot as plt
from collections import Counter

# Load the de-duplicated corpus
with open("final(dupless).json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Count the number of sentences in each sentiment class
sentiment_counts = Counter(entry["sentiment"] for entry in data)

# For consistent order: -1 (negative), 0 (neutral), 1 (positive)
sentiment_labels = {-1: "Negative", 0: "Neutral", 1: "Positive"}
sentiments = [-1, 0, 1]
counts = [sentiment_counts.get(s, 0) for s in sentiments]
labels = [sentiment_labels[s] for s in sentiments]
total_count = sum(counts)

# Plot with the explicit Axes interface
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(labels, counts, color=["red", "gray", "green"])
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of Sentences")
ax.set_title("Sentence Count by Sentiment Class")

# Add value labels above each bar
for bar in bars:
    height = bar.get_height()
    ax.annotate(f'{height}', xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

# Leave headroom above the tallest bar so its value label and the total do not collide.
ax.set_ylim(0, (max(counts) * 1.15) or 1)

# Add the total just below the title. The position is given in axes
# coordinates (0.5 = horizontal centre); the previous data-coordinate x=0.5
# landed between the first two bars instead of the centre of the plot.
ax.text(0.5, 0.97, f'Total Sentences: {total_count}', transform=ax.transAxes,
        ha='center', va='top', fontsize=10, fontweight='bold')

ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


In [None]:
import json

# Read the merged corpus and report how many entries it contains.
with open("final.json", "r", encoding="utf-8") as merged_file:
    data = json.load(merged_file)

total_sentences = len(data)
print(f"Total number of sentences: {total_sentences}")


Total number of sentences: 26711


In [None]:
import os
from collections import Counter


# Label tag found in an input line -> class name used in the printed report.
SENTIMENT_MAP = {
    "@positive": "Positive",
    "@neutral": "Neutral",
    "@negative": "Negative"
}

# Files to scan
file_paths = [
    "Sentences_50Agree.txt",
    "Sentences_66Agree.txt",
    "Sentences_75Agree.txt",
    "Sentences_AllAgree.txt"
]

# Running totals across all files
grand_total = 0
overall_sentiment_counts = Counter()

for file_path in file_paths:
    with open(file_path, "r", encoding="ISO-8859-1") as f:
        lines = f.readlines()

    # Each line counts once, under the first tag (in SENTIMENT_MAP order) it
    # contains; lines without any tag are not counted.
    sentiment_counts = Counter()
    for line in lines:
        matched = next(
            (label for tag, label in SENTIMENT_MAP.items() if tag in line),
            None,
        )
        if matched is not None:
            sentiment_counts[matched] += 1

    total = sum(sentiment_counts.values())
    grand_total += total
    overall_sentiment_counts += sentiment_counts

    # Per-file report
    print(f" {file_path}")
    for sentiment, count in sentiment_counts.items():
        print(f"  {sentiment}: {count}")
    print(f"  Total: {total}\n")

# Report across all files
print(" Overall Total Across All Files:")
for sentiment, count in overall_sentiment_counts.items():
    print(f"  {sentiment}: {count}")
print(f"  Grand Total Sentences: {grand_total}")


📄 Sentences_50Agree.txt
  Neutral: 2879
  Negative: 604
  Positive: 1363
  Total: 4846

📄 Sentences_66Agree.txt
  Neutral: 2535
  Positive: 1168
  Negative: 514
  Total: 4217

📄 Sentences_75Agree.txt
  Neutral: 2146
  Positive: 887
  Negative: 420
  Total: 3453

📄 Sentences_AllAgree.txt
  Neutral: 1391
  Positive: 570
  Negative: 303
  Total: 2264

🧾 Overall Total Across All Files:
  Neutral: 8951
  Negative: 1841
  Positive: 3988
  Grand Total Sentences: 14780


In [None]:
import pandas as pd
from collections import Counter


# Raw integer label in the CSV -> numeric sentiment used in the report.
label_map = {
    0: -1,  # Bearish
    1: 1,   # Bullish
    2: 0    # Neutral
}

# File paths
csv_files = ["sent_train.csv", "sent_valid.csv"]

# Running totals across all files
overall_counts = Counter()
grand_total = 0

for file_path in csv_files:
    df = pd.read_csv(file_path)

    # Translate the raw labels to sentiment values
    df['mapped_sentiment'] = df['label'].map(label_map)

    # Occurrences of each sentiment value, ordered by value
    sentiment_counts = df['mapped_sentiment'].value_counts().sort_index()

    # Fixed reporting order; classes absent from the file count as 0
    per_class = {sentiment: sentiment_counts.get(sentiment, 0) for sentiment in [-1, 0, 1]}

    print(f" {file_path}")
    for sentiment, count in per_class.items():
        print(f"  Sentiment {sentiment}: {count}")
        overall_counts[sentiment] += count
        grand_total += count
    print(f"  Total: {sentiment_counts.sum()}\n")

# Grand total across both files
print(" Overall Total Across Both CSV Files:")
for sentiment in [-1, 0, 1]:
    print(f"  Sentiment {sentiment}: {overall_counts.get(sentiment, 0)}")
print(f"  Grand Total Sentences: {grand_total}")


📄 sent_train.csv
  Sentiment -1: 1442
  Sentiment 0: 6178
  Sentiment 1: 1923
  Total: 9543

📄 sent_valid.csv
  Sentiment -1: 347
  Sentiment 0: 1566
  Sentiment 1: 475
  Total: 2388

🧾 Overall Total Across Both CSV Files:
  Sentiment -1: 1789
  Sentiment 0: 7744
  Sentiment 1: 2398
  Grand Total Sentences: 11931


In [None]:
!pip install nlpaug transformers torch --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m118.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m93.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

import json
from tqdm import tqdm
import nlpaug.augmenter.word as naw
from collections import Counter


In [None]:


import torch  # only needed to check whether a GPU is available


# The corpus was written as UTF-8 with ensure_ascii=False, so read it as UTF-8 explicitly.
with open("final(dupless).json", "r", encoding="utf-8") as f:
    data = json.load(f)


# Corpus-wide token frequencies, used to decide which tokens are "rare".
word_counts = Counter()
for item in data:
    word_counts.update(item["sentence"])


RARE_THRESHOLD = 5
rare_words = {word for word, count in word_counts.items() if count < RARE_THRESHOLD}


# Contextual (BERT) substitution augmenter. Falls back to the CPU when no GPU
# is present instead of failing on a hard-coded 'cuda' device.
bert_aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    top_k=10,       # Top-k words to sample replacements from
    device='cuda' if torch.cuda.is_available() else 'cpu'
)

# WordNet synonym-replacement augmenter.
syn_aug = naw.SynonymAug(aug_src='wordnet')


def contains_rare_words(tokens):
    """Return True if any token occurs fewer than RARE_THRESHOLD times in the corpus."""
    return any(token in rare_words for token in tokens)


def _first_augmented_text(result):
    """Reduce an augmenter result to one string ('' when nothing usable came back).

    augment() may hand back a plain string or a list of strings depending on
    the nlpaug version; calling .split() directly on a list raised an error
    that the old bare `except:` swallowed, silently dropping the sample.
    """
    if isinstance(result, (list, tuple)):
        result = result[0] if result else ""
    return result if isinstance(result, str) else ""


augmented_data = []
failed_augmentations = 0
first_failure = None


for item in tqdm(data, desc="Augmenting"):
    sentence = item["sentence"]
    sentiment = item["sentiment"]
    text = " ".join(sentence)

    # How many augmented copies to request: short sentences get the most,
    # then sentences with rare tokens; long sentences get none.
    if len(sentence) < 5:
        num_augments = 3
    elif contains_rare_words(sentence):
        num_augments = 2
    elif len(sentence) > 15:
        num_augments = 0
    else:
        num_augments = 1

    # The original sample is always kept.
    augmented_data.append(item)

    # Nothing to augment for an empty token list.
    if not text.strip():
        continue

    for i in range(num_augments):
        # Alternate between the two augmenters: even rounds BERT, odd rounds WordNet.
        augmenter = bert_aug if i % 2 == 0 else syn_aug
        try:
            augmented = _first_augmented_text(augmenter.augment(text))
        except Exception as exc:
            # Stay best-effort, but record failures so they can be reported;
            # KeyboardInterrupt is no longer swallowed.
            failed_augmentations += 1
            if first_failure is None:
                first_failure = repr(exc)
            continue

        if augmented:
            augmented_data.append({
                "sentence": augmented.split(),
                "sentiment": sentiment
            })

with open("augmented_final_dupless.json", "w", encoding="utf-8") as out_file:
    json.dump(augmented_data, out_file, indent=2)

if failed_augmentations:
    print(f"Warning: {failed_augmentations} augmentation calls failed (first error: {first_failure})")

print("finished")


SyntaxError: incomplete input (<ipython-input-1-47bb843141f2>, line 1)