# FinBert neutral positive bad

In [3]:
import os
import pandas as pd
import nltk
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer

# Load the FinBERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

# Specify the directory containing the text files
directory = r"C:\Users\Rober\Personal Projects\STOCK MONEY\Scrapers\Data\2022\April\22"

def load_data(directory):
    files = [f for f in os.listdir(directory) if f.endswith('.txt')]
    texts = []
    for file in files:
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            texts.append((f.read(), file))
    return texts, files

def segment_text(text, max_length=512, overlap=50):
    sentences = nltk.sent_tokenize(text)
    segments = []
    segment = ""
    for sentence in sentences:
        if len(tokenizer.tokenize(segment + sentence)) <= max_length:
            segment += " " + sentence
        else:
            segments.append(segment)
            segment = sentence[max(0, len(sentence) - overlap):]
    if segment:
        segments.append(segment)
    return segments

def process_segments(segments):
    # Tokenize all segments in a single batch, with padding enabled
    inputs = tokenizer(
        segments,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    return inputs['input_ids'], inputs['attention_mask']


def get_predictions(model, dataloader):
    model.eval()
    predictions = []
    for batch in dataloader:
        input_ids, attention_mask = [t.to(device) for t in batch]  # Move data to the device
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        print(f"Batch size: {len(input_ids)}, Predictions: {preds}")
        predictions.extend(preds.cpu())  # Move predictions back to CPU for further processing
    return predictions

texts, files = load_data(directory)
# Initialize empty lists to hold file names and predicted labels
all_files = []
all_predicted_labels = []

# Process each file, generate predictions, and gather the results
for text, file in zip(texts, files):
    segments = segment_text(text[0])  # text is a tuple of (text_content, file_name)
    input_ids_tensor, attention_masks_tensor = process_segments(segments)
    dataset = TensorDataset(input_ids_tensor, attention_masks_tensor)
    dataloader = DataLoader(dataset, batch_size=4)
    predictions = get_predictions(model, dataloader)
    # Get the most common prediction for each file
    most_common_prediction = max(set(predictions), key=predictions.count)
    # Update the lists with the file name and the most common prediction
    all_files.append(file)
    all_predicted_labels.append(label_map[most_common_prediction.item()])

# Create a DataFrame to hold the results
results_df = pd.DataFrame({'File': all_files, 'Predicted Sentiment': all_predicted_labels})

# Optionally, save the results to a CSV file
results_df.to_csv('classification_results.csv', index=False)

# Print the results
print(results_df)



Batch size: 4, Predictions: tensor([1, 1, 1, 1])
Batch size: 1, Predictions: tensor([1])
Batch size: 4, Predictions: tensor([1, 1, 1, 1])
Batch size: 4, Predictions: tensor([1, 1, 1, 1])
Batch size: 4, Predictions: tensor([0, 2, 2, 1])
Batch size: 4, Predictions: tensor([1, 2, 2, 2])
Batch size: 2, Predictions: tensor([2, 2])
                                                File Predicted Sentiment
0  Bitcoin_Extends_Pullback;_Support_at_$37K,_Res...             neutral
1  Retail_Interest_in_Bitcoin_Is_Dwindling,_Googl...             neutral
2  Seized_Silk_Road_Bitcoin_to_Clear_Ross_Ulbrich...            positive


# Positive Negative CryptoBert

In [2]:
import os
import pandas as pd
import nltk
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Load the CryptoBERT model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('ElKulako/cryptobert')
tokenizer = RobertaTokenizer.from_pretrained('ElKulako/cryptobert')

# Specify the directory containing the text files
directory = r"C:\Users\Rober\Personal Projects\STOCK MONEY\Scrapers\Data\2022\April\22"

def load_data(directory):
    files = [f for f in os.listdir(directory) if f.endswith('.txt')]
    texts = []
    for file in files:
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            texts.append((f.read(), file))
    return texts, files

def segment_text(text, max_length=512, overlap=50):
    sentences = nltk.sent_tokenize(text)
    segments = []
    segment = ""
    for sentence in sentences:
        if len(tokenizer.tokenize(segment + sentence)) <= max_length:
            segment += " " + sentence
        else:
            segments.append(segment)
            segment = sentence[max(0, len(sentence) - overlap):]
    if segment:
        segments.append(segment)
    return segments

def process_segments(segments):
    # Tokenize all segments in a single batch, with padding enabled
    inputs = tokenizer(
        segments,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    return inputs['input_ids'], inputs['attention_mask']

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_predictions(model, dataloader):
    model.eval()
    predictions = []
    for batch in dataloader:
        input_ids, attention_mask = [t.to(device) for t in batch]  # Move data to the device
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        print(f"Batch size: {len(input_ids)}, Predictions: {preds}")
        predictions.extend(preds.cpu())  # Move predictions back to CPU for further processing
    return predictions

texts, files = load_data(directory)
# Initialize empty lists to hold file names and predicted labels
all_files = []
all_predicted_labels = []

# Define label_map to map numerical predictions to sentiment labels
label_map = {0: 'Negative', 1: 'Positive'}

# Process each file, generate predictions, and gather the results
for text, file in zip(texts, files):
    segments = segment_text(text[0])  # text is a tuple of (text_content, file_name)
    input_ids_tensor, attention_masks_tensor = process_segments(segments)
    dataset = TensorDataset(input_ids_tensor, attention_masks_tensor)
    dataloader = DataLoader(dataset, batch_size=4)
    predictions = get_predictions(model, dataloader)
    # Get the most common prediction for each file
    most_common_prediction = max(set(predictions), key=predictions.count)
    # Update the lists with the file name and the most common prediction
    all_files.append(file)
    all_predicted_labels.append(label_map[most_common_prediction.item()])

# Create a DataFrame to hold the results
results_df = pd.DataFrame({'File': all_files, 'Predicted Sentiment': all_predicted_labels})

# Optionally, save the results to a CSV file
results_df.to_csv('classification_results.csv', index=False)

# Print the results
print(results_df)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Batch size: 4, Predictions: tensor([1, 1, 1, 1], device='cuda:0')
Batch size: 1, Predictions: tensor([1], device='cuda:0')
Batch size: 4, Predictions: tensor([2, 1, 1, 2], device='cuda:0')
Batch size: 4, Predictions: tensor([2, 2, 1, 1], device='cuda:0')
Batch size: 3, Predictions: tensor([2, 1, 1], device='cuda:0')
Batch size: 4, Predictions: tensor([1, 1, 1, 1], device='cuda:0')
Batch size: 1, Predictions: tensor([1], device='cuda:0')
                                                File Predicted Sentiment
0  Bitcoin_Extends_Pullback;_Support_at_$37K,_Res...            Positive
1  Retail_Interest_in_Bitcoin_Is_Dwindling,_Googl...            Positive
2  Seized_Silk_Road_Bitcoin_to_Clear_Ross_Ulbrich...            Positive


In [3]:
import os
import pandas as pd
import nltk
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Load the RoBERTa model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('j-hartmann/sentiment-roberta-large-english-3-classes')
tokenizer = RobertaTokenizer.from_pretrained('j-hartmann/sentiment-roberta-large-english-3-classes')

# Specify the directory containing the text files
directory = r"C:\Users\Rober\Personal Projects\STOCK MONEY\Scrapers\Data\2022\April\22"

def load_data(directory):
    files = [f for f in os.listdir(directory) if f.endswith('.txt')]
    texts = []
    for file in files:
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            texts.append((f.read(), file))
    return texts, files

def segment_text(text, max_length=512, overlap=50):
    sentences = nltk.sent_tokenize(text)
    segments = []
    segment = ""
    for sentence in sentences:
        if len(tokenizer.tokenize(segment + sentence)) <= max_length:
            segment += " " + sentence
        else:
            segments.append(segment)
            segment = sentence[max(0, len(sentence) - overlap):]
    if segment:
        segments.append(segment)
    return segments

def process_segments(segments):
    # Tokenize all segments in a single batch, with padding enabled
    inputs = tokenizer(
        segments,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    return inputs['input_ids'], inputs['attention_mask']

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_predictions(model, dataloader):
    model.eval()
    predictions = []
    for batch in dataloader:
        input_ids, attention_mask = [t.to(device) for t in batch]  # Move data to the device
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        predictions.extend(preds.cpu())  # Move predictions back to CPU for further processing
    return predictions

texts, files = load_data(directory)
# Initialize empty lists to hold file names and predicted labels
all_files = []
all_predicted_labels = []

# Process each file, generate predictions, and gather the results
for text, file in zip(texts, files):
    segments = segment_text(text[0])  # text is a tuple of (text_content, file_name)
    input_ids_tensor, attention_masks_tensor = process_segments(segments)
    dataset = TensorDataset(input_ids_tensor, attention_masks_tensor)
    dataloader = DataLoader(dataset, batch_size=4)
    predictions = get_predictions(model, dataloader)
    # Get the most common prediction for each file
    label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}  # Define the mapping between label IDs and label names
    most_common_prediction = max(set(predictions), key=predictions.count)
    # Update the lists with the file name and the most common prediction
    all_files.append(file)
    all_predicted_labels.append(label_map[most_common_prediction.item()])

# Create a DataFrame to hold the results
results_df = pd.DataFrame({'File': all_files, 'Predicted Sentiment': all_predicted_labels})

# Optionally, save the results to a CSV file
results_df.to_csv('classification_results.csv', index=False)

# Print the results
print(results_df)


Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at j-hartmann/sentiment-roberta-large-english-3-classes were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

                                                File Predicted Sentiment
0  Bitcoin_Extends_Pullback;_Support_at_$37K,_Res...             neutral
1  Retail_Interest_in_Bitcoin_Is_Dwindling,_Googl...             neutral
2  Seized_Silk_Road_Bitcoin_to_Clear_Ross_Ulbrich...             neutral


In [1]:
import os
import pandas as pd
import nltk
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer

# Load the FinBERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

# Specify the directory containing the text files
directory = r"C:\Users\Rober\Personal Projects\STOCK MONEY\Scrapers\Data\2023\September"

def load_data(directory):
    texts = []
    for root, dirs, files in os.walk(directory):
        print(dirs)
        for file in files:
            if file.endswith('.txt'):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    texts.append((f.read(), os.path.basename(root), file))
    return texts


def segment_text(text, max_length=512, overlap=50):
    sentences = nltk.sent_tokenize(text)
    segments = []
    segment = ""
    for sentence in sentences:
        if len(tokenizer.tokenize(segment + sentence)) <= max_length:
            segment += " " + sentence
        else:
            segments.append(segment)
            segment = sentence[max(0, len(sentence) - overlap):]
    if segment:
        segments.append(segment)
    return segments

def process_segments(segments):
    # Tokenize all segments in a single batch, with padding enabled
    inputs = tokenizer(
        segments,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    return inputs['input_ids'], inputs['attention_mask']


def get_predictions(model, dataloader):
    model.eval()
    predictions = []
    for batch in dataloader:
        input_ids, attention_mask = [t.to(device) for t in batch]  # Move data to the device
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        print(f"Batch size: {len(input_ids)}, Predictions: {preds}")
        predictions.extend(preds.cpu())  # Move predictions back to CPU for further processing
    return predictions

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

texts = load_data(directory)

# Initialize empty lists to hold file names, predicted labels, and dates
all_files = []
all_predicted_labels = []
all_days = []

# Initialize Counters for daily and overall sentiment counts
day_counts = {}
overall_counts = Counter()

label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Process each file, generate predictions, and gather the results
for text, day, file in texts:
    segments = segment_text(text)
    input_ids_tensor, attention_masks_tensor = process_segments(segments)
    dataset = TensorDataset(input_ids_tensor, attention_masks_tensor)
    dataloader = DataLoader(dataset, batch_size=4)
    predictions = get_predictions(model, dataloader)
    prediction_labels = [label_map[pred.item()] for pred in predictions]
    most_common_prediction = max(set(predictions), key=predictions.count)
    day_counts.setdefault(day, Counter()).update(prediction_labels)
    overall_counts.update(prediction_labels)
    all_files.append(file)
    all_predicted_labels.append(label_map[most_common_prediction.item()])
    all_days.append(day)

# Create a DataFrame to hold the results
results_df = pd.DataFrame({'File': all_files, 'Day': all_days, 'Predicted Sentiment': all_predicted_labels})

# Optionally, save the results to a CSV file
results_df.to_csv('classification_results.csv', index=False)

# Print the results
print(results_df)
print(f"Overall counts for the month: {overall_counts}")
for day, count in day_counts.items():
    print(f"Counts for {day}: {count}")


SyntaxError: invalid syntax (2712906331.py, line 106)

# Average sentiment score

In [16]:
import os
import pandas as pd
import nltk
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer
import torch.nn.functional as F  # Import the functional module from PyTorch
from collections import defaultdict
import numpy as np  #h
# Load the FinBERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

# Specify the directory containing the text files
directory = r"C:\Users\Rober\Personal Projects\STOCK MONEY\Scrapers\Data\2023\June"

def load_data(directory):
    texts = []
    for root, dirs, files in os.walk(directory):
        print(dirs)
        for file in files:
            if file.endswith('.txt'):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    texts.append((f.read(), os.path.basename(root), file))
    return texts


def segment_text(text, max_length=512, overlap=50):
    sentences = nltk.sent_tokenize(text)
    segments = []
    segment = ""
    for sentence in sentences:
        if len(tokenizer.tokenize(segment + sentence)) <= max_length:
            segment += " " + sentence
        else:
            segments.append(segment)
            segment = sentence[max(0, len(sentence) - overlap):]
    if segment:
        segments.append(segment)
    return segments

def process_segments(segments):
    # Tokenize all segments in a single batch, with padding enabled
    inputs = tokenizer(
        segments,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    return inputs['input_ids'], inputs['attention_mask']



def get_predictions(model, dataloader):
    model.eval()
    sentiment_scores = []  # List to hold the sentiment scores
    for batch in dataloader:
        input_ids, attention_mask = [t.to(device) for t in batch]  # Move data to the device
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = F.softmax(logits, dim=1)  # Convert logits to probabilities
        # Calculate sentiment score: prob(positive) - prob(negative)
        scores = probs[:, 2] - probs[:, 0]
        sentiment_scores.extend(scores.cpu())  # Move scores back to CPU for further processing
    return sentiment_scores
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

texts = load_data(directory)

# Initialize empty lists to hold file names, predicted labels, and dates
all_files = []
all_predicted_labels = []
all_days = []
day_scores = defaultdict(list)
monthly_scores = []

# Initialize Counters for daily and overall sentiment counts
day_counts = {}
overall_counts = Counter()

label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Process each file, generate predictions, and gather the results
for text, day, file in texts:
    segments = segment_text(text)
    input_ids_tensor, attention_masks_tensor = process_segments(segments)
    dataset = TensorDataset(input_ids_tensor, attention_masks_tensor)
    dataloader = DataLoader(dataset, batch_size=4)
    sentiment_scores = get_predictions(model, dataloader)  # Get the sentiment scores
    avg_sentiment_score = np.mean(sentiment_scores)  # Get the average sentiment score
    day_scores[day].append(avg_sentiment_score)  # Store the average sentiment score
    monthly_scores.append(avg_sentiment_score)
    all_files.append(file)
    all_days.append(day)

# Create a DataFrame to hold the results
results_df = pd.DataFrame({'File': all_files, 'Day': all_days, 'Average Sentiment Score': monthly_scores})

# Optionally, save the results to a CSV file
results_df.to_csv('classification_results_test.csv', index=False)

# Print the results
print(results_df)
print(f'Average sentiment score for the month: {np.mean(monthly_scores)}')
for day, scores in day_scores.items():
    print(f'Average sentiment score for {day}: {np.mean(scores)}')


['1', '10', '12', '13', '14', '15', '16', '19', '2', '20', '21', '22', '23', '26', '27', '28', '29', '30', '5', '6', '7', '8', '9']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
                                                  File Day  \
0    Bitcoin_Lingers_Under_27K_to_Continue_Its_May_...   1   
1    First_Mover_Americas_Bitcoin_Begins_June_Dropp...   1   
2    First_Mover_Asia_Bitcoin_Settles_Above_271K_Af...   1   
3    Litecoin_Starts_June_Strong_as_Investors_Eye_A...   1   
4    On_Heels_of_First_Losing_Month_of_2023_Bitcoin...   1   
..                                                 ...  ..   
190  Bitcoin_Payments_Firm_Strike_Moves_Custody_InH...   9   
191  Bitcoin_Trades_at_Narrow_Discount_on_BinanceUS...   9   
192  Datos_de_empleo_en_EE_UU_dejaron_una_débil_esp...   9   
193  First_Mover_Americas_BinanceUS_Suspends_Dollar...   9   
194  First_Mover_Asia_Bitcoin_Remains_Resilient_Nea...   9   

     Average Sentiment Score  
0                  -0.1

In [17]:
import os
import pandas as pd
import nltk
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer
import torch.nn.functional as F  # Import the functional module from PyTorch
from collections import defaultdict
import numpy as np  #h
# Load the FinBERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

# Specify the directory containing the text files
directory = r"C:\Users\Rober\Personal Projects\STOCK MONEY\Scrapers\Data\2023\July"

def load_data(directory):
    texts = []
    for root, dirs, files in os.walk(directory):
        print(dirs)
        for file in files:
            if file.endswith('.txt'):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    texts.append((f.read(), os.path.basename(root), file))
    return texts


def segment_text(text, max_length=512, overlap=50):
    sentences = nltk.sent_tokenize(text)
    segments = []
    segment = ""
    for sentence in sentences:
        if len(tokenizer.tokenize(segment + sentence)) <= max_length:
            segment += " " + sentence
        else:
            segments.append(segment)
            segment = sentence[max(0, len(sentence) - overlap):]
    if segment:
        segments.append(segment)
    return segments

def process_segments(segments):
    # Tokenize all segments in a single batch, with padding enabled
    inputs = tokenizer(
        segments,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    return inputs['input_ids'], inputs['attention_mask']



def get_predictions(model, dataloader):
    model.eval()
    sentiment_scores = []  # List to hold the sentiment scores
    for batch in dataloader:
        input_ids, attention_mask = [t.to(device) for t in batch]  # Move data to the device
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = F.softmax(logits, dim=1)  # Convert logits to probabilities
        # Calculate sentiment score: prob(positive) - prob(negative)
        scores = probs[:, 2] - probs[:, 0]
        sentiment_scores.extend(scores.cpu())  # Move scores back to CPU for further processing
    return sentiment_scores
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

texts = load_data(directory)

# Initialize empty lists to hold file names, predicted labels, and dates
all_files = []
all_predicted_labels = []
all_days = []
day_scores = defaultdict(list)
monthly_scores = []

# Initialize Counters for daily and overall sentiment counts
day_counts = {}
overall_counts = Counter()

label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# Process each file, generate predictions, and gather the results
for text, day, file in texts:
    segments = segment_text(text)
    input_ids_tensor, attention_masks_tensor = process_segments(segments)
    dataset = TensorDataset(input_ids_tensor, attention_masks_tensor)
    dataloader = DataLoader(dataset, batch_size=4)
    sentiment_scores = get_predictions(model, dataloader)  # Get the sentiment scores
    avg_sentiment_score = np.mean(sentiment_scores)  # Get the average sentiment score
    day_scores[day].append(avg_sentiment_score)  # Store the average sentiment score
    monthly_scores.append(avg_sentiment_score)
    all_files.append(file)
    all_days.append(day)

# Create a DataFrame to hold the results
results_df = pd.DataFrame({'File': all_files, 'Day': all_days, 'Average Sentiment Score': monthly_scores})

# Optionally, save the results to a CSV file
results_df.to_csv('classification_results.csv', index=False)

# Print the results
print(results_df)
print(f'Average sentiment score for the month: {np.mean(monthly_scores)}')
for day, scores in day_scores.items():
    print(f'Average sentiment score for {day}: {np.mean(scores)}')


['10', '11', '12', '13', '14', '17', '18', '19', '20', '21', '24', '25', '26', '27', '28', '3', '31', '4', '5', '6', '7']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
                                                  File Day  \
0    Apple_May_Not_Like_It_but_Zapple_Pay_Finds_Wor...  10   
1    Bitcoin_Could_Rise_to_120K_by_End2024_Standard...  10   
2    Bitcoin_Falls_Back_Below_31K_After_Late_Monday...  10   
3    Bitcoin_podría_alcanzar_los_US120K_a_finales_d...  10   
4    Bitcoin_Steady_Above_30K_as_China_Factory_Defl...  10   
..                                                 ...  ..   
130  Bitcoin_Retakes_30K_Asian_Stocks_Hit_5Week_Low...   7   
131  BlackRock_CEOs_Turnabout_on_Bitcoin_Elicits_Ch...   7   
132  First_Mover_Asia_Bitcoin_Whales_Are_Increasing...   7   
133  Storj_Filecoin_and_Solana_Lead_First_Week_of_J...   7   
134  US_Added_209K_Jobs_in_June_Missing_Expectation...   7   

     Average Sentiment Score  
0                   0.750995  
1       

# Classify 2022


In [1]:
import os
import pandas as pd
import nltk
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer
import torch.nn.functional as F  # Import the functional module from PyTorch
from collections import defaultdict
import numpy as np  #h
# Load the FinBERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
# Specify the directory containing the text files
directory = r"C:\Users\Rober\Personal Projects\STOCK MONEY\Scrapers\Data\2022"


def load_data(directory):
    texts = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.txt'):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    texts.append((f.read(), os.path.basename(root), file))
    return texts


def segment_text(text, max_length=512, overlap=50):
    sentences = nltk.sent_tokenize(text)
    segments = []
    segment = ""
    for sentence in sentences:
        if len(tokenizer.tokenize(segment + sentence)) <= max_length:
            segment += " " + sentence
        else:
            segments.append(segment)
            segment = sentence[max(0, len(sentence) - overlap):]
    if segment:
        segments.append(segment)
    return segments

def process_segments(segments):
    # Tokenize all segments in a single batch, with padding enabled
    inputs = tokenizer(
        segments,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    )
    return inputs['input_ids'], inputs['attention_mask']

def process_month(directory):
    texts = load_data(directory)
    all_files = []
    all_predicted_labels = []
    all_days = []
    day_scores = defaultdict(list)
    monthly_scores = []
    
    # ... (rest of your processing logic)
    
    # Create a DataFrame to hold the results
    results_df = pd.DataFrame({'File': all_files, 'Day': all_days, 'Average Sentiment Score': monthly_scores})
    
    # Save the results to a CSV file named after the month
    month_name = os.path.basename(directory)
    results_df.to_csv(f'classification_results_{month_name}.csv', index=False)

def get_predictions(model, dataloader):
    model.eval()
    sentiment_scores = []  # List to hold the sentiment scores
    for batch in dataloader:
        input_ids, attention_mask = [t.to(device) for t in batch]  # Move data to the device
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = F.softmax(logits, dim=1)  # Convert logits to probabilities
        # Calculate sentiment score: prob(positive) - prob(negative)
        scores = probs[:, 2] - probs[:, 0]
        sentiment_scores.extend(scores.cpu())  # Move scores back to CPU for further processing
    return sentiment_scores
def process_texts(texts):
    all_files = []
    all_days = []
    day_scores = defaultdict(list)
    monthly_scores = []

    for text, day, file in texts:
        segments = segment_text(text)
        input_ids_tensor, attention_masks_tensor = process_segments(segments)
        dataset = TensorDataset(input_ids_tensor, attention_masks_tensor)
        dataloader = DataLoader(dataset, batch_size=4)
        sentiment_scores = get_predictions(model, dataloader)
        avg_sentiment_score = np.mean(sentiment_scores)
        day_scores[day].append(avg_sentiment_score)
        monthly_scores.append(avg_sentiment_score)
        all_files.append(file)
        all_days.append(day)
        
    return all_files, all_days, monthly_scores, day_scores

# Function to process each month
def process_month(directory):
    texts = load_data(directory)
    all_files, all_days, monthly_scores, day_scores = process_texts(texts)
    
    # Create the new directory if it doesn't exist
    new_dir = os.path.join(directory, '2022_classification')
    os.makedirs(new_dir, exist_ok=True)  # 'exist_ok=True' will prevent an error if the directory already exists
    
    # Adjust the file path to include the new directory
    month_name = os.path.basename(directory)
    results_file_path = os.path.join(new_dir, f'classification_results_{month_name}.csv')
    results_df = pd.DataFrame({'File': all_files, 'Day': all_days, 'Average Sentiment Score': monthly_scores})
    results_df.to_csv(results_file_path, index=False)


# Main loop to process each month
def main(directory):
    months = [os.path.join(directory, month_dir) for month_dir in os.listdir(directory) if os.path.isdir(os.path.join(directory, month_dir))]
    for month_directory in months:
        process_month(month_directory)

# Set device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Call main function to process each month
main(directory)


['1', '10', '11', '12', '13', '14', '17', '18', '19', '20', '21', '22', '24', '25', '26', '27', '28', '29', '3', '4', '5', '6', '7', '8']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['1', '10', '11', '12', '14', '15', '16', '17', '18', '19', '2', '21', '22', '23', '24', '25', '26', '28', '29', '3', '30', '31', '4', '5', '7', '8', '9']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['1', '11', '12', '13', '14', '15', '16', '18', '19', '2', '20', '21', '22', '23', '25', '26', '27', '28', '29', '30', '4', '5', '6', '7', '8', '9']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['1', '10', '11', '13', '14', '15', '16', '17', '18', '2', '20', '21', '22', '23', '24', '25', '28', '3', '4', '7', '8', '9']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['10', '11', '12', '13', '14', '16', '18', '19', '20', '21', '24', '25', '26', '27', '28', '29', '3', '31', '4', '5', '6', '7', '8', '9