In [11]:
import collections
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

from argparse import Namespace

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joshw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
args = Namespace(
    raw_dataset_csv="data/stock/stock_data.csv",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="data/stock/stock_data_split.csv",
    seed=1337
)

In [13]:
# Read raw data
news = pd.read_csv(args.raw_dataset_csv, header=0)
news = news.dropna()

replacement_mapping = {-1: 'negative', 1: 'positive'}

# Use the replace method to create a new column with replaced values
news['Sentiment'] = news['Sentiment'].replace(replacement_mapping)
news['Sentence'] = news['Text']
news = news[['Sentiment','Sentence']]

news.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [4]:
# Unique classes
set(news.Sentiment)

{'negative', 'positive'}

In [5]:
# Splitting train by category
# Create dict
by_category = collections.defaultdict(list)
for _, row in news.iterrows():
    by_category[row.Sentiment].append(row.to_dict())

In [6]:
by_category 

defaultdict(list,
            {'negative': [{'Sentiment': 'negative',
               'Sentence': 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing '},
              {'Sentiment': 'negative',
               'Sentence': "At the request of Finnish media company Alma Media 's newspapers , research manager Jari Kaivo-oja at the Finland Futures Research Centre at the Turku School of Economics has drawn up a future scenario for Finland 's national economy by using a model developed by the University of Denver "},
              {'Sentiment': 'negative',
               'Sentence': 'STOCK EXCHANGE ANNOUNCEMENT 20 July 2006 1 ( 1 ) BASWARE SHARE SUBSCRIPTIONS WITH WARRANTS AND INCREASE IN SHARE CAPITAL A total of 119 850 shares have been subscribed with BasWare Warrant Program '},
              {'Sentiment': 'negative',
               'Sentence': 'A maximum of 666,104 new shares can further be subscribed for by exercisin

In [7]:
# Create split data
final_list = []
np.random.seed(args.seed)
for _, item_list in sorted(by_category.items()):
    np.random.shuffle(item_list)
    n = len(item_list)
    n_train = int(args.train_proportion*n)
    n_val = int(args.val_proportion*n)
    n_test = int(args.test_proportion*n)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'  
    
    # Add to final list
    final_list.extend(item_list)

In [8]:
# Write split data to file
final_news = pd.DataFrame(final_list)

In [9]:
final_news.split.value_counts()

split
train    76124
test     16315
val      16311
Name: count, dtype: int64

In [14]:
lemmatizer = WordNetLemmatizer()

# Define a set of stopwords
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
   # Convert text to lowercase
    text = text.lower()    
    # Tokenize the text into words
    words = text.split()
    # Remove stopwords and perform lemmatization
    words = [word for word in words if word not in stop_words]
    # Join the words back into a single string
    text = ' '.join(words)
    # Add spaces before and after punctuation marks
    text = re.sub(r"([.,!?])", r" \1 ", text)    
    # Remove non-alphanumeric characters
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text) 
    return text

final_news.Sentence = final_news.Sentence.apply(preprocess_text)
print(final_news["Sentence"].isna().sum())

final_news.dropna(subset=["Sentence"], inplace=True)
print(final_news["Sentence"].isna().sum())

0
0


In [15]:
final_news.Sentiment.value_counts()

Sentiment
positive    55724
negative    53026
Name: count, dtype: int64

In [16]:
final_news.head()

Unnamed: 0,Sentiment,Sentence,split
0,negative,euro could affect,train
1,negative,audio diary immigration western us,train
2,negative,wonder zamora eyed top clubs,train
3,negative,"letters , april",train
4,negative,archives show churchill ordered ufo cover up,train


In [None]:
# Write munged data to CSV
final_news.to_csv(args.output_munged_csv, index=False)