In [1]:
import collections
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

from argparse import Namespace

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joshw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\joshw\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
args = Namespace(
    raw_dataset_csv="data/stock/twitter_training.csv",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="data/stock/twitter_training_split.csv",
    seed=1337
)

In [3]:
# Read raw data
column = ['x','y','Sentiment','Sentence']
news = pd.read_csv(args.raw_dataset_csv, header=None, names = column)
news = news.dropna()

replacement_mapping = {-1: 'negative', 1: 'positive'}

# Use the replace method to create a new column with replaced values
#news['Sentiment'] = news['Sentiment'].replace(replacement_mapping)
#news['Sentence'] = news['Text']
news = news[['Sentiment','Sentence']]

news.head()

Unnamed: 0,Sentiment,Sentence
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
from sklearn.utils import resample

# Calculate the minimum count of samples in any category
min_category_count = news['Sentiment'].value_counts().min()

# Initialize an empty DataFrame for the downsampled data
downsampled_data = pd.DataFrame(columns=news.columns)

# Iterate over each category and perform downsampling
for category in news['Sentiment'].unique():
    category_data = news[news['Sentiment'] == category]
    downsampled_category_data = resample(category_data, n_samples=min_category_count, random_state=42)
    downsampled_data = pd.concat([downsampled_data, downsampled_category_data])

# Shuffle the downsampled data to ensure randomness
news = downsampled_data.sample(frac=1, random_state=42)

# Print the count of samples in each category
print(news)


        Sentiment                                           Sentence
21831    Positive          one of my favorite recorded clutchs today
36202    Positive  2012 It Already another year and i forgot to c...
25712     Neutral  Don't miss it at @ increased odyssey... Networ...
51844    Negative  @RockstarGames red dead redemption 2 Besides y...
949      Positive  Y'all stay with Mario Kart and Borderlands. So...
...           ...                                                ...
69869    Positive  While I “can’t wait” for @CyberpunkGame, I‘ll ...
60153  Irrelevant   twice stay winning .  pic.twitter.com/0dr4HMrOX5
20181    Negative  I LOVE everything about this.. I just earned t...
46689    Positive                      i’ve never related so closely
44247     Neutral  VERIZON Z ON SUPER BOWL RETURN FROM FUTURISTIC...

[51500 rows x 2 columns]


In [5]:
# Unique classes
set(news.Sentiment)

{'Irrelevant', 'Negative', 'Neutral', 'Positive'}

In [6]:
# Splitting train by category
# Create dict
by_category = collections.defaultdict(list)
for _, row in news.iterrows():
    by_category[row.Sentiment].append(row.to_dict())

In [7]:
by_category 

defaultdict(list,
            {'Positive': [{'Sentiment': 'Positive',
               'Sentence': 'one of my favorite recorded clutchs today'},
              {'Sentiment': 'Positive',
               'Sentence': '2012 It Already another year and i forgot to congratulate all who are affecting my life and business RhandlerR  RhandlerR RhandlerR  RhandlerR RhandlerR RhandlerR RhandlerR RhandlerR RhandlerR RhandlerR RhandlerR WELCOME TO 2020 pic.twitter.com/r7UAYX4Tjj'},
              {'Sentiment': 'Positive',
               'Sentence': "Y'all stay with Mario Kart and Borderlands. Some shit where you don't have to compete against someone else. Those are the main motherfuckers who hate me. People who play GTA or The Last Of Us."},
              {'Sentiment': 'Positive',
               'Sentence': 'ps5 The looks so NICE, I love white ; _ ;'},
              {'Sentiment': 'Positive',
               'Sentence': 'Red Team Dead Redemption 2 is also the best action game experience I ever had. That B

In [8]:
# Create split data
final_list = []
np.random.seed(args.seed)
for _, item_list in sorted(by_category.items()):
    np.random.shuffle(item_list)
    n = len(item_list)
    n_train = int(args.train_proportion*n)
    n_val = int(args.val_proportion*n)
    n_test = int(args.test_proportion*n)
    
    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    for item in item_list[n_train+n_val:]:
        item['split'] = 'test'  
    
    # Add to final list
    final_list.extend(item_list)

In [9]:
# Write split data to file
final_news = pd.DataFrame(final_list)

In [10]:
final_news.split.value_counts()

split
train    36048
test      7728
val       7724
Name: count, dtype: int64

In [11]:
lemmatizer = WordNetLemmatizer()

# Define a set of stopwords
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
   # Convert text to lowercase
    text = text.lower()    
    # Tokenize the text into words
    words = text.split()
    # Remove stopwords and perform lemmatization
    #words = [word for word in words if word not in stop_words]
    # Join the words back into a single string
    text = ' '.join(words)
    # Add spaces before and after punctuation marks
    text = re.sub(r"([.,!?])", r" \1 ", text)    
    # Remove non-alphanumeric characters
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text) 
    return text

final_news.Sentence = final_news.Sentence.apply(preprocess_text)
print(final_news["Sentence"].isna().sum())

final_news.dropna(subset=["Sentence"], inplace=True)
print(final_news["Sentence"].isna().sum())

0
0


In [12]:
final_news.Sentiment.value_counts()

Sentiment
Irrelevant    12875
Negative      12875
Neutral       12875
Positive      12875
Name: count, dtype: int64

In [13]:
final_news.head()

Unnamed: 0,Sentiment,Sentence,split
0,Irrelevant,i love you katienolan but i think patmcafeesho...,train
1,Irrelevant,"she is sweet , but",train
2,Irrelevant,"as a trump supporter , i m shocked that he s b...",train
3,Irrelevant,resident evil ! ! ! ! ! let s fucking go i m h...,train
4,Irrelevant,ift . tt fufaa neil patel wastes a lot of mone...,train


In [14]:
# Write munged data to CSV
final_news.to_csv(args.output_munged_csv, index=False)