In [3]:
import glob
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import pandas as pd


In [4]:
# English Snowball stemmer and English stop-word set, both provided by NLTK.
# NOTE(review): neither `st` nor `stop_words` is referenced in the cells visible
# here -- presumably used further down the notebook; confirm or remove.
# NOTE(review): stopwords.words() presumably needs the NLTK "stopwords" corpus to
# be downloaded beforehand -- confirm the environment setup.
st = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

def clean_data(df, col, clean_col, label):
    """Normalise the text in ``col`` into ``clean_col`` and tag every row with ``label``.

    Each value of ``col`` is processed as follows:

    1. a missing value becomes ``""`` (rather than the literal word "nan");
    2. the text is lower-cased and surrounding whitespace is stripped;
    3. every character other than an ASCII letter or ``'`` is replaced by a
       space (this removes digits as well as punctuation);
    4. runs of spaces are collapsed into a single space.

    Afterwards the raw text column and the ``body`` / ``created_utc`` /
    ``created`` columns are dropped if they exist, and a constant ``label``
    column is added. The caller's frame is left untouched.

    Args:
        df: input DataFrame containing the column ``col``.
        col: name of the column holding the raw text.
        clean_col: name of the column that receives the normalised text.
        label: value written to the ``label`` column of every row.

    Returns:
        A new DataFrame with the remaining original columns, ``clean_col``
        and ``label``.

    Raises:
        KeyError: if ``col`` is not a column of ``df``.
    """
    def _normalise(value):
        # Missing text must not turn into the word "nan".
        if pd.isna(value):
            return ''
        text = str(value).lower().strip()
        # Anything that is not a letter or an apostrophe becomes a space.
        text = re.sub('[^a-zA-Z\']', ' ', text)
        # NOTE(review): punctuation at the edges leaves one leading/trailing
        # space here. It is deliberately not stripped: a punctuation-only row
        # stays ' ' instead of '', and an empty CSV field is read back as NaN
        # by pd.read_csv.
        return re.sub(' +', ' ', text)

    cleaned = df[col].map(_normalise)
    result = df.assign(**{clean_col: cleaned})

    # Drop the raw text (whatever it is called) plus the timestamp columns;
    # errors='ignore' tolerates inputs that lack some of them.
    unwanted = list(dict.fromkeys([col, 'body', 'created_utc', 'created']))
    result = result.drop(columns=unwanted, errors='ignore')
    return result.assign(label=label)

In [7]:
# Clean the raw dumps of each sentiment class and write one labelled CSV per class.
for label in ['fear', 'greed']:

    path = f'data/{label}' # use your path

    # glob returns matches in arbitrary order; sorting keeps the concatenated
    # row order identical from run to run.
    all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not all_files:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No CSV files found in {path!r} for label {label!r}")

    df = pd.concat(
        (
            pd.read_csv(f, on_bad_lines='skip', header=0, engine='python', delimiter='|', index_col=False)
            for f in all_files
        ),
        ignore_index=True,
    )
    df_cleaned = clean_data(df, 'body', 'clean_body', label)

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs("data/cleaned_labeled", exist_ok=True)
    df_cleaned.to_csv(f"data/cleaned_labeled/{label}.csv", sep="|", index=False)

In [6]:
    # Folder holding the per-label cleaned CSVs (greed.csv and fear.csv).
    path = f'data/cleaned_labeled/' # use your path
    # NOTE(review): all_files is not used by the cells visible here -- confirm it is needed.
    all_files = glob.glob(os.path.join(path, "*.csv"))
    # Load both classes with identical parser settings, greed first, then fear.
    greed_df, fear_df = (
        pd.read_csv(
            path + csv_name,
            on_bad_lines='skip',
            header=0,
            engine='python',
            delimiter='|',
            index_col=False,
        )
        for csv_name in ("greed.csv", "fear.csv")
    )

In [4]:
# Balance the two classes by downsampling the larger one to the size of the
# smaller. A fixed random_state makes the draw repeatable across kernel
# restarts, so the merged dataset is reproducible.
if greed_df.shape[0] > fear_df.shape[0]:
    greed_df = greed_df.sample(n=fear_df.shape[0], random_state=42)
else:
    fear_df = fear_df.sample(n=greed_df.shape[0], random_state=42)
# Stack the balanced classes (greed rows first) under a fresh 0..n-1 index.
df = pd.concat([greed_df,fear_df], ignore_index=True)

In [5]:
# Persist the balanced, merged dataset as a pipe-delimited file without the index column.
df.to_csv(
    "data/merged_labeled_cleaned.csv",
    sep="|",
    index=False,
)

# Minimal Dataset
Because the full dataset was far larger than the computing power available to me,
I ran the experiments on a 1% random sample (`frac=0.01`) of each balanced class.

In [8]:
# Balance the classes first so this cell also works when run directly after
# loading. A fixed random_state keeps every draw below repeatable.
if greed_df.shape[0] > fear_df.shape[0]:
    greed_df = greed_df.sample(n=fear_df.shape[0], random_state=42)
else:
    fear_df = fear_df.sample(n=greed_df.shape[0], random_state=42)

# Keep 1% of each class for the small experimental dataset.
small_greed_df = greed_df.sample(frac=0.01, random_state=42)
small_fear_df = fear_df.sample(frac=0.01, random_state=42)
df = pd.concat([small_greed_df,small_fear_df], ignore_index=True)
# Keep only texts with more than 5 words. Non-string values (e.g. NaN, which
# read_csv produces for empty fields and for texts such as "nan" or "null")
# are filtered out instead of crashing on .split().
df = df[df["clean_body"].apply(lambda x: isinstance(x, str) and len(x.split()) > 5)]
df.to_csv("data/small_merged_labeled_cleaned.csv", index=False, sep="|")