# Importing Modules

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import shutil

# Loading the raw data

In [2]:
cnbc_df = pd.read_csv("Data/financial/cnbc_headlines.csv")
guardian_df = pd.read_csv("Data/financial/guardian_headlines.csv")
reuters_df = pd.read_csv("Data/financial/reuters_headlines.csv")

In [3]:
cnbc_hl = cnbc_df["Headlines"].rename({"Headlines": "text"}, axis=1)
guardian_hl = guardian_df["Headlines"].rename({"Headlines": "text"}, axis=1)
reuters_hl = reuters_df["Headlines"].rename({"Headlines": "text"}, axis=1)

In [4]:
labeled_df = pd.read_csv("Data/financial/fin_data.csv", encoding = "ISO-8859-1")

In [5]:
labeled_df

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


# Cleaning the data - removing rows with not unicode characters

In [6]:
mask = []
n_errors = 0

for row in range(len(labeled_df)):

    text = labeled_df.iloc[row, 1]
    
    file = open(f'temp.txt', "w")

    try:

        file.write(text)
        mask.append(True)

    except UnicodeEncodeError:
        
        mask.append(False)
        n_errors += 1



In [7]:
print(n_errors)
print(len(mask))

74
4846


In [8]:
labeled_df = labeled_df[mask]
labeled_df

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [9]:
labeled_hl = labeled_df["text"]

In [10]:
financial_df = pd.concat([cnbc_hl, guardian_hl, reuters_hl, labeled_hl], axis=0).reset_index(drop=True)

In [14]:
financial_df = financial_df.dropna()

In [15]:
financial_df.to_csv("Data/financial/fin_cleaned_data_large_unlabeled.csv", index=False)

# Creating balanced train and test datasets

In [16]:
def build_balanced_ds(df, text_col, sent_col, list_labels=["positive", "negative"], ds_size=599*2, test_size=0.5, random_state=0):

    container = []
    label_size = int(ds_size/len(list_labels))
    split_index = int(np.floor(ds_size*test_size/len(list_labels)))

    for label in list_labels:

        label_df = (df[df.iloc[:, sent_col]==label]
                    .sample(label_size, random_state=random_state)
                    )

        label_test_df = label_df.iloc[0:split_index, :]
        label_test_df["set"] = "test"
        label_train_df = label_df.iloc[split_index:, :]
        label_train_df["set"] = "train"

        container.append(label_test_df)
        container.append(label_train_df)
        

    output_df = (pd.concat(container, axis=0)
                .sample(frac=1, random_state=random_state)
                .reset_index(drop=True)
                )

    return output_df

In [17]:
labeled_red_df = build_balanced_ds(labeled_df, 1, 0, test_size=0.3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_test_df["set"] = "test"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_train_df["set"] = "train"


In [18]:
labeled_red_df
labeled_red_df.to_csv("Data/financial/fin_red_data.csv", index=False)

In [25]:
labeled_red_df["sentiment"].value_counts(dropna=False)

positive    599
negative    599
Name: sentiment, dtype: int64

In [26]:
labeled_red_df["set"].value_counts(dropna=False)

train    840
test     358
Name: set, dtype: int64

# Building a folder with the headlines stored as text files

In [27]:
def build_folder(df, sent_col, text_col, set_col, labels_list = ["positive", "negative"]):

    root = "Data/financial"

    for set in ["train", "test"]:

        path = root + "/" + set

        if os.path.isdir(path):

            shutil.rmtree(path)
        
        for label in labels_list:

            path = path + "/" + label

            os.makedirs(path)

            path = root + "/" + set

    for row in tqdm(range(len(df))):

        text = df.iloc[row, text_col]
        sentiment = df.iloc[row, sent_col]
        set_label = df.iloc[row, set_col]

        file = open(f'Data/financial/{set_label}/{sentiment}/{row}.txt', "w")
        file.write(text)
        file.close()


In [14]:
build_folder(labeled_red_df, 0, 1, 2)

100%|██████████| 1198/1198 [00:00<00:00, 2853.42it/s]
