# Importing Modules

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import shutil
from sklearn.model_selection import train_test_split
pd.set_option("display.max_rows", 100)

# Functions

In [2]:
def rearrange_columns(df, dict = None):

    if dict is None:

        text = df["text"]
        sentiment = df["sentiment"]

    else:

        text = df[dict["text"]]
        sentiment = df[dict["sentiment"]]

    df = pd.concat([text, sentiment], axis=1)

    return df

In [3]:
def human_supervision(df, start, end):

    for row in range(start,end):

        text = df["text"][row]
        print(text)
        print("---------")
        sentiment = input("Is the sentiment of this headline postive or negative?")
        print("---------")
        
        if sentiment=="2":
            
            df.iloc[row, 3] = "positive"

        elif sentiment=="1":

            df.iloc[row, 3] = "negative"

        elif sentiment=="3":

            df.iloc[row, 3] = "discard"

        else:

            print("Wrong Input!")

    return df.iloc[start:end,:]

# Loading the raw data

In [4]:
cnbc_df = pd.read_csv("Data/csv/cnbc_headlines.csv")
guardian_df = pd.read_csv("Data/csv/guardian_headlines.csv")
reuters_df = pd.read_csv("Data/csv/reuters_headlines.csv")

In [5]:
cnbc_df = cnbc_df["Headlines"].rename({"Headlines": "text"}, axis=1)
guardian_df = guardian_df["Headlines"].rename({"Headlines": "text"}, axis=1)
reuters_df = reuters_df["Headlines"].rename({"Headlines": "text"}, axis=1)

In [6]:
labeled_df = pd.read_csv("Data/csv/all-data.csv", encoding = "ISO-8859-1")
labeled_df

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [7]:
labeled_df = rearrange_columns(labeled_df)

In [8]:
labeled_df

Unnamed: 0,text,sentiment
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive
...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,negative
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,neutral
4843,Operating profit fell to EUR 35.4 mn from EUR ...,negative
4844,Net sales of the Paper segment decreased to EU...,negative


# Cleaning the data - removing rows with not unicode characters

In [9]:
mask = []
n_errors = 0

for row in range(len(labeled_df)):

    text = labeled_df.iloc[row, 0]
    
    file = open(f'temp.txt', "w")

    try:

        file.write(text)
        mask.append(True)

    except UnicodeEncodeError:
        
        mask.append(False)
        n_errors += 1



In [10]:
print(n_errors)
print(len(mask))

74
4846


In [11]:
labeled_df = labeled_df[mask]
labeled_df

Unnamed: 0,text,sentiment
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive
...,...,...
4841,LONDON MarketWatch -- Share prices ended lower...,negative
4842,Rinkuskiai 's beer sales fell by 6.5 per cent ...,neutral
4843,Operating profit fell to EUR 35.4 mn from EUR ...,negative
4844,Net sales of the Paper segment decreased to EU...,negative


In [12]:
labeled_hl_df = labeled_df["text"]

In [13]:
financial_df = pd.concat([cnbc_df, guardian_df, reuters_df, labeled_hl_df], axis=0).reset_index(drop=True)

In [14]:
financial_df = pd.DataFrame(financial_df, columns=["text"]).dropna().reset_index(drop=True)

In [15]:
financial_df

Unnamed: 0,text
0,Jim Cramer: A better way to invest in the Covi...
1,Cramer's lightning round: I would own Teradyne
2,"Cramer's week ahead: Big week for earnings, ev..."
3,IQ Capital CEO Keith Bliss says tech and healt...
4,Wall Street delivered the 'kind of pullback I'...
...,...
58137,LONDON MarketWatch -- Share prices ended lower...
58138,Rinkuskiai 's beer sales fell by 6.5 per cent ...
58139,Operating profit fell to EUR 35.4 mn from EUR ...
58140,Net sales of the Paper segment decreased to EU...


In [16]:
financial_df.to_csv("Data/csv/fin_unlab.csv", index=False)

# Creating the large labelled dataset

In [17]:
labeled_df = labeled_df[(labeled_df["sentiment"]=="positive") | (labeled_df["sentiment"]=="negative")]
labeled_df


Unnamed: 0,text,sentiment
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive
5,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,positive
6,"For the last quarter of 2010 , Componenta 's n...",positive
...,...,...
4840,HELSINKI Thomson Financial - Shares in Cargote...,negative
4841,LONDON MarketWatch -- Share prices ended lower...,negative
4843,Operating profit fell to EUR 35.4 mn from EUR ...,negative
4844,Net sales of the Paper segment decreased to EU...,negative


In [18]:
train, test = train_test_split(labeled_df, test_size=0.2, stratify=labeled_df["sentiment"])

In [19]:
train["set"] = "train"
test["set"] = "test"

labeled_df = pd.concat([train, test], axis=0).reset_index(drop=True)
labeled_df


Unnamed: 0,text,sentiment,set
0,Operating cash flow after investments totalled...,negative,train
1,"TietoEnator was down 1.13 pct to 18.38 , exten...",negative,train
2,In Finland 's Hobby Hall 's sales decreased by...,negative,train
3,"Svyturys-Utenos Alus , which is controlled by ...",positive,train
4,`` Lining stone sales were also good in the ea...,positive,train
...,...,...,...
1940,The chain posted sales of 298 million euros fo...,positive,test
1941,Satama earned Data Management Solutions compet...,positive,test
1942,"Rinkuskiai raised the sales by 18.1 percent , ...",positive,test
1943,Earnings per share ( EPS ) in the first half o...,negative,test


In [20]:
labeled_df.to_csv("Data/csv/fin_lab_large.csv", index=False)

# Creating balanced train and test datasets

In [21]:
def build_balanced_ds(df, text_col, sent_col, list_labels=["positive", "negative"], ds_size=599*2, test_size=0.5, random_state=0):

    container = []
    label_size = int(ds_size/len(list_labels))
    split_index = int(np.floor(ds_size*test_size/len(list_labels)))

    for label in list_labels:

        label_df = (df[df.iloc[:, sent_col]==label]
                    .sample(label_size, random_state=random_state)
                    )

        label_test_df = label_df.iloc[0:split_index, :]
        label_test_df["set"] = "test"
        label_train_df = label_df.iloc[split_index:, :]
        label_train_df["set"] = "train"

        container.append(label_test_df)
        container.append(label_train_df)
        

    output_df = (pd.concat(container, axis=0)
                .sample(frac=1, random_state=random_state)
                .reset_index(drop=True)
                )

    return output_df

In [22]:
labeled_df["sentiment"].value_counts()

positive    1346
negative     599
Name: sentiment, dtype: int64

In [23]:
labeled_bal_df = build_balanced_ds(labeled_df, 0, 1, test_size=0.3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_test_df["set"] = "test"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_train_df["set"] = "train"


In [24]:
labeled_bal_df
labeled_bal_df.to_csv("Data/csv/fin_lab_bal.csv", index=False)

In [25]:
labeled_bal_df["sentiment"].value_counts(dropna=False)

positive    599
negative    599
Name: sentiment, dtype: int64

In [26]:
labeled_bal_df["set"].value_counts(dropna=False)

train    840
test     358
Name: set, dtype: int64

In [27]:
labeled_bal_df

Unnamed: 0,text,sentiment,set
0,The StoneGate UTM solution offers protection a...,positive,train
1,HELSINKI AFX - Outokumpu Technology said it ha...,positive,train
2,Finnish IT consultancy Satama Interactive Oyj ...,positive,test
3,HK Ruokatalo now accounts for about 50 % of po...,positive,train
4,Net cash flow from operating activities was a ...,negative,test
...,...,...,...
1193,Finnish Bank of +_land reports its operating p...,negative,train
1194,"( ADPnews ) - May 4 , 2010 - Finnish cutlery a...",negative,test
1195,Finnish developer and manufacturer of mobile p...,negative,train
1196,"Comptel , a vendor of dynamic Operations Suppo...",positive,train


# Building a folder with the headlines stored as text files

In [28]:
def build_folder(df, sent_col, text_col, set_col, labels_list = ["positive", "negative"]):

    root = "Data/financial"

    for set in ["train", "test"]:

        path = root + "/" + set

        if os.path.isdir(path):

            shutil.rmtree(path)
        
        for label in labels_list:

            path = path + "/" + label

            os.makedirs(path)

            path = root + "/" + set

    for row in tqdm(range(len(df))):

        text = df.iloc[row, text_col]
        sentiment = df.iloc[row, sent_col]
        set_label = df.iloc[row, set_col]

        file = open(f'Data/financial/{set_label}/{sentiment}/{row}.txt', "w")
        file.write(text)
        file.close()


In [29]:
build_folder(labeled_df, 1, 0, 2)

100%|██████████| 1945/1945 [00:00<00:00, 2403.57it/s]


# Human supervision on highly uncertainty headlines

## Reuters dataset

In [30]:
reuters_headlines_labelled_df = pd.read_csv("Data/csv/reuters_headlines_labeled.csv")

In [31]:
reuters_headlines_labelled_df

Unnamed: 0,text,prob_pos,prob_neg,certainty,sentiment
0,TikTok considers London and other locations fo...,0.814905,0.185095,0.629809,positive
1,Disney cuts ad spending on Facebook amid growi...,0.083292,0.916708,0.833416,negative
2,Trail of missing Wirecard executive leads to B...,0.957814,0.042186,0.915627,positive
3,Twitter says attackers downloaded data from up...,0.161916,0.838084,0.676169,negative
4,U.S. Republicans seek liability protections as...,0.325199,0.674801,0.349602,negative
...,...,...,...,...,...
32765,Malaysia says never hired British data firm at...,,,,
32766,Prosecutors search Volkswagen headquarters in ...,,,,
32767,McDonald's sets greenhouse gas reduction targets,,,,
32768,Pratt & Whitney to deliver spare A320neo engin...,,,,


In [32]:
top10_certain = reuters_headlines_labelled_df[['text', 'certainty', "sentiment"]].sort_values(by='certainty', ascending=False).head(10)

In [33]:
top10_certain = top10_certain.reset_index(drop=True)

In [34]:
(top10_certain["text"][1], top10_certain["sentiment"][1])

('Burberry aims to woo more customers in China with Tencent tie-up',
 'positive')

In [35]:
top_uncertain = reuters_headlines_labelled_df[['text', 'certainty', "sentiment"]].sort_values(by='certainty', ascending=True).head(200)

In [36]:
(reuters_headlines_labelled_df["text"][row], reuters_headlines_labelled_df["sentiment"][row], reuters_headlines_labelled_df["prob_pos"][row], reuters_headlines_labelled_df["prob_neg"][row])

('Walmart to limit number of customers at stores as virus crisis deepens',
 'negative',
 0.3913768529891968,
 0.6086231470108032)

In [37]:
top_uncertain["manual_sentiment"] = None
top_uncertain = top_uncertain.reset_index(drop=True)
top_uncertain

Unnamed: 0,text,certainty,sentiment,manual_sentiment
0,Boeing shares plummet as travel restrictions h...,0.000000,negative,
1,U.S. opposes massive liquidity IMF boost: Mnuchin,0.000092,negative,
2,Fiat Chrysler in talks over 6.3 billion euro s...,0.000153,positive,
3,Fiat Chrysler chairman sees Peugeot deal final...,0.000244,negative,
4,Saudi Arabia faces reality check as Wall Stree...,0.000244,negative,
...,...,...,...,...
195,GM stops paying for health insurance for strik...,0.017332,negative,
196,"U.S. Energy Department urges Saudi, Russia to ...",0.017393,positive,
197,"Nissan to resume production at Spanish plant, ...",0.017393,negative,
198,Venture firm Benchmark raises new fund without...,0.017393,positive,


In [38]:
def manual_labeling(df, start, end):

    for row in range(start,end):

        text = df["text"][row]
        print(text)
        print("---------")
        sentiment = input("Is the sentiment of this headline postive or negative?")
        print("---------")
        
        if sentiment=="2":
            
            df.iloc[row, 3] = "positive"

        elif sentiment=="1":

            df.iloc[row, 3] = "negative"

        else:

            print("Wrong Input!")

In [39]:
manual_labeling(top_uncertain,150,200)

DoubleLine CEO Jeffrey Gundlach warns Fed rate cuts will not stop U.S. recession
---------


KeyboardInterrupt: Interrupted by user

In [None]:
reuters_labelled_newbatch = (top_uncertain.drop(["sentiment", "certainty"], axis=1)
                            .rename({"manual_sentiment": "sentiment"}, axis=1)
                            .iloc[100:200,:]
                            )

In [None]:
reuters_labelled_newbatch

Unnamed: 0,text,sentiment
100,"'Am I going to get shot?' kids ask, as brands ...",negative
101,Credit Suisse revisits ex-U.S. employee's spyi...,negative
102,Indonesia readying measures to stabilize finan...,positive
103,Russia must press on with Nord Stream 2 gas li...,negative
104,Amazon and other large ecosystems at risk of r...,negative
105,Explainer: The jet subsidy row that threatens ...,negative
106,Here's what the new U.S. restrictions on Europ...,negative
107,Explainer: The jet subsidy row that threatens ...,negative
108,Hudson's Bay Co seeks to bolster Saks off-pric...,positive
109,U.S. Supreme Court clears way for pipeline to ...,positive


In [None]:
reuters_labelled = pd.concat([reuters_labelled,reuters_labelled_newbatch], axis=0)
reuters_labelled

Unnamed: 0,text,sentiment
0,Boeing shares plummet as travel restrictions h...,negative
1,U.S. opposes massive liquidity IMF boost: Mnuchin,negative
2,Fiat Chrysler in talks over 6.3 billion euro s...,positive
3,Fiat Chrysler chairman sees Peugeot deal final...,positive
4,Saudi Arabia faces reality check as Wall Stree...,negative
...,...,...
195,GM stops paying for health insurance for strik...,negative
196,"U.S. Energy Department urges Saudi, Russia to ...",negative
197,"Nissan to resume production at Spanish plant, ...",positive
198,Venture firm Benchmark raises new fund without...,positive


In [None]:
labeled_df = labeled_df.drop(["set"], axis=1)

In [None]:
labeled_df = pd.concat([labeled_df, reuters_labelled], axis=0)

In [None]:
labeled_df

Unnamed: 0,text,sentiment
0,The borrower was happy to do the roadshow and ...,positive
1,The court found TelecomInvest 's arguments con...,positive
2,"In addition to the Indian market , the new pla...",positive
3,"Operating profit totalled EUR 9.0 mn , down fr...",negative
4,Biohit and VWR have been in close cooperation ...,positive
...,...,...
195,GM stops paying for health insurance for strik...,negative
196,"U.S. Energy Department urges Saudi, Russia to ...",negative
197,"Nissan to resume production at Spanish plant, ...",positive
198,Venture firm Benchmark raises new fund without...,positive


In [None]:
train, test = train_test_split(labeled_df, test_size=0.2, stratify=labeled_df["sentiment"])

In [None]:
train["set"] = "train"
test["set"] = "test"

labeled_df = pd.concat([train, test], axis=0).reset_index(drop=True)
labeled_df


Unnamed: 0,text,sentiment,set
0,The loss for the third quarter of 2007 was EUR...,positive,train
1,The company said that it will supply the WCDMA...,positive,train
2,"( ADPnews ) - Oct 21 , 2009 - Finland-based IT...",negative,train
3,This is the first time Finnair has been awarde...,positive,train
4,"a January 11 , 2010 EPHC board of directors ha...",positive,train
...,...,...,...
2240,"mn , and pretax profit to EUR 46.4 mn from EUR...",positive,test
2241,The situation of coated magazine printing pape...,negative,test
2242,Exclusive: U.S. states plan Google antitrust m...,negative,test
2243,The first ship has been delayed and is estimat...,negative,test


In [None]:
labeled_df.to_csv("Data/financial/fin_labelled_large_reuters.csv", index=False)

## Guardian dataset

In [None]:
guardian_headlines_labelled_df = pd.read_csv("Data/csv/guardian_headlines_labeled.csv")

In [None]:
guardian_headlines_labelled_df

Unnamed: 0,text,prob_pos,prob_neg,certainty,sentiment
0,Johnson is asking Santa for a Christmas recovery,0.452906,0.547094,0.094187,negative
1,‘I now fear the worst’: four grim tales of wor...,0.182790,0.817210,0.634420,negative
2,Five key areas Sunak must tackle to serve up e...,0.650689,0.349311,0.301379,positive
3,Covid-19 leaves firms ‘fatally ill-prepared’ f...,0.634331,0.365669,0.268662,positive
4,The Week in Patriarchy \n\n\n Bacardi's 'lad...,0.974264,0.025736,0.948528,positive
...,...,...,...,...,...
17795,How investing in solar energy can create a bri...,0.987204,0.012797,0.974407,positive
17796,Poundland suppliers hit by insurance downgrade,0.068102,0.931898,0.863795,negative
17797,Cryptocurrencies: City watchdog to investigate...,0.964288,0.035712,0.928576,positive
17798,Unilever sells household name spreads to KKR f...,0.877687,0.122313,0.755373,positive


In [None]:
top10_certain = guardian_headlines_labelled_df[['text', 'certainty', "sentiment"]].sort_values(by='certainty', ascending=False).head(10)

In [None]:
top10_certain = top10_certain.reset_index(drop=True)

In [None]:
(top10_certain["text"][1], top10_certain["sentiment"][1])

('Developing a new way of creating more homes', 'positive')

In [None]:
top_uncertain = guardian_headlines_labelled_df[['text', 'certainty', "sentiment"]].sort_values(by='certainty', ascending=True).head(300).reset_index(drop=True)

In [None]:
row=1

In [None]:
(top_uncertain["text"][row], top_uncertain["sentiment"][row])

('Lockdown lifting: US exercise equipment sales soar amid pandemic',
 'positive')

In [None]:
top_uncertain["manual_sentiment"] = None
top_uncertain

Unnamed: 0,text,certainty,sentiment,manual_sentiment
0,The long read The George Soros philosophy – a...,0.000000,negative,
1,Lockdown lifting: US exercise equipment sales ...,0.000061,positive,
2,The Guardian view on His Dark Materials on the...,0.000099,positive,
3,EasyJet founder offers £5m reward as he seeks ...,0.000244,positive,
4,Business live Government won't hold inquiry i...,0.000305,negative,
...,...,...,...,...
295,Viewpoint column \n\n\n There's zero interes...,0.022945,negative,
296,US insurer Chubb to stop investing in or selli...,0.023037,positive,
297,Caledonian Sleeper suffers setbacks with Highl...,0.023067,positive,
298,Record numbers switch electricity supplier ami...,0.023098,negative,


In [None]:
batch1 = human_supervision(top_uncertain,100,250)

UK inflation hits six-month high as petrol and energy prices rise
---------
Is the sentiment of this headline postive or negative?1
---------
Harley-Davidson: Trump's tariffs pose grave threat to famous American name
---------
Is the sentiment of this headline postive or negative?1
---------
BT unveils new logo after years of work – its name in a circle
---------
Is the sentiment of this headline postive or negative?1
---------
'Body blow for area': Bridgend devastated by Ford factory closure
---------
Is the sentiment of this headline postive or negative?1
---------
UK living wage rises above inflation rate for 180,000 workers
---------
Is the sentiment of this headline postive or negative?2
---------
Fashion industry reels as coronavirus shuts shops and cancels orders
---------
Is the sentiment of this headline postive or negative?1
---------
Dominic Cummings accused of conflict of interest over NHS fund
---------
Is the sentiment of this headline postive or negative?1
---------
Debe

Is the sentiment of this headline postive or negative?3
---------
Money talks  


  Retirement – can you make your pension last at least 20 years?
---------
Is the sentiment of this headline postive or negative?3
---------
The race to find a coronavirus treatment has one major obstacle: big pharma
---------
Is the sentiment of this headline postive or negative?1
---------
World Bank boss calls for more transparent lending to poor nations
---------
Is the sentiment of this headline postive or negative?3
---------
Comic Relief's Richard Curtis backs ethical pension campaign
---------
Is the sentiment of this headline postive or negative?3
---------
No-deal Brexit threatens overseas territories
---------
Is the sentiment of this headline postive or negative?1
---------
Mexico could tighten migration controls to defuse Trump tariffs threat
---------
Is the sentiment of this headline postive or negative?3
---------
Freezing Britain and the chill wind of austerity
---------
Is the sentiment 

Is the sentiment of this headline postive or negative?1
---------
Nils Pratley on finance  


  It’s long past time to give auditors teeth
---------
Is the sentiment of this headline postive or negative?3
---------
Shoppers expected to spend more than £4bn on Boxing Day
---------
Is the sentiment of this headline postive or negative?1
---------
Cruise and Maritime Voyages in emergency talks after potential loan deal collapses
---------
Is the sentiment of this headline postive or negative?1
---------
Dixons Carphone CEO will step down to run chemist chain Boots
---------
Is the sentiment of this headline postive or negative?1
---------
Guardian Opinion cartoon  Ben Jennings on British Airways and data security – cartoon
---------
Is the sentiment of this headline postive or negative?3
---------
US jobs report: record-breaking streak continues as 225,000 added in January
---------
Is the sentiment of this headline postive or negative?2
---------
Gambling watchdog fails to curb fixed-odd

In [None]:
batch1_cleaned = (batch1.drop(["sentiment", "certainty"], axis=1)
                            .rename({"manual_sentiment": "sentiment"}, axis=1)
                 )

In [None]:
batch1_cleaned

Unnamed: 0,text,sentiment
100,UK inflation hits six-month high as petrol and...,negative
101,Harley-Davidson: Trump's tariffs pose grave th...,negative
102,BT unveils new logo after years of work – its ...,negative
103,'Body blow for area': Bridgend devastated by F...,negative
104,UK living wage rises above inflation rate for ...,positive
...,...,...
245,The panel \n\n\n 2020 budget: the Guardian p...,discard
246,Pressure grows for developing world debt relie...,positive
247,York shop bans cheesy Christmas music for sake...,discard
248,UK manufacturing sector ‘far larger than polit...,discard


In [None]:
labeled_df = labeled_df.drop(["set"], axis=1)

In [None]:
labeled_df

Unnamed: 0,text,sentiment
0,The company reported today an operating loss o...,negative
1,Global sports equipment maker Amer Sports Corp...,positive
2,`` The second quarter of 2010 was the firstqua...,positive
3,Net sales surged by 30 % to EUR 36 million .,positive
4,25 March 2011 - Finnish electronics contract m...,negative
...,...,...
1940,The rebuilds are designed to improve the machi...,positive
1941,Operating profit rose to EUR 4.7 mn from EUR 3...,positive
1942,"In January-September 2007 , Finnlines ' net sa...",positive
1943,"According to A-Rehu 's Managing Director , Jou...",positive


In [None]:
labeled_df = pd.concat([labeled_df, batch_cleaned, batch1_cleaned], axis=0)

In [None]:
labeled_df = labeled_df[labeled_df["sentiment"] != "discard"]
labeled_df

Unnamed: 0,text,sentiment
0,The company reported today an operating loss o...,negative
1,Global sports equipment maker Amer Sports Corp...,positive
2,`` The second quarter of 2010 was the firstqua...,positive
3,Net sales surged by 30 % to EUR 36 million .,positive
4,25 March 2011 - Finnish electronics contract m...,negative
...,...,...
240,"Here in Liverpool, we know what northern auste...",negative
241,UK needs 'biggest-ever peacetime job creation ...,positive
242,Business live Stock market turmoil wipes £56b...,negative
243,Economics viewpoint \n\n\n Saudi Arabia has ...,negative


In [None]:
train, test = train_test_split(labeled_df, test_size=0.2, stratify=labeled_df["sentiment"])

In [None]:
train["set"] = "train"
test["set"] = "test"

labeled_df = pd.concat([train, test], axis=0).reset_index(drop=True)
labeled_df


Unnamed: 0,text,sentiment,set
0,"Vaisala Oyj Press Release September 30 , 2010 ...",positive,train
1,"Operating profit totaled EUR 3.8 mn , down fro...",negative,train
2,The negotiation has resulted in the reduction ...,negative,train
3,Tekla will implement the renewal in software v...,positive,train
4,Publishing Sweden 's operating loss was EUR 1....,negative,train
...,...,...,...
2114,The company said that the fall in turnover had...,negative,test
2115,Consolidated operating profit from continuing ...,negative,test
2116,Incap Contract Manufacturing Services Private ...,positive,test
2117,Operating profit rose to EUR 4.7 mn from EUR 4...,positive,test


In [None]:
labeled_df.to_csv("Data/csv/fin_labelled_large_imp3.csv", index=False)

In [None]:
guardian_labelled_humansup_df = pd.concat([batch, batch1], axis=0)
guardian_labelled_humansup_df


Unnamed: 0,text,certainty,sentiment,manual_sentiment
0,The long read The George Soros philosophy – a...,0.000000,negative,discard
1,Lockdown lifting: US exercise equipment sales ...,0.000061,positive,negative
2,The Guardian view on His Dark Materials on the...,0.000099,positive,discard
3,EasyJet founder offers £5m reward as he seeks ...,0.000244,positive,negative
4,Business live Government won't hold inquiry i...,0.000305,negative,positive
...,...,...,...,...
245,The panel \n\n\n 2020 budget: the Guardian p...,0.018919,positive,discard
246,Pressure grows for developing world debt relie...,0.018919,positive,positive
247,York shop bans cheesy Christmas music for sake...,0.018964,negative,discard
248,UK manufacturing sector ‘far larger than polit...,0.019010,positive,discard


In [None]:
guardian_labelled_humansup_df.to_csv("Data/csv/guardian_headlines_labeled_humansup.csv", index=False)

In [41]:
labeled_df = pd.read_csv("Data/csv/fin_labelled_large_imp3.csv")

In [42]:
build_folder(labeled_df, 1,0,2)

100%|██████████| 2119/2119 [00:00<00:00, 2364.78it/s]
