In [12]:
import pandas as pd
import numpy as np

#Below libraries will be used for data preparation
import nltk, re, string
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

In [23]:
#prepare original dataset and make it balanced
df = pd.read_csv("finbank_data.csv")

# downsampling positive and neutral sentences to 2000, in order to get a balanced dataset
df_pos = df[df.label=='positive'].head(2000)
df_neu = df[df.label=='neutral'].head(2000)
df_neg = df[df.label=='negative']

df_train = pd.concat([df_pos.head(1600), df_neu.head(1600), df_neg.head(1473)])
#df_val = pd.concat([df_pos.loc[1600:1800], df_neu.loc[1600:1800], df_neg.loc[1600:1800]])
df_val = pd.concat([df_pos.head(1800).tail(200), df_neu.head(1800).tail(200), df_neg.head(1657).tail(184)])
df_test = pd.concat([df_pos.tail(200), df_neu.tail(200), df_neg.tail(184)])

df_train = df_train.reindex(np.random.permutation(df_train.index))
df_val = df_val.reindex(np.random.permutation(df_val.index))
df_test = df_test.reindex(np.random.permutation(df_test.index))

print(df_train)
print(df_val)
print(df_test)


df_train.to_csv("finbank_data_train.csv", index=False)
df_val.to_csv("finbank_data_val.csv", index=False)
df_test.to_csv("finbank_data_test.csv", index=False)

# concatenating the datasets and shuffle
#df_final = pd.concat([df_pos,df_neg],axis=0)
#df_final = pd.concat([df_final,df_neu],axis=0)
#df_final = df_final.reindex(np.random.permutation(df_final.index))

#take a look at the initial table
#df_final

                                                   text     label
577   Cost cutting measures , which have produced ar...  positive
9388  The Finnish national carrier said net loss in ...  negative
163   In total , more than 3000 surveillance cameras...   neutral
1580  Rohwedder Group is an automotive supplies , te...   neutral
2653  The Russian gas giant invested another 46 mill...   neutral
...                                                 ...       ...
748         The identity of the buyer is not yet known    neutral
1252  Finnish Bore that is owned by the Rettig famil...  positive
1945  Also , a six-year historic analysis is provide...   neutral
3850  Shareholders of Rakvere Lihakombinaat decided ...  positive
37    In addition , the Kazakh delegation will visit...   neutral

[4673 rows x 2 columns]
                                                    text     label
6484   It is in line with its plans to focus on selec...  positive
12378  Earnings per share ( EPS ) amounted to a l

Unnamed: 0,text,label
739,In the Baltic states the company reports net s...,negative
6965,This new deal has strengthened the partnership...,positive
3133,The deal includes an option for Cramo to buy o...,neutral
1071,These share transactions are part of the compa...,neutral
2512,"Raute , headquartered in Nastola , Finland , i...",neutral
...,...,...
1373,"As an alternative to the share exchange , Pano...",neutral
90,The mall will be financed on a parity basis by...,neutral
1552,CDP was established on the initiative of insti...,neutral
700,F-Secure Internet Security 2010 is a security ...,neutral


In [30]:
#remove stop words, perform lemmatization and remove symbols from the texts
lemma = WordNetLemmatizer()
stop = stopwords.words('english')
def Text_clean(txt):
    txt = txt.lower()
    words = nltk.word_tokenize(txt)
    words = ' '.join([lemma.lemmatize(word) for word in words if word not in stop])
    words_joined = "".join(words)
    text = re.sub('[^a-z]',' ',words_joined) #remove symbols 
    return text  

#apply Text_clean to the text column in the table, and create a new column "Text_clean"
df_final['text_clean'] = df_final['text'].apply(Text_clean)

In [31]:
#transform the labels to numerical format
le = preprocessing.LabelEncoder()
df_final['label_number'] = le.fit_transform(df_final['label'])
df_final

Unnamed: 0,text,label,text_clean,label_number
3086,"In 2009 , Fiskars ' cash flow from operating a...",positive,fiskars cash flow operating activity ...,2
14608,"Finnish Scanfil , a systems supplier and contr...",negative,finnish scanfil system supplier contract man...,0
916,Finnish OKO bank has signed a cooperation agre...,positive,finnish oko bank signed cooperation agreement ...,2
2271,"In recent months , Capman has taken significan...",positive,recent month capman taken significant step s...,2
2011,"At 10.33 am , Huhtamaki was the market 's bigg...",negative,huhtamaki market s biggest faller ...,0
...,...,...,...,...
113,"The original name Componenta +�m+�l , as a sub...",positive,original name componenta m l subsidiary f...,2
5423,"Operating profit totalled EUR 83.0 mn , up fro...",positive,operating profit totalled eur mn eur ...,2
2999,Net profit was 35.5 mln compared with 29.8 mln,positive,net profit mln compared mln,2
1886,According to Finnish pension insurance company...,positive,according finnish pension insurance company va...,2


In [33]:
#prepare feature(x) and target(y) for machine learning model

#define x and y 
x = df_final.text_clean
y=df_final.label_number

#split data to train, test and valid
from sklearn.model_selection import train_test_split

train_ratio = 0.75
val_ratio = 0.15
test_ratio = 0.10

#set random state as none for now
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=val_ratio)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + val_ratio)) 

#check the size of train, validation and test
print(len(x_train), len(x_val), len(x_test)) 

#check if the size of each label is balanced
print(list(y_train).count(0), list(y_train).count(1), list(y_train).count(2))
print(list(y_val).count(0), list(y_val).count(1), list(y_val).count(2))
print(list(y_test).count(0), list(y_test).count(1), list(y_test).count(2))

4964 526 351
1540 1690 1734
173 175 178
128 135 88


In [34]:
#use bag of words method to vectorize the texts
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

x_train = vectorizer.transform(x_train)
x_val = vectorizer.transform(x_val)
x_test  = vectorizer.transform(x_test)

x_train.toarray() #check the result


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)