<a href="https://colab.research.google.com/github/JonathanWalkerCS/Coursework/blob/main/nlp_bakingFinancialService.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h2 align="center">Sentiment Analysis in Banking and Financial Services</h2>
<h3 align="center">Natural Language Processing</h3>

In [None]:
!pip install keras

In [None]:
#Import required Libraries
import pandas as pd
import numpy as np

#Libraries to clean the data
import nltk
from nltk.corpus import stopwords

#library for data Visualization
import wordcloud
import matplotlib.pyplot as plt

#library and function for tokenization and vectorization
from tensorflow.keras.preprocessing.text import Tokenizer
#from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#from keras_nlp.tokenizers import TextVectorization
#from keras.preprocessing.sequence import pad_sequences

#Importing libraries to build our models
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from sklearn.metrics import classification_report, accuracy_score

<h3 align="center">1. Corpus</h3>

In [None]:
# Load the raw corpus from a CSV file in the notebook's working directory.
# The file is decoded as ISO-8859-1 (Latin-1).
# NOTE(review): read_csv treats the first CSV row as the header by default and
# the columns are renamed in a later cell -- confirm the file really has a
# header row, otherwise the first record is being consumed as column names.
DATA_PATH = "all-data.csv"
data = pd.read_csv(DATA_PATH, encoding="iso-8859-1")

In [None]:
# Preview the raw frame: display its first eight rows to see what the
# columns look like before any renaming or cleaning.
data.iloc[:8]

In [None]:
# Give the two columns descriptive names (label first, then the text)
# and display the first rows to confirm the new header.
column_names = ["sentiment", "news"]
data.columns = column_names
data.head()

In [None]:
# Print the first five news items (positions 0-4), each one starting on a
# fresh line after a newline separator.
for headline in data['news'].head(5):
    print("\n", headline)

<h3 align="center">2. Cleaning and Segmentation</h3>

In [None]:
# Normalize the news text in three passes before stop-word removal and tokenization.

# 1. Lowercase everything and collapse runs of whitespace into single spaces.
#    The vectorized .str accessor replaces the previous lambda, whose loop
#    variable shadowed its own parameter.
data['news'] = data['news'].str.lower().str.split().str.join(' ')

# 2. Replace every run of digits with a single space.
#    The pattern is a raw string so the backslash reaches the regex engine
#    untouched; a plain '\d' is an invalid Python escape sequence and warns.
data['news'] = data['news'].str.replace(r'\d+', ' ', regex=True)

# 3. Replace every remaining non-word character (punctuation, symbols) with a space.
#    `\W` does not match the underscore, so underscores are kept.
data['news'] = data['news'].str.replace(r'\W', ' ', regex=True)

# Preview the cleaned frame.
data.head()

In [None]:
#Download the NLTK 'stopwords' resource, which stopwords.words(...) reads in the next cell.
nltk.download('stopwords')

In [None]:
# Strip English stop words from every news item.
# The NLTK word list is converted to a set for fast membership checks.
stopwords_list = set(stopwords.words('english'))


def _without_stopwords(text):
    """Return `text` with every whitespace-separated stop word dropped."""
    kept_tokens = [token for token in text.split() if token not in stopwords_list]
    return ' '.join(kept_tokens)


# Overwrite the column with the filtered text, then preview the result.
data['news'] = data['news'].apply(_without_stopwords)
data.head()

In [None]:
# Print the first five news items again, now that they have been cleaned,
# so the effect of the preprocessing can be compared with the raw text.
for headline in data['news'].head(5):
    print("\n", headline)

In [None]:
# Draw a word cloud of the cleaned corpus: the larger a word is rendered,
# the more frequently it occurs across the news items.

# Concatenate every news item into one space-separated string.
common_words = " ".join(" ".join(str(item).split()) for item in data['news'])

# Show only a short preview; printing the whole corpus floods the cell output.
print(common_words[:500])

# Store the result under its own name. Rebinding `wordcloud` to the generated
# object would shadow the imported module and make this cell fail when re-run.
news_cloud = wordcloud.WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
    collocations=False,
).generate(common_words)

plt.imshow(news_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()




<h3 align="center">3-4. Tokenization and Vectorization</h3>

In [None]:
# Tokenize the corpus: fit a Keras Tokenizer on the cleaned news, then rewrite
# every news item as a list of integer word indices.
news_texts = data['news'].values
tokenizer = Tokenizer()  # must be instantiated before it can be fitted
tokenizer.fit_on_texts(news_texts)
X = tokenizer.texts_to_sequences(news_texts)

# Report the number of sequences and the length of the first one, then show
# the first five sequences.
summary_template = "number of rows:{}\nnumber of columns: {}"
print(summary_template.format(len(X), len(X[0])))
X[:5]

In [None]:
# The tokenized news items have different lengths; pad them so that every
# row ends up with the same number of entries.
X = pad_sequences(X)

# Report the size of the padded data and show the first five rows.
n_rows, n_columns = len(X), len(X[0])
print("number of rows:{}\nnumber of columns: {}".format(n_rows, n_columns))
X[:5]

<h1 align="center">5-6. Embedding and Building our Model</h1>

In [None]:
# Build the target matrix Y from the sentiment column.

# Encode the sentiment strings as integer class ids. The fitted encoder is kept
# (rather than discarded) so the ids can be mapped back to their sentiment
# names later through label_encoder.classes_.
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(data['sentiment'])

# One-hot encode the integer ids: one column per sentiment class.
Y = to_categorical(Y)


In [None]:
# Split the data into a training set (80%) and a held-out test set (20%).
#   X_train / y_train: padded news and one-hot labels used to fit the model
#   X_test  / y_test : padded news and one-hot labels used only for evaluation
# random_state pins the shuffle so the split, and therefore the reported
# metrics, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

print("Number of samples for training [news]:{}\nNumber of sample for training [Labels]:{}".format(X_train.shape,y_train.shape))
print("Number of samples for testing [news]:{}\nNumber of sample for testing [Labels]:{}".format(X_test.shape,y_test.shape))

In [None]:
# Define the classifier: embedding -> flatten -> two small hidden layers -> class scores.

# Sequence length is read from the padded data instead of being hard-coded,
# so it stays correct if the cleaning or the corpus changes.
n_features = X.shape[1]

# The embedding table needs one row per index the tokenizer can emit.
# Tokenizer indices start at 1 (0 is the padding value), hence the +1.
# A table smaller than the vocabulary makes the lookup go out of range.
vocab_size = len(tokenizer.word_index) + 1

# One output unit per sentiment class (the width of the one-hot labels).
n_classes = Y.shape[1]

model = Sequential()
# Map each word index to a 120-dimensional vector.
model.add(Embedding(vocab_size, 120, input_shape=(n_features,)))
# Flatten the (n_features, 120) embedding output into a single vector per sample.
model.add(Flatten())
model.add(Dense(10, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(8, activation='relu', kernel_initializer='he_normal'))
# Softmax yields a probability distribution over mutually exclusive classes,
# which is what categorical cross-entropy expects.
model.add(Dense(n_classes, activation='softmax'))


In [None]:
# Configure training: Adam optimizer, categorical cross-entropy as the
# classification loss, and accuracy as the metric reported during training.
# TODO: this section was marked as still in progress.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the training split for 20 epochs in mini-batches of 32 samples.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=20,
    verbose=2,
)

In [None]:
# Evaluate the trained model on the held-out test split.
# evaluate() must be given the data to score; with the model compiled with a
# single metric it returns [loss, accuracy].
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test Accuracy: %.3f' % acc)

In [None]:
# Score the test set, then reduce both the model output and the one-hot
# labels to integer class ids by taking the index of the largest value per row.
sentiment_prediction = model.predict(X_test)
predicted_inverse = sentiment_prediction.argmax(axis=1)
y_test_inverse = y_test.argmax(axis=1)

In [None]:
# Per-class precision / recall / F1 on the test set.
target_name = ["Class {}".format(i) for i in range(3)]

# `labels` is passed explicitly so the report always has one row per class,
# even if some class never appears in the test labels or the predictions;
# otherwise the number of target names could mismatch and raise an error.
print(
    classification_report(
        y_test_inverse,
        predicted_inverse,
        labels=list(range(3)),
        target_names=target_name,
    )
)