In [1]:
import os
import re
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
from collections import Counter

import tensorflow as tf

from tqdm import tqdm
tqdm.pandas()
# Other libraries we will install at the time of application

In [2]:
# Let's check the version for tf and if it's working eagerly
print(tf.__version__)

print(tf.executing_eagerly())

2.0.0
True


In [3]:
data = pd.read_csv('New_combined_data')
data

Unnamed: 0.1,Unnamed: 0,Flair,URL,Title,Comments,Selftext,ID,Combined
0,0,News,https://www.cnbc.com/2021/09/08/job-openings-s...,"Job openings soar to record 10.9 million, outn...",['Rule-breaking comments have reached an exces...,,0,"Job openings soar to record 10.9 million, outn..."
1,1,News,https://www.yahoo.com/finance/news/china-prope...,China’s property market runs out of steam as m...,"['[removed]', ""I'm not saying this is the same...",,0,China’s property market runs out of steam as m...
2,2,News,https://www.cnbc.com/2021/08/31/millions-of-am...,Millions of Americans will lose unemployment b...,"['Please keep all comments substantive, on top...",,0,Millions of Americans will lose unemployment b...
3,3,News,https://www.cbc.ca/news/business/lumber-prices...,Lumber crash leads to 'blowout' sales as price...,"[""I'm in a totally different field but you can...",,0,Lumber crash leads to 'blowout' sales as price...
4,4,News,https://nyti.ms/3jYqJgO,"Skilled Workers Are Scarce, Posing a Challenge...",['It looks like this post may have political c...,,0,"Skilled Workers Are Scarce, Posing a Challenge..."
...,...,...,...,...,...,...,...,...
3327,3327,Policy/Economy,https://www.livemint.com/news/india/mumbaiahme...,Mumbai-Ahmedabad bullet train: Railways launch...,[],,17,Mumbai-Ahmedabad bullet train: Railways launch...
3328,3328,Policy/Economy,https://timesofindia.indiatimes.com/city/kolka...,Supercar owners in Kolkata frustrated with fue...,['Supercar owners fristated with fuel prices? ...,,17,Supercar owners in Kolkata frustrated with fue...
3329,3329,Policy/Economy,https://www.deccanherald.com/national/dont-lec...,"Don't lecture India on freedom of speech, demo...","[""So they don't want to be answerable to anybo...",,17,"Don't lecture India on freedom of speech, demo..."
3330,3330,Policy/Economy,https://www.reddit.com/r/india/comments/o2ikfk...,Revealed: RTI Request uncovers MeitY letters t...,"['Holy shit. Amazing work you guys', 'It would...",\n\n### tl;dr\n\nReports emerged last month t...,17,Revealed: RTI Request uncovers MeitY letters t...


In [4]:
data.drop(columns= ['Unnamed: 0'],inplace = True)
data

Unnamed: 0,Flair,URL,Title,Comments,Selftext,ID,Combined
0,News,https://www.cnbc.com/2021/09/08/job-openings-s...,"Job openings soar to record 10.9 million, outn...",['Rule-breaking comments have reached an exces...,,0,"Job openings soar to record 10.9 million, outn..."
1,News,https://www.yahoo.com/finance/news/china-prope...,China’s property market runs out of steam as m...,"['[removed]', ""I'm not saying this is the same...",,0,China’s property market runs out of steam as m...
2,News,https://www.cnbc.com/2021/08/31/millions-of-am...,Millions of Americans will lose unemployment b...,"['Please keep all comments substantive, on top...",,0,Millions of Americans will lose unemployment b...
3,News,https://www.cbc.ca/news/business/lumber-prices...,Lumber crash leads to 'blowout' sales as price...,"[""I'm in a totally different field but you can...",,0,Lumber crash leads to 'blowout' sales as price...
4,News,https://nyti.ms/3jYqJgO,"Skilled Workers Are Scarce, Posing a Challenge...",['It looks like this post may have political c...,,0,"Skilled Workers Are Scarce, Posing a Challenge..."
...,...,...,...,...,...,...,...
3327,Policy/Economy,https://www.livemint.com/news/india/mumbaiahme...,Mumbai-Ahmedabad bullet train: Railways launch...,[],,17,Mumbai-Ahmedabad bullet train: Railways launch...
3328,Policy/Economy,https://timesofindia.indiatimes.com/city/kolka...,Supercar owners in Kolkata frustrated with fue...,['Supercar owners fristated with fuel prices? ...,,17,Supercar owners in Kolkata frustrated with fue...
3329,Policy/Economy,https://www.deccanherald.com/national/dont-lec...,"Don't lecture India on freedom of speech, demo...","[""So they don't want to be answerable to anybo...",,17,"Don't lecture India on freedom of speech, demo..."
3330,Policy/Economy,https://www.reddit.com/r/india/comments/o2ikfk...,Revealed: RTI Request uncovers MeitY letters t...,"['Holy shit. Amazing work you guys', 'It would...",\n\n### tl;dr\n\nReports emerged last month t...,17,Revealed: RTI Request uncovers MeitY letters t...


In [5]:
# Shuffled dataset
data = data.sample(frac=1).reset_index(drop = True)

# Let's define some variables
vocab_size = 5000
embedding_dim = 35
max_length = 350
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8
dropout = 0.5

In [6]:
def clean_text(x):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern,'',x)
    return x

In [7]:
data['Combined'] = data['Combined'].apply(lambda x: clean_text(x))

# Let's create our dictionary to seperate out some common words to avoid contradictions
# And some functions to replace these words from dictionary to our combined dataset
contradiction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", 
                      "could've": "could have", "couldn't": "could not", "didn't": "did not",  
                      "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", 
                      "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", 
                      "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                      "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
                      "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", 
                      "i'll": "i will",  "i'll've": "i will have", "i'm": "i am", "i've": "i have", 
                      "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", 
                      "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", 
                      "mayn't": "may not", "might've": "might have","mightn't": "might not",
                      "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", 
                      "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                      "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
                      "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", 
                      "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                      "she'll've": "she will have", "she's": "she is", "should've": "should have", 
                      "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", 
                      "so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", 
                      "that's": "that is", "there'd": "there would", "there'd've": "there would have", 
                      "there's": "there is", "here's": "here is", "they'd": "they would", 
                      "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", 
                      "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", 
                      "we'd": "we would", "we'd've": "we would have", "we'll": "we will", 
                      "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", 
                      "what'll": "what will", "what'll've": "what will have", "what're": "what are",  
                      "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have",
                      "where'd": "where did", "where's": "where is", "where've": "where have", 
                      "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", 
                      "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", 
                      "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                      "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                      "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                      "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                      "you'll've": "you will have", "you're": "you are", "you've": "you have"}

def get_contradictions(contradinction_dict):
    contradiction_re = re.compile('%s' % '|'.join(contradiction_dict.keys()))
    return contradiction_dict, contradiction_re

contradictions, contradictions_re = get_contradictions(contradiction_dict)

def replace_contradictions(text):
    def replace(match):
        return contradictions[match.group(0)]
    return contradictions_re.sub(replace,text)

data['Combined'] = data['Combined'].apply(lambda x: replace_contradictions(x))

from sklearn import preprocessing
labelencoder = preprocessing.LabelEncoder()
df_label = labelencoder.fit_transform(data['Flair'])

data

Unnamed: 0,Flair,URL,Title,Comments,Selftext,ID,Combined
0,Editorial,https://capx.co/consumers-will-pick-up-the-bil...,Consumers will pick up the bill for an ‘Amazon...,"['[deleted]', '> Business rates are levied acc...",,1,Consumers will pick up the bill for an ‘Amazon...
1,Credit,https://www.reddit.com/r/personalfinance/comme...,I missed a payment for the first time.... Was ...,['Call and ask to have it reversed. Most cred...,Hi all. About 2-3 months ago I made a small pa...,9,I missed a payment for the first time.... Was ...
2,Taxes,https://www.reddit.com/r/personalfinance/comme...,I moved last year and now both states are tryi...,"[""As an Iowa resident, Iowa taxes all of your ...","I lived in Nebraska up until 5/11/2020, then m...",11,I moved last year and now both states are tryi...
3,News,https://www.theguardian.com/us-news/2021/aug/1...,US judge throws out Trump-era approval for gia...,['It looks like this post may have political c...,,0,US judge throws out Trump-era approval for gia...
4,Employment,https://www.reddit.com/r/personalfinance/comme...,First job right out of college completely fres...,"[""don't use a short term index fund, those typ...",Hey! I just got a job while in college and hav...,12,First job right out of college completely fres...
...,...,...,...,...,...,...,...
3327,Investing,https://www.reddit.com/r/personalfinance/comme...,Just opened a Roth IRA with Fidelity. What is ...,"[""FSKAX/FZROX are the equivalent to VTSAX. \n\...","Hey guys,\n\nI'm a pretty much clueless 18 yea...",8,Just opened a Roth IRA with Fidelity. What is ...
3328,Politics,https://www.reddit.com/r/india/comments/p49ngp...,TIL Bihar has had two extra days of casual lea...,['Governments in India usually have good to ex...,Some area where Bihar is ahead of other states...,15,TIL Bihar has had two extra days of casual lea...
3329,Business/Finance,https://www.moneycontrol.com/news/technology/a...,Ford to close both India manufacturing plants ...,"['They sell gas guzzling machines, even in US ...",,16,Ford to close both India manufacturing plants ...
3330,Housing,https://www.reddit.com/r/personalfinance/comme...,Overwhelmed with profits from upcoming home sa...,"[""In my opinion you don't need a financial adv...",I'm a few weeks out from selling my townhome a...,7,Overwhelmed with profits from upcoming home sa...


In [8]:
# Let's make our splits
from sklearn.model_selection import train_test_split

train_size = int(len(data) * training_portion)

train_matrics = data['Combined'][0: train_size]
train_labels = df_label[0: train_size]

validation_matrics = data['Combined'][train_size:]
validation_labels = df_label[train_size:]

print(train_matrics)
print(train_labels)
print(validation_matrics)
print(validation_labels)

0       Consumers will pick up the bill for an ‘Amazon...
1       I missed a payment for the first time.... Was ...
2       I moved last year and now both states are tryi...
3       US judge throws out Trump-era approval for gia...
4       First job right out of college completely fres...
                              ...                        
2660    Should I max out my IRA this year? Look for so...
2661    Paying off Student Loans with 401k? I am not g...
2662    Karnataka government to object bringing fuel p...
2663    Tiny medical bill just sent to collections—can...
2664    Do I need to change my W4? Just recently marri...
Name: Combined, Length: 2665, dtype: object
[ 5  3 17 ... 12  3 17]
2665    Can someone please help me fill out this new W...
2666    Credit card for Excellent Credit Score but low...
2667    My dad is self employed but doesn’t put the ti...
2668    Not sure if I should sell uncle managed stock(...
2669    Budget Review - I appreciate any advice! [Budg...
    

Now, let's perform some preprocessing steps

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Let's run the tokenizer
tokenizer = Tokenizer(num_words = vocab_size, oov_token= oov_tok, filters= '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                     lower = True)
tokenizer.fit_on_texts(train_matrics)

word_index = tokenizer.word_index
dict(list(word_index.items())[0: 10])

{'<OOV>': 1,
 'the': 2,
 'to': 3,
 'a': 4,
 'you': 5,
 'and': 6,
 'i': 7,
 'is': 8,
 'of': 9,
 'in': 10}

Let's get those sequences

In [10]:
train_sequences = tokenizer.texts_to_sequences(train_matrics)
print(train_sequences[10])

train_padded = pad_sequences(train_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

[67, 142, 117, 64, 10, 277, 141, 7, 92, 4, 136, 29, 4, 154, 9, 735, 23, 4, 99, 117, 164, 13, 668, 579, 142, 14, 408, 14, 73, 2, 233, 9, 668, 579, 7, 18, 3, 3384, 1, 1, 7, 18, 535, 14, 2, 117, 21, 4, 154, 112, 22, 183, 1, 29, 4, 456, 9, 2, 1190, 2, 205, 5, 18, 3578, 37, 4665, 7, 68, 2197, 338, 44, 99, 278, 71, 11, 1, 155, 129, 507, 164, 21, 2, 354, 286, 98, 217, 2, 136, 354, 447, 105, 286, 1, 4605, 4, 351, 249, 154, 33, 99, 13, 244, 65, 15, 173, 98, 34, 22, 1, 4122, 103, 384, 5, 34, 233, 46, 135, 2347, 1, 260, 10, 117, 16, 338, 44, 129, 507, 9, 99, 170, 98, 11, 854, 4, 735, 154, 1388, 44, 1, 278, 98, 2, 137, 98, 1388, 22, 1321, 129, 507, 9, 99, 29, 1, 1, 37, 2, 137, 98, 2, 354, 9, 15, 154, 2844, 73, 351, 249, 1, 1, 1, 2, 524, 98, 15, 154, 8, 76, 1860, 1, 14, 98, 5, 35, 344, 35, 1321, 129, 507, 9, 99, 27, 76, 488, 14, 1428, 313, 337, 37, 24, 98, 62, 5, 44, 1, 5, 35, 1321, 1, 2211, 176, 117, 1, 2347, 37, 5, 44, 78, 1, 2211, 40, 9, 2, 81, 58, 5, 116, 23, 173, 129, 6, 37, 21, 6, 37, 21, 16,

In [11]:
validation_sequences = tokenizer.texts_to_sequences(validation_matrics)
validation_padded = pad_sequences(validation_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

In [12]:
print(train_matrics[10])

How does interest work in loans? Say I take a loan or a mortgage of 100k $ with a 2% interest rate for X period. Does that mean that by the end of X period, I have to repay 102k?

Cause I have read that the interest on a mortgage may be two thirds or a half of the principal (the amount you have borrowed). So naturally I am confused. ['You pay 2% per year. \n\nRealistically, it’s 1/12 * rate on the balance each month, since the loan balance goes down each month', 'On a $100,000 mortgage at 2% for 30 years your payment / month would be $369.62, which means you would end up paying $33,063.20 in interest.', 'You pay 1/12 of 2% every month. \n\nIn a $100K mortgage, you’d pay $369 per month. The first month, you’d be charged 1/12 of 2%, or $166.67.So the first month, the balance of your mortgage drops by $100,000-(369-166.67). The second month, your mortgage is now $99,797. That month you get again get charged 1/12 of 2%, but now against that slightly lower number. So this month, when you pa

### Let's dive into the model

In [13]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.compat.v1.keras.layers import CuDNNLSTM

In [14]:
pip install numpy==1.19.5


Note: you may need to restart the kernel to use updated packages.


In [15]:
model = tf.keras.Sequential([
    # An embedding layer is to be added expecting input as vocab_size and output embedding dimention as 64 (set above)
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dropout(dropout),
    # Let's use the ReLU function inplace of tanh since they are good alternatives for each other
    tf.keras.layers.Dense(embedding_dim, activation = 'relu'),
    # The softmax converts the output layers into probability distributions for multiple outputs 
    tf.keras.layers.Dense(len(labelencoder.classes_), activation = 'softmax')
])

2021-10-26 17:04:02.331515: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-26 17:04:02.332769: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [16]:
es = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', min_delta = 0.05, mode = 'min', patience=10, verbose=1)
mc = tf.keras.callbacks.ModelCheckpoint('best_model.h5', monitor = 'val_accuracy',mode = 'max',verbose = 1, save_best_only = True)

Compilation and running of model

In [17]:
model.compile(optimizer = 'nadam',loss = 'sparse_categorical_crossentropy',metrics = ['accuracy'])
num_epochs = 30
history = model.fit(train_padded, train_labels, epochs = num_epochs, validation_data = (validation_padded, validation_labels),
                   verbose= 2, callbacks = [es, mc], shuffle = True)

Train on 2665 samples, validate on 667 samples
Epoch 1/30


2021-10-26 17:04:05.227958: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference___backward_cudnn_lstm_with_fallback_4456_4638' and '__inference___backward_standard_lstm_5251_5736_specialized_for_StatefulPartitionedCall_at___inference_distributed_function_6706' both implement 'lstm_218ce0cc-6fb9-4b2f-b1e5-3d14d681e6d9' but their signatures do not match.
2021-10-26 17:05:14.116210: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_standard_lstm_7158_specialized_for_sequential_bidirectional_forward_lstm_StatefulPartitionedCall_at___inference_distributed_function_8000' and '__inference_cudnn_lstm_with_fallback_7269' both implement 'lstm_e0e0a1df-e554-4c43-8471-eb1925831721' but their signatures do not match.



Epoch 00001: val_accuracy improved from -inf to 0.12744, saving model to best_model.h5
2665/2665 - 78s - loss: 2.7880 - accuracy: 0.0953 - val_loss: 2.5466 - val_accuracy: 0.1274
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.12744 to 0.16042, saving model to best_model.h5
2665/2665 - 72s - loss: 2.3478 - accuracy: 0.1625 - val_loss: 2.1582 - val_accuracy: 0.1604
Epoch 3/30

Epoch 00003: val_accuracy improved from 0.16042 to 0.20840, saving model to best_model.h5
2665/2665 - 70s - loss: 2.0919 - accuracy: 0.2068 - val_loss: 2.2489 - val_accuracy: 0.2084
Epoch 4/30

Epoch 00004: val_accuracy improved from 0.20840 to 0.24588, saving model to best_model.h5
2665/2665 - 84s - loss: 1.9702 - accuracy: 0.2615 - val_loss: 2.0544 - val_accuracy: 0.2459
Epoch 5/30

Epoch 00005: val_accuracy improved from 0.24588 to 0.27736, saving model to best_model.h5
2665/2665 - 76s - loss: 1.7652 - accuracy: 0.3174 - val_loss: 1.9738 - val_accuracy: 0.2774
Epoch 6/30

Epoch 00006: val_accuracy did no

In [18]:
#saved_model = tf.keras.models.load_model('best_model.h5')

# Let's evaluate the model
_, train_acc = model.evaluate(train_padded, train_labels, verbose=0)
_, test_acc = model.evaluate(validation_padded, validation_labels, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

Train: 0.826, Test: 0.423


The model we trained is overfitting. But since the dynamicity of the unknown data we will cross path with would be very challenging to test and according to our personal preference it would be better for a model to be trained using deep learning than to use simple ML models, we will go with this choice.  

Now, to create the flask app we will use the joblib library of python since it provides utilities for saving and loading Python objects that use Numpy arrays,etc. efficiently.

In [19]:
# Importing library
import joblib

# Saving the label encoder
joblib.dump(labelencoder, 'label_encoder.joblib')

# Saving the tokenizer
joblib.dump(tokenizer, 'tokenizer.joblib')

['tokenizer.joblib']