## Steps to build the next word recommender system

1. Loading and exploring the dataset
2. Creating N-grams of the dialogue
3. Building the N-gram Language Model
4. Predicting the next word using N-gram Language Model

## 1. Loading and exploring the dataset

In [1]:
# loading the required libraries
import pandas as pd
import numpy as np
import re
import random

In [4]:
df = pd.read_csv('sample_reuters_dataset.csv',usecols=['sentence_text'])
df.shape

(10000, 1)

In [5]:
df.head()

Unnamed: 0,sentence_text
0,ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPA...
1,They told Reuter correspondents in Asian capit...
2,But some exporters said that while the conflic...
3,The U . S . Has said it will impose 300 mln dl...
4,Unofficial Japanese estimates put the impact o...


In [9]:
text_list = df['sentence_text'].tolist()

In [13]:
len(text_list)

10000

In [56]:
random.sample(text_list,5)

['He also said Monier will not now proceed with the one - for - two bonus issue announced with its interim results on March 19 in view of the proposed takeover bids .',
 'The House was to vote today on a bill -- supported by the Democratic and Republican congressional leadership and the administration -- which demanded a report within seven days on plans to meet the security needs of U . S . forces in the gulf .',
 'NATIONAL DISTILLERS & lt ; DR > TO SELL SPIRITS UNIT National Distillers and Chemical Corp said it signed a definitive agreement to sell its spirits division for 545 mln dlrs to James Beam Distilling Co , a unit of American Brands Inc & lt ; AMB >.',
 'CANAM MANAC WINS 8 . 5 MLN DLR CONTRACT ( The Canam Manac Group Inc ) said its Canam Steel Works unit received a contract valued at 8 . 5 mln dlrs to supply steel trusses to ( Canron Inc ) for a new car plant in Ingersoll , Ontario .',
 'CANANDAIGUA WINE CO INC & lt ; CDG . A > 2ND QTR NET Qtr ended Feb 28 Shr 35 cts vs 38 ct

In [36]:
# text cleaning
df_clean = []

for i in text_list:
    # remove everything except alphabets, ' and white spaces
    i = re.sub("[^a-zA-Z' ]", "", i)
    i = re.sub("  ", " ", i)
    i = re.sub(" ' ", "'", i)
    # convert text to lowercase
    i = i.lower()
    # add cleaned text to the list
    df_clean.append(i)

In [38]:
random.sample(df_clean, 5)

['in its semi annual review of the world economy the oecd forecast that growth in the french gross domestic product gdp would run at about two pct in the next six months ',
 'the congressional budget office cbo the nonpartisan budget analysis arm of congress said federal loans or loan guarantees would be preferable options for congress rather than increased trade protection which could lead to foreign retaliation ',
 'stocks on may   at   vs  last month ',
 'diamond shamrock raises crude oil posted prices cts a bbl effective yesterday wti to dlrs ',
 "ncr's full year earnings rose to  mln dlrs from  mln dlrs in the prior year "]

In [39]:
# creating the vocabulary
# get list of all the words
all_words = " ".join(df_clean).split()

words_dict = {}

# add word-count pair to the dictionary
for word in all_words:   
    # check if the word is already in dictionary 
    if word in words_dict:
        # increment count of word by 1 
        words_dict[word] = words_dict[word] + 1
    else:
        # add the word to dictionary with count 1 
        words_dict[word] = 1

In [40]:
words_dict

{'asian': 13,
 'exporters': 49,
 'fear': 8,
 'damage': 29,
 'from': 1368,
 'u': 1116,
 's': 1097,
 'japan': 363,
 'rift': 1,
 'mounting': 5,
 'trade': 548,
 'friction': 8,
 'between': 191,
 'the': 12495,
 'and': 4598,
 'has': 974,
 'raised': 70,
 'fears': 13,
 'among': 44,
 'many': 54,
 'of': 6670,
 "asia's": 1,
 'exporting': 12,
 'nations': 68,
 'that': 1367,
 'row': 3,
 'could': 291,
 'inflict': 1,
 'far': 55,
 'reaching': 7,
 'economic': 244,
 'businessmen': 15,
 'officials': 190,
 'said': 4649,
 'they': 502,
 'told': 237,
 'reuter': 27,
 'correspondents': 3,
 'in': 5069,
 'capitals': 3,
 'a': 4409,
 'move': 101,
 'against': 270,
 'might': 59,
 'boost': 45,
 'protectionist': 22,
 'sentiment': 10,
 'lead': 96,
 'to': 6337,
 'curbs': 12,
 'on': 1642,
 'american': 126,
 'imports': 242,
 'their': 230,
 'products': 199,
 'but': 650,
 'some': 278,
 'while': 164,
 'conflict': 3,
 'would': 926,
 'hurt': 11,
 'them': 58,
 'long': 118,
 'run': 20,
 'short': 87,
 'term': 120,
 "tokyo's": 5,
 '

In [41]:
# prepare a dataframe
words_df = pd.DataFrame({'word':list(words_dict.keys()), 'count':list(words_dict.values())})

# sort words by their count in increasing order
words_df = words_df.sort_values(by = ['count'])

# reset dataframe index
words_df.reset_index(inplace = True, drop=True)

In [42]:
# vocab size
len(words_df)

13176

In [43]:
words_df.head()

Unnamed: 0,word,count
0,sb,1
1,jiangsu,1
2,anhui,1
3,sichuan,1
4,shanxi,1


In [44]:
words_df.tail()

Unnamed: 0,word,count
13171,said,4649
13172,in,5069
13173,to,6337
13174,of,6670
13175,the,12495


## 2. Creating N-grams of the dialogue

In [45]:
# creating an empty dataframe
dataset = pd.DataFrame()
dataset['Sentences'] = df_clean
dataset.head()

Unnamed: 0,Sentences
0,asian exporters fear damage from u s japan rif...
1,they told reuter correspondents in asian capit...
2,but some exporters said that while the conflic...
3,the u s has said it will impose mln dlrs of ta...
4,unofficial japanese estimates put the impact o...


In [46]:
dataset.shape

(10000, 1)

In [48]:
def create_unigram(sentence):
    tokens = sentence.split()
    unigram_list = []
    for i in range(len(tokens)):
        unigram_list.append(tokens[i:i+1])   
    return unigram_list

In [58]:
def create_bigram(sentence):
    tokens = sentence.split()
    bigram_list = []
    for i in range(len(tokens)-1):
        bigram_list.append(tokens[i:i+2])
    return bigram_list

In [59]:
def create_trigram(sentence):
    tokens = sentence.split()
    trigram_list = []
    for i in range(len(tokens)-2):
        trigram_list.append(tokens[i:i+3])
    return trigram_list

In [50]:
final_unigram = []
for i in range(dataset.shape[0]):
    final_unigram.append(create_unigram(dataset['Sentences'][i]))

dataset['unigram'] = final_unigram

In [60]:
final_bigram = []
for i in range(dataset.shape[0]):
    final_bigram.append(create_bigram(dataset['Sentences'][i]))

dataset['bigram'] = final_bigram

In [61]:
final_trigram = []
for i in range(dataset.shape[0]):
    final_trigram.append(create_trigram(dataset['Sentences'][i]))

dataset['trigram'] = final_trigram

In [62]:
dataset.head()

Unnamed: 0,Sentences,unigram,bigram,trigram
0,asian exporters fear damage from u s japan rif...,"[[asian], [exporters], [fear], [damage], [from...","[[asian, exporters], [exporters, fear], [fear,...","[[asian, exporters, fear], [exporters, fear, d..."
1,they told reuter correspondents in asian capit...,"[[they], [told], [reuter], [correspondents], [...","[[they, told], [told, reuter], [reuter, corres...","[[they, told, reuter], [told, reuter, correspo..."
2,but some exporters said that while the conflic...,"[[but], [some], [exporters], [said], [that], [...","[[but, some], [some, exporters], [exporters, s...","[[but, some, exporters], [some, exporters, sai..."
3,the u s has said it will impose mln dlrs of ta...,"[[the], [u], [s], [has], [said], [it], [will],...","[[the, u], [u, s], [s, has], [has, said], [sai...","[[the, u, s], [u, s, has], [s, has, said], [ha..."
4,unofficial japanese estimates put the impact o...,"[[unofficial], [japanese], [estimates], [put],...","[[unofficial, japanese], [japanese, estimates]...","[[unofficial, japanese, estimates], [japanese,..."


## 3. Building the N-gram Language Model

In [63]:
from collections import Counter, defaultdict
model = defaultdict(lambda: defaultdict(lambda: 0))
for i in range(dataset.shape[0]):
    for w1, w2, w3 in create_trigram(dataset['Sentences'][i]):
        # count the occurance of word 3, given word 1 and word 2
        model[(w1, w2)][w3] += 1

## 4. Predicting the next word using N-gram Language Model

In [65]:
dict(model["semi", "annual"])

{'meetings': 2, 'div': 1, 'review': 3, 'survey': 1, 'report': 2}

In [68]:
dict(model["plans", "to"])

{'increase': 3,
 'sell': 7,
 'dispose': 1,
 'follow': 1,
 'diversify': 1,
 'monitor': 3,
 'close': 1,
 'deregulate': 1,
 'maintain': 1,
 'issue': 3,
 'change': 2,
 'invest': 2,
 'operate': 2,
 'start': 1,
 'slap': 1,
 'cut': 2,
 'privatise': 1,
 'bring': 1,
 'spend': 1,
 'apply': 1,
 'seek': 5,
 'merge': 2,
 'ask': 1,
 'offer': 1,
 'take': 1,
 'continue': 1,
 'drill': 1,
 'recover': 1,
 'discuss': 1,
 'protect': 2,
 'buy': 3,
 'raise': 4,
 'meet': 1,
 'leave': 1,
 'let': 1,
 'put': 1,
 'assure': 1,
 'again': 1,
 'restructure': 1,
 'pay': 2,
 'engage': 1,
 'restart': 2,
 'renew': 1,
 'partially': 1}

### Probabilistic Output

In [69]:
# creating the unigram list
unigram_dict = {}
for i in range(dataset.shape[0]):
    # add word-count pair to the dictionary
    for word in dataset['unigram'][i]:   
        # check if the word is already in dictionary 
        if word[0] in unigram_dict:
            # increment count of word by 1 
            unigram_dict[word[0]] = unigram_dict[word[0]] + 1
        else:
            # add the word to dictionary with count 1 
            unigram_dict[word[0]] = 1

In [70]:
# transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

In [71]:
dict(model["semi", "annual"])

{'meetings': 0.2222222222222222,
 'div': 0.1111111111111111,
 'review': 0.3333333333333333,
 'survey': 0.1111111111111111,
 'report': 0.2222222222222222}

In [72]:
dict(model["plans", "to"])

{'increase': 0.0410958904109589,
 'sell': 0.0958904109589041,
 'dispose': 0.0136986301369863,
 'follow': 0.0136986301369863,
 'diversify': 0.0136986301369863,
 'monitor': 0.0410958904109589,
 'close': 0.0136986301369863,
 'deregulate': 0.0136986301369863,
 'maintain': 0.0136986301369863,
 'issue': 0.0410958904109589,
 'change': 0.0273972602739726,
 'invest': 0.0273972602739726,
 'operate': 0.0273972602739726,
 'start': 0.0136986301369863,
 'slap': 0.0136986301369863,
 'cut': 0.0273972602739726,
 'privatise': 0.0136986301369863,
 'bring': 0.0136986301369863,
 'spend': 0.0136986301369863,
 'apply': 0.0136986301369863,
 'seek': 0.0684931506849315,
 'merge': 0.0273972602739726,
 'ask': 0.0136986301369863,
 'offer': 0.0136986301369863,
 'take': 0.0136986301369863,
 'continue': 0.0136986301369863,
 'drill': 0.0136986301369863,
 'recover': 0.0136986301369863,
 'discuss': 0.0136986301369863,
 'protect': 0.0273972602739726,
 'buy': 0.0410958904109589,
 'raise': 0.0547945205479452,
 'meet': 0.01