## Read first 10000 rows from json file

In [1]:
import json
import pandas as pd
def read_json_nrows(nrows,filename):
    n= 0
    with open(filename) as f:
        while n < nrows:
            if n == 0:
                line = f.readline()
                line = json.loads(line.rstrip())
                train = pd.DataFrame(line,index = [0])
            else:
                temp = pd.DataFrame(json.loads(f.readline().rstrip()),index = [n])
                train = train.append(temp)
            n = n+1
    return train

In [2]:
train_10000 = read_json_nrows(10000,'review_train.json')

In [3]:
train_10000.head(5)

Unnamed: 0,business_id,stars,text,date
0,31292,1.0,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36
1,35344,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,2017-01-14 21:30:33
2,152538,5.0,I have to say that this office really has it t...,2016-11-09 20:09:03
3,71871,5.0,Went in for a lunch. Steak sandwich was delici...,2018-01-09 20:56:38
4,64913,1.0,Today was my second out of three sessions I ha...,2018-01-30 23:07:38


## Cleaning data

In [4]:
from collections import Counter
from nltk.util import ngrams
import re
import numpy as np 
from nltk.corpus import words
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize

### Languages

When I first tried to apply the *detect* function I got a 'No features in text' error. So I have to find out which review is empty.

In [5]:
train_10000.loc[6687]

business_id                 137207
stars                            1
text                            :(
date           2017-07-18 20:31:03
Name: 6687, dtype: object

This review is not language, but an emoticon.

In [6]:
def not_language(text):
    # First delete all common emoticons.
    text = re.sub('(?::|;|=)(?:-)?(?:\)|\(|D|P)','',text)
    if re.sub('[\W]+','',text) == '':
        return True
    else:
        return False

For simplicity, just consider emoticons as English.

In [7]:
not_lang = train_10000[train_10000.text.apply(not_language)].index.values

In [8]:
train_10000.loc[not_lang,'lang_type'] = 'english'

from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
languages_ratios = {}
for i in range(10000):
    tokens = wordpunct_tokenize(train_10000.text[i])
    words = [word.lower() for word in tokens]
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    train_10000.loc[i,'lang_type'] = most_rated_language

In [9]:
from langdetect import detect
for i in range(10000):
    if i in not_lang:
        continue
    else:
        train_10000.loc[i,'lang_type'] = detect(train_10000.text[i])

In [10]:
re.findall(':[\W]{0,1}',':(nmsl')

[':(']

In [11]:
train_10000.lang_type.value_counts()

en         9956
fr           37
es            3
english       1
it            1
ja            1
de            1
Name: lang_type, dtype: int64

Focus on English only at present.

In [12]:
train_10000_eng = train_10000[train_10000.lang_type == 'en']

Most common words and phrases.

In [13]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 1)]
Counter(text_trigrams).most_common(20)

[(('the',), 43484),
 (('and',), 36445),
 (('I',), 27219),
 (('a',), 26196),
 (('to',), 25807),
 (('was',), 18766),
 (('of',), 15249),
 (('is',), 12814),
 (('for',), 12221),
 (('in',), 11026),
 (('it',), 9188),
 (('The',), 9180),
 (('with',), 8640),
 (('my',), 8424),
 (('that',), 8146),
 (('but',), 7072),
 (('on',), 7003),
 (('have',), 6583),
 (('you',), 6540),
 (('this',), 6388)]

In [14]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 2)]
Counter(text_trigrams).most_common(20)

[(('of', 'the'), 3443),
 (('and', 'the'), 2772),
 (('in', 'the'), 2722),
 (('it', 'was'), 2336),
 (('I', 'was'), 2307),
 (('on', 'the'), 2107),
 (('and', 'I'), 2017),
 (('to', 'the'), 1836),
 (('for', 'the'), 1732),
 (('for', 'a'), 1732),
 (('I', 'had'), 1559),
 (('I', 'have'), 1490),
 (('is', 'a'), 1378),
 (('to', 'be'), 1358),
 (('was', 'a'), 1313),
 (('this', 'place'), 1278),
 (('with', 'the'), 1262),
 (('at', 'the'), 1245),
 (('with', 'a'), 1216),
 (('to', 'get'), 1206)]

In [15]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 3)]
Counter(text_trigrams).most_common(20)

[(('and', 'it', 'was'), 516),
 (('one', 'of', 'the'), 451),
 (('a', 'lot', 'of'), 392),
 (('I', 'had', 'the'), 338),
 (('This', 'place', 'is'), 316),
 (('I', 'have', 'been'), 265),
 (('I', 'had', 'to'), 262),
 (('the', 'food', 'was'), 254),
 (('I', 'ordered', 'the'), 247),
 (('it', 'was', 'a'), 244),
 (('of', 'the', 'best'), 239),
 (('The', 'food', 'was'), 237),
 (('I', 'had', 'a'), 236),
 (('some', 'of', 'the'), 206),
 (('The', 'food', 'is'), 199),
 (('this', 'place', 'is'), 198),
 (('The', 'service', 'was'), 187),
 (('the', 'food', 'is'), 187),
 (('to', 'get', 'a'), 182),
 (('This', 'is', 'a'), 181)]

In [16]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 4)]
Counter(text_trigrams).most_common(20)

[(('one', 'of', 'the', 'best'), 126),
 (('My', 'husband', 'and', 'I'), 88),
 (('I', 'will', 'definitely', 'be'), 77),
 (('is', 'one', 'of', 'the'), 73),
 (('the', 'end', 'of', 'the'), 67),
 (('in', 'the', 'middle', 'of'), 65),
 (('for', 'the', 'first', 'time'), 65),
 (('a', 'great', 'place', 'to'), 62),
 (('some', 'of', 'the', 'best'), 60),
 (('and', 'the', 'service', 'was'), 56),
 (('you', 'are', 'looking', 'for'), 55),
 (('was', 'one', 'of', 'the'), 54),
 (('and', 'the', 'food', 'was'), 53),
 (('I', 'have', 'to', 'say'), 52),
 (('My', 'wife', 'and', 'I'), 50),
 (('and', 'the', 'food', 'is'), 50),
 (('was', 'my', 'first', 'time'), 49),
 (('one', 'of', 'my', 'favorite'), 49),
 (('the', 'rest', 'of', 'the'), 48),
 (('I', "can't", 'wait', 'to'), 47)]

Oddest words

In [17]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 1)]
Counter(text_trigrams).most_common()[-500:]

[(('qualified,',), 1),
 (('exuded',), 1),
 (("'cool'",), 1),
 (('Danny.',), 1),
 (('accord',), 1),
 (('b-day.',), 1),
 (('Terroni.',), 1),
 (('raptor',), 1),
 (('flats',), 1),
 (('gripe...no',), 1),
 (('GPS?',), 1),
 (('yardage',), 1),
 (('caliber,',), 1),
 (('troon',), 1),
 (('Quintero',), 1),
 (('parking....my',), 1),
 (('pefect',), 1),
 (('Dewy',), 1),
 (('appoinment,',), 1),
 (('in...)',), 1),
 (('HVAC/Electrical/etc...',), 1),
 (('electrical....(I',), 1),
 (('NHW',), 1),
 (('compressor',), 1),
 (('verified',), 1),
 (('TUESDAY',), 1),
 (('compressor....',), 1),
 (('occasions....',), 1),
 (('SH!*',), 1),
 (('Moe',), 1),
 (('matted',), 1),
 (('Choosing',), 1),
 (('changing,',), 1),
 (('eats!',), 1),
 (('loong',), 1),
 (('lo-mean,',), 1),
 (('Discovered',), 1),
 (('Beach.',), 1),
 (('Thrilled',), 1),
 (('favs:',), 1),
 (('DELISH!)',), 1),
 (('is..',), 1),
 (('Woo',), 1),
 (('Che.',), 1),
 (('grilles,',), 1),
 (('bugogi',), 1),
 (('thicker,',), 1),
 (('Louie,',), 1),
 (('hmmm,',), 1),


Here I found two sentences in Chinese:
- (('不要在这里吃！',), 1),
- (('我们刚在这里吃午饭，在我们的汤里发现了一个蟑螂，我把它展示给服务员，她说：哦，对不起。你不必付饭费"，这意味着他们的厨房里有很多蟑螂，他们知道，她一点都不惊讶。',),  

Most of these rare words have some punctuations.

In [18]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 2)]
Counter(text_trigrams).most_common()[-20:]

[(('visit!', "Haven't"), 1),
 (('years', 'bc'), 1),
 (('something', 'drew'), 1),
 (('whites', 'mixed'), 1),
 (('with', 'bellpeppers,'), 1),
 (('bellpeppers,', 'onions'), 1),
 (('tomatoes,', 'avocado'), 1),
 (('fruit,', 'with'), 1),
 (('muffin.', 'Super'), 1),
 (('Super', 'healthy'), 1),
 (('with', 'Pam'), 1),
 (('Pam', 'instead'), 1),
 (('they', 'willingly'), 1),
 (('willingly', 'accommodated!'), 1),
 (('accommodated!', 'This'), 1),
 (('server', '(Matt!)'), 1),
 (('(Matt!)', 'was'), 1),
 (('AWESOME!', 'Come'), 1),
 (('and', 'visit--worth'), 1),
 (('visit--worth', 'it!'), 1)]

### Check bad words

#### First emoticons

In [19]:
def find_emoticons(text):
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    if emoticons == []:
        return False
    else:
        return True

In [20]:
eg1 = np.where(train_10000_eng.text.apply(find_emoticons) == True)[0][0]

In [21]:
train_10000_eng.iloc[eg1].text

"Man, I love Toronto! Hiding in a strip mall on Overlea, find a dingy looking restaurant that serves up fantastic, cheap kabob...be warned regardless of when you come here you'll likely be waiting for a table as this place is always overrun with people...If you're a fan of perfectly grilled spicy meat, you must try this place out...now that I've been here I need to limit how many times I go here per month :)\n\nTake the drive out and have some kabob...if you don't own a car, borrow one or get a zipcar membership...totally worth it!"

Here we can find one  :)  together with two '\n' 

In [22]:
re.sub('\\n','',train_10000_eng.iloc[eg1].text)

"Man, I love Toronto! Hiding in a strip mall on Overlea, find a dingy looking restaurant that serves up fantastic, cheap kabob...be warned regardless of when you come here you'll likely be waiting for a table as this place is always overrun with people...If you're a fan of perfectly grilled spicy meat, you must try this place out...now that I've been here I need to limit how many times I go here per month :)Take the drive out and have some kabob...if you don't own a car, borrow one or get a zipcar membership...totally worth it!"

#### Check typos

In [23]:
# Most packages can't handle some words like 'nooooo'. 
def check_same(word):
    intervals = {}
    for index,letter in enumerate(word):
        if letter == word[index-1]:
            if letter in intervals.keys():
                interval = intervals.pop(letter)
                lastguy = interval[len(interval)-1] 
                if lastguy[1] == (index-1):
                    lastguy = (lastguy[0],index)
                    interval[len(interval)-1] = lastguy
                    intervals[letter] = interval
                else:
                    lastguy1 = (index-1,index)
                    interval.append(lastguy1)
                    intervals[letter] = interval
            else:
                intervals[letter] = [(index-1,index)]
    return intervals

In [24]:
# I have never seen a word with three continuous same letter, so I will delete till two.
def no_more_than_2(word,dupli):
    for key in dupli.keys():
        for interval in dupli[key]:
            length = interval[1]-interval[0]+1
            regex = '(%s'%key + '{%i})'%length
            word = re.sub(regex,key+key,word)
    return word

In [25]:
from pattern.en import suggest
from itertools import combinations
def right_spelling(word,dupli):
    key_list = []
    for key in dupli.keys():
        key_list.append(key)
    n = len(key_list)
    for i in range(n):
        for comb in combinations(key_list,i):
            for letter in comb:
                regex = '(%s'%letter + '{2})'
                new_word = re.sub(regex,letter,word)
                if new_word in brown.words():
                    return new_word
    return suggest(word)[0][0]

In [26]:
def no_typo(word):
    if len(word) == len(set(word)):
        return word
    if suggest(word)[0][1] == 1:
        return suggest(word)[0][0]
    else:
        duplicates = check_same(word)
        two = no_more_than_2(word,duplicates)
        suggest_two = suggest(two)
        if suggest_two[0][1] == 1:
            return suggest_two[0][0]
        else:
            return right_spelling(two,duplicates)
        
        return right_spelling(word)

In [27]:
no_typo('finaaallly')

'finally'

#### Lemmatization

In [28]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

In [29]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [30]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
wnl = WordNetLemmatizer()
def lemmatizer(text):
    tokens = word_tokenize(text)
    lemmas = []
    tagged = pos_tag(tokens)
    for tag in tagged:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
        lemmas.append(wnl.lemmatize(tag[0], pos=wordnet_pos))
    return lemmas

#### Stop-words

I think words which mean negative are important.

In [31]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jywsi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jywsi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jywsi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [32]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.pop(stop.index('but'))
stop.pop(stop.index('not'))
preposition = ['of','with','at','from','into','during',
               'including','until','till','against','among',
               'throughout','despite','towards','upon','concerning','to','in',
               'for','on','by','about','like','through','over',
               'before','between','after','since','without','under',
               'within','along','following','across','behind',
               'beyond','plus','except','but','up','out','around','down','off','above','near']
for prep in preposition:
    if prep in stop:
        stop.pop(stop.index(prep))

#### Convert n't to not

In [33]:
def no_abbreviation(text):
    text = re.sub('n\'t',' not',text)
    return text

#### Adversatives

In [34]:
but = ['yet','however','nonetheless','whereas','nevertheless']
although = ['although','though','notwithstanding','albeit']

In [35]:
def change_but(text):
    for x in but:
        text = re.sub(x,'but',text)
    return text
def change_although(text):
    for x in although:
        text = re.sub(x,'although',text)
    return text
def change_adversatives(text):
    text = change_but(text)
    text = change_although(text)
    return text

Now I want to capture the key information near but and although.

def although_phrase(text):
    words = text.split()
    for (index,word) in enumerate(words):
        if word == 'altough.':
            for x in range(index,index-10,-1):
                if re.sub('(.*)\.([a-z])\..*','\\2',str(wordnet.synsets(words[x])[0])) in ['v','adj']

#### Phrases

In [36]:
def preprocessing(text):

    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)

    text = re.sub('\\n',' ',text)

    text = no_abbreviation(text)

    text = re.sub('[\W]+',' ', text.lower())

    text = change_adversatives(text)

    tokens = lemmatizer(text)
    text = ''
    for index, token in enumerate(tokens):

        #tokens[index] = no_typo(token)
        if token in stop:
            tokens[index] = ''
        else:
            text = text + tokens[index] + ' '
    return {'text':text,'emoticons':emoticons}

In [37]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jywsi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [38]:
from tqdm import tqdm, tqdm_pandas
tqdm.pandas()
dictionary = train_10000_eng.text.progress_apply(preprocessing)

100%|██████████| 9956/9956 [02:02<00:00, 81.24it/s] 


In [39]:
emoticons = [dictionary[i]['emoticons'] for i in train_10000_eng.index]

In [40]:
texts = [dictionary[i]['text'] for i in train_10000_eng.index]

In [41]:
from gensim.models.phrases import Phrases, Phraser



In [42]:
sentence_stream = [sent.split(' ') for sent in texts]

In [44]:
bigram = Phraser(Phrases(sentence_stream, min_count=5, threshold=5))

In [45]:
gg = bigram[sentence_stream]

In [None]:
gg[]

## TF IDF

In [46]:
def concatenate(text):
    for x in text:
        return " ".join(text)

In [47]:
text1 = []
for i in range(len(gg)):
    text1.append(concatenate(gg[i]))

In [107]:
text1[5]

'first to admit not excite_about go to la tavolta food snob group_of friend suggest go for_dinner look online at menu to nothing_special seem overpriced im also not big on order pasta go out ala outnumber thank_goodness order sea_bass special to_die for cooked perfectly_season perfectly perfect portion not say_enough good thing_about dish server ask seem proud_of dish say not chef incredible job hubby get crab tortellini also love heard mmmm good from around table waiter super_nice even give_u free_dessert of last people in restaurant service_slow place_pack but jug of wine large_group with good conversation not seem to bother anyone order calamari fry zucchini appetizer leave out mussel sea_bass special highly_recommend chicken_parm crab tortellini also good big chicken romano bit_bland house_salad teeny make_reservation but_still expect to wait for food go with large_group of people plan for to loud not go with date unless fight not feel_like hear_anything to say ask to sit in side r

In [59]:
TfidfVectorizer?

In [111]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', min_df = 1)

In [112]:
response =  tf.fit_transform(text1)

In [113]:
response

<9956x23714 sparse matrix of type '<class 'numpy.float64'>'
	with 472413 stored elements in Compressed Sparse Row format>

In [None]:
feature_names = tf.get_feature_names()

In [167]:
res_df = pd.DataFrame(response.toarray(),columns = feature_names)

In [172]:
res_df.loc[1]

00                 0.0
000                0.0
00_pm              0.0
00am               0.0
00pm               0.0
01                 0.0
01pm               0.0
02                 0.0
025                0.0
03                 0.0
03pm               0.0
04                 0.0
05                 0.0
055                0.0
05am               0.0
06                 0.0
0600               0.0
07                 0.0
0700               0.0
08                 0.0
09                 0.0
0_50               0.0
0_star             0.0
0t                 0.0
10                 0.0
100                0.0
1000               0.0
10000              0.0
100psi             0.0
100th              0.0
                  ... 
zoe                0.0
zoey               0.0
zoinks             0.0
zoltan             0.0
zombie             0.0
zomg               0.0
zomggggg           0.0
zone               0.0
zoo                0.0
zounds             0.0
zoës               0.0
zucchini           0.0
zuchinni   

In [206]:
value = sorted(res_df.iloc[1], reverse = True)[:10] 
value

[0.5023896319308714,
 0.4118900882633719,
 0.20985684249575187,
 0.16075168978832,
 0.15341055294175138,
 0.13066056721340277,
 0.10231372742103392,
 0.09643067535454679,
 0.09643067535454679,
 0.09643067535454679]

In [207]:
names = np.array(feature_names)[np.argsort(res_df.iloc[1])[-10:]][::-1]
names

array(['travis', 'blowout', 'hair', 'perfectly', 'shampoo', 'stylist',
       'way', 'scented', 'swoosh', 'admirable'], dtype='<U36')

In [197]:
dd = dict(zip(names,value))

{'travis': 0.5023896319308714,
 'blowout': 0.4118900882633719,
 'hair': 0.20985684249575187,
 'perfectly': 0.16075168978832,
 'shampoo': 0.15341055294175138,
 'stylist': 0.13066056721340277,
 'way': 0.10231372742103392,
 'scented': 0.09643067535454679,
 'swoosh': 0.09643067535454679,
 'admirable': 0.09643067535454679}

In [229]:
def gettoptfidf(df,x):
    feature_names = tf.get_feature_names()
    list1 = []
    for i in range(len(df)):
        value = sorted(df.iloc[i], reverse = True)[:x] 
        names = np.array(feature_names)[np.argsort(df.iloc[i])[-x:]][::-1]
        dd = dict(zip(names,value))
        list1.append(dd)
    return list1

In [230]:
tfidf_final = gettoptfidf(res_df,15)

In [231]:
tfidf_final

[{'pill': 0.48217652192556987,
  '8gs': 0.29372499295778365,
  '69': 0.25980364066631106,
  'crook': 0.25980364066631106,
  'total_bill': 0.2509210120736448,
  'nerve': 0.2336420097484103,
  'er': 0.2226291196217625,
  'cent': 0.21968627052071552,
  'hospital': 0.21831375365221206,
  'charge_u': 0.21699965978217223,
  'horrible_service': 0.20909732131681763,
  '19': 0.20811703118950603,
  'avoid': 0.17055894564129642,
  'online': 0.16459773154591745,
  'cost': 0.14930023408962892},
 {'travis': 0.5023896319308714,
  'blowout': 0.4118900882633719,
  'hair': 0.20985684249575187,
  'perfectly': 0.16075168978832,
  'shampoo': 0.15341055294175138,
  'stylist': 0.13066056721340277,
  'way': 0.10231372742103392,
  'scented': 0.09643067535454679,
  'swoosh': 0.09643067535454679,
  'admirable': 0.09643067535454679,
  'pagent': 0.09643067535454679,
  'beauuuutiful': 0.09643067535454679,
  'bouncey': 0.09643067535454679,
  'style': 0.09481202425400279,
  'wrist': 0.09232053645901393},
 {'dental': 