## Read first 10000 rows from json file

In [1]:
import json
import pandas as pd
def read_json_nrows(nrows,filename):
    n= 0
    with open(filename) as f:
        while n < nrows:
            if n == 0:
                line = f.readline()
                line = json.loads(line.rstrip())
                train = pd.DataFrame(line,index = [0])
            else:
                temp = pd.DataFrame(json.loads(f.readline().rstrip()),index = [n])
                train = train.append(temp)
            n = n+1
    return train
    

In [2]:
train_10000 = read_json_nrows(10000,'review_train.json')

In [3]:
train_10000.head(5)

Unnamed: 0,business_id,stars,text,date
0,31292,1.0,Total bill for this horrible service? Over $8G...,2013-05-07 04:34:36
1,35344,5.0,I *adore* Travis at the Hard Rock's new Kelly ...,2017-01-14 21:30:33
2,152538,5.0,I have to say that this office really has it t...,2016-11-09 20:09:03
3,71871,5.0,Went in for a lunch. Steak sandwich was delici...,2018-01-09 20:56:38
4,64913,1.0,Today was my second out of three sessions I ha...,2018-01-30 23:07:38


## Cleaning data

In [4]:
from collections import Counter
from nltk.util import ngrams
import re
import numpy as np 
from nltk.corpus import words
from nltk.corpus import brown
from nltk.corpus import wordnet
from nltk import wordpunct_tokenize

### Languages

When I first tried to apply the *detect* function I got a 'No features in text' error. So I have to find out which review is empty.

In [5]:
train_10000.loc[6687]

business_id                 137207
stars                            1
text                            :(
date           2017-07-18 20:31:03
Name: 6687, dtype: object

This review is not language, but an emoticon.

In [6]:
def not_language(text):
    # First delete all common emoticons.
    text = re.sub('(?::|;|=)(?:-)?(?:\)|\(|D|P)','',text)
    if re.sub('[\W]+','',text) == '':
        return True
    else:
        return False

For simplicity, just consider emoticons as English.

In [7]:
not_lang = train_10000[train_10000.text.apply(not_language)].index.values

In [8]:
train_10000.loc[not_lang,'lang_type'] = 'english'

from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
languages_ratios = {}
for i in range(10000):
    tokens = wordpunct_tokenize(train_10000.text[i])
    words = [word.lower() for word in tokens]
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    train_10000.loc[i,'lang_type'] = most_rated_language

In [9]:
from langdetect import detect
for i in range(10000):
    if i in not_lang:
        continue
    else:
        train_10000.loc[i,'lang_type'] = detect(train_10000.text[i])

In [10]:
re.findall(':[\W]{0,1}',':(nmsl')

[':(']

In [11]:
train_10000.lang_type.value_counts()

en         9956
fr           37
es            3
english       1
de            1
ja            1
it            1
Name: lang_type, dtype: int64

Focus on English only at present.

In [12]:
train_10000_eng = train_10000[train_10000.lang_type == 'en']

Most common words and phrases.

In [13]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 1)]
Counter(text_trigrams).most_common(20)

[(('the',), 43484),
 (('and',), 36445),
 (('I',), 27219),
 (('a',), 26196),
 (('to',), 25807),
 (('was',), 18766),
 (('of',), 15249),
 (('is',), 12814),
 (('for',), 12221),
 (('in',), 11026),
 (('it',), 9188),
 (('The',), 9180),
 (('with',), 8640),
 (('my',), 8424),
 (('that',), 8146),
 (('but',), 7072),
 (('on',), 7003),
 (('have',), 6583),
 (('you',), 6540),
 (('this',), 6388)]

In [14]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 2)]
Counter(text_trigrams).most_common(20)

[(('of', 'the'), 3443),
 (('and', 'the'), 2772),
 (('in', 'the'), 2722),
 (('it', 'was'), 2336),
 (('I', 'was'), 2307),
 (('on', 'the'), 2107),
 (('and', 'I'), 2017),
 (('to', 'the'), 1836),
 (('for', 'the'), 1732),
 (('for', 'a'), 1732),
 (('I', 'had'), 1559),
 (('I', 'have'), 1490),
 (('is', 'a'), 1378),
 (('to', 'be'), 1358),
 (('was', 'a'), 1313),
 (('this', 'place'), 1278),
 (('with', 'the'), 1262),
 (('at', 'the'), 1245),
 (('with', 'a'), 1216),
 (('to', 'get'), 1206)]

In [15]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 3)]
Counter(text_trigrams).most_common(20)

[(('and', 'it', 'was'), 516),
 (('one', 'of', 'the'), 451),
 (('a', 'lot', 'of'), 392),
 (('I', 'had', 'the'), 338),
 (('This', 'place', 'is'), 316),
 (('I', 'have', 'been'), 265),
 (('I', 'had', 'to'), 262),
 (('the', 'food', 'was'), 254),
 (('I', 'ordered', 'the'), 247),
 (('it', 'was', 'a'), 244),
 (('of', 'the', 'best'), 239),
 (('The', 'food', 'was'), 237),
 (('I', 'had', 'a'), 236),
 (('some', 'of', 'the'), 206),
 (('The', 'food', 'is'), 199),
 (('this', 'place', 'is'), 198),
 (('The', 'service', 'was'), 187),
 (('the', 'food', 'is'), 187),
 (('to', 'get', 'a'), 182),
 (('This', 'is', 'a'), 181)]

In [16]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 4)]
Counter(text_trigrams).most_common(20)

[(('one', 'of', 'the', 'best'), 126),
 (('My', 'husband', 'and', 'I'), 88),
 (('I', 'will', 'definitely', 'be'), 77),
 (('is', 'one', 'of', 'the'), 73),
 (('the', 'end', 'of', 'the'), 67),
 (('in', 'the', 'middle', 'of'), 65),
 (('for', 'the', 'first', 'time'), 65),
 (('a', 'great', 'place', 'to'), 62),
 (('some', 'of', 'the', 'best'), 60),
 (('and', 'the', 'service', 'was'), 56),
 (('you', 'are', 'looking', 'for'), 55),
 (('was', 'one', 'of', 'the'), 54),
 (('and', 'the', 'food', 'was'), 53),
 (('I', 'have', 'to', 'say'), 52),
 (('My', 'wife', 'and', 'I'), 50),
 (('and', 'the', 'food', 'is'), 50),
 (('was', 'my', 'first', 'time'), 49),
 (('one', 'of', 'my', 'favorite'), 49),
 (('the', 'rest', 'of', 'the'), 48),
 (('I', "can't", 'wait', 'to'), 47)]

Oddest words

In [17]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 1)]
Counter(text_trigrams).most_common()[-500:]

[(('qualified,',), 1),
 (('exuded',), 1),
 (("'cool'",), 1),
 (('Danny.',), 1),
 (('accord',), 1),
 (('b-day.',), 1),
 (('Terroni.',), 1),
 (('raptor',), 1),
 (('flats',), 1),
 (('gripe...no',), 1),
 (('GPS?',), 1),
 (('yardage',), 1),
 (('caliber,',), 1),
 (('troon',), 1),
 (('Quintero',), 1),
 (('parking....my',), 1),
 (('pefect',), 1),
 (('Dewy',), 1),
 (('appoinment,',), 1),
 (('in...)',), 1),
 (('HVAC/Electrical/etc...',), 1),
 (('electrical....(I',), 1),
 (('NHW',), 1),
 (('compressor',), 1),
 (('verified',), 1),
 (('TUESDAY',), 1),
 (('compressor....',), 1),
 (('occasions....',), 1),
 (('SH!*',), 1),
 (('Moe',), 1),
 (('matted',), 1),
 (('Choosing',), 1),
 (('changing,',), 1),
 (('eats!',), 1),
 (('loong',), 1),
 (('lo-mean,',), 1),
 (('Discovered',), 1),
 (('Beach.',), 1),
 (('Thrilled',), 1),
 (('favs:',), 1),
 (('DELISH!)',), 1),
 (('is..',), 1),
 (('Woo',), 1),
 (('Che.',), 1),
 (('grilles,',), 1),
 (('bugogi',), 1),
 (('thicker,',), 1),
 (('Louie,',), 1),
 (('hmmm,',), 1),


Here I found two sentences in Chinese:
- (('不要在这里吃！',), 1),
- (('我们刚在这里吃午饭，在我们的汤里发现了一个蟑螂，我把它展示给服务员，她说：哦，对不起。你不必付饭费"，这意味着他们的厨房里有很多蟑螂，他们知道，她一点都不惊讶。',),  

Most of these rare words have some punctuations.

In [18]:
text = ' '.join(train_10000_eng.text.values)
text_trigrams = [i for i in ngrams(text.split(), 2)]
Counter(text_trigrams).most_common()[-20:]

[(('visit!', "Haven't"), 1),
 (('years', 'bc'), 1),
 (('something', 'drew'), 1),
 (('whites', 'mixed'), 1),
 (('with', 'bellpeppers,'), 1),
 (('bellpeppers,', 'onions'), 1),
 (('tomatoes,', 'avocado'), 1),
 (('fruit,', 'with'), 1),
 (('muffin.', 'Super'), 1),
 (('Super', 'healthy'), 1),
 (('with', 'Pam'), 1),
 (('Pam', 'instead'), 1),
 (('they', 'willingly'), 1),
 (('willingly', 'accommodated!'), 1),
 (('accommodated!', 'This'), 1),
 (('server', '(Matt!)'), 1),
 (('(Matt!)', 'was'), 1),
 (('AWESOME!', 'Come'), 1),
 (('and', 'visit--worth'), 1),
 (('visit--worth', 'it!'), 1)]

### Check bad words

#### First emoticons

In [19]:
def find_emoticons(text):
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    if emoticons == []:
        return False
    else:
        return True

In [20]:
eg1 = np.where(train_10000_eng.text.apply(find_emoticons) == True)[0][0]

In [21]:
train_10000_eng.iloc[eg1].text

"Man, I love Toronto! Hiding in a strip mall on Overlea, find a dingy looking restaurant that serves up fantastic, cheap kabob...be warned regardless of when you come here you'll likely be waiting for a table as this place is always overrun with people...If you're a fan of perfectly grilled spicy meat, you must try this place out...now that I've been here I need to limit how many times I go here per month :)\n\nTake the drive out and have some kabob...if you don't own a car, borrow one or get a zipcar membership...totally worth it!"

Here we can find one  :)  together with two '\n' 

In [22]:
re.sub('\\n','',train_10000_eng.iloc[eg1].text)

"Man, I love Toronto! Hiding in a strip mall on Overlea, find a dingy looking restaurant that serves up fantastic, cheap kabob...be warned regardless of when you come here you'll likely be waiting for a table as this place is always overrun with people...If you're a fan of perfectly grilled spicy meat, you must try this place out...now that I've been here I need to limit how many times I go here per month :)Take the drive out and have some kabob...if you don't own a car, borrow one or get a zipcar membership...totally worth it!"

#### Check typos

In [23]:
# Most packages can't handle some words like 'nooooo'. 
def check_same(word):
    intervals = {}
    for index,letter in enumerate(word):
        if letter == word[index-1]:
            if letter in intervals.keys():
                interval = intervals.pop(letter)
                lastguy = interval[len(interval)-1] 
                if lastguy[1] == (index-1):
                    lastguy = (lastguy[0],index)
                    interval[len(interval)-1] = lastguy
                    intervals[letter] = interval
                else:
                    lastguy1 = (index-1,index)
                    interval.append(lastguy1)
                    intervals[letter] = interval
            else:
                intervals[letter] = [(index-1,index)]
    return intervals

In [24]:
# I have never seen a word with three continuous same letter, so I will delete till two.
def no_more_than_2(word,dupli):
    for key in dupli.keys():
        for interval in dupli[key]:
            length = interval[1]-interval[0]+1
            regex = '(%s'%key + '{%i})'%length
            word = re.sub(regex,key+key,word)
    return word

In [25]:
from pattern.en import suggest
from itertools import combinations
def right_spelling(word,dupli):
    key_list = []
    for key in dupli.keys():
        key_list.append(key)
    n = len(key_list)
    for i in range(n):
        for comb in combinations(key_list,i):
            for letter in comb:
                regex = '(%s'%letter + '{2})'
                new_word = re.sub(regex,letter,word)
                if new_word in brown.words():
                    return new_word
    return suggest(word)[0][0]

In [26]:
def no_typo(word):
    if len(word) == len(set(word)):
        return word
    if suggest(word)[0][1] == 1:
        return suggest(word)[0][0]
    else:
        duplicates = check_same(word)
        two = no_more_than_2(word,duplicates)
        suggest_two = suggest(two)
        if suggest_two[0][1] == 1:
            return suggest_two[0][0]
        else:
            return right_spelling(two,duplicates)
        
        return right_spelling(word)

In [27]:
no_typo('finaaallly')

'finally'

#### Lemmatization

In [28]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

In [29]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [30]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
wnl = WordNetLemmatizer()
def lemmatizer(text):
    tokens = word_tokenize(text)
    lemmas = []
    tagged = pos_tag(tokens)
    for tag in tagged:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
        lemmas.append(wnl.lemmatize(tag[0], pos=wordnet_pos))
    return lemmas

#### Stop-words

I think words which mean negative are important.

In [31]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joeyqiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/joeyqiang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joeyqiang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [32]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.pop(stop.index('but'))
stop.pop(stop.index('not'))
preposition = ['of','with','at','from','into','during',
               'including','until','till','against','among',
               'throughout','despite','towards','upon','concerning','to','in',
               'for','on','by','about','like','through','over',
               'before','between','after','since','without','under',
               'within','along','following','across','behind',
               'beyond','plus','except','but','up','out','around','down','off','above','near']
for prep in preposition:
    if prep in stop:
        stop.pop(stop.index(prep))

#### Convert n't to not

In [33]:
def no_abbreviation(text):
    text = re.sub('n\'t',' not',text)
    return text

#### Adversatives

In [34]:
but = ['yet','however','nonetheless','whereas','nevertheless']
although = ['although','though','notwithstanding','albeit']

In [35]:
def change_but(text):
    for x in but:
        text = re.sub(x,'but',text)
    return text
def change_although(text):
    for x in although:
        text = re.sub(x,'although',text)
    return text
def change_adversatives(text):
    text = change_but(text)
    text = change_although(text)
    return text

Now I want to capture the key information near but and although.

def although_phrase(text):
    words = text.split()
    for (index,word) in enumerate(words):
        if word == 'altough.':
            for x in range(index,index-10,-1):
                if re.sub('(.*)\.([a-z])\..*','\\2',str(wordnet.synsets(words[x])[0])) in ['v','adj']

In [36]:
def preprocessing(text):
    # 取表情
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    # 去回车
    text = re.sub('\\n',' ',text)
    # not
    text = no_abbreviation(text)
    # 只保留字母
    text = re.sub('[\W]+',' ', text.lower())
    # 统一转折词
    text = change_adversatives(text)
    # 词性还原
    tokens = lemmatizer(text)
    text = ''
    for index, token in enumerate(tokens):
        # 去拼写错误
        #tokens[index] = no_typo(token)
        if token in stop:
            tokens[index] = ''
        else:
            text = text + tokens[index] + ' '
    return {'text':text,'emoticons':emoticons}

In [37]:
from tqdm import tqdm, tqdm_pandas
tqdm.pandas()
dictionary = train_10000_eng.text.progress_apply(preprocessing)

100%|██████████| 9956/9956 [00:52<00:00, 188.48it/s]


In [38]:
emoticons = [dictionary[i]['emoticons'] for i in train_10000_eng.index]

In [39]:
texts = [dictionary[i]['text'] for i in train_10000_eng.index]

In [40]:
texts

['total bill for horrible service over 8gs crook actually nerve to charge u 69 for 3 pill check online pill for 19 cent avoid hospital er at cost ',
 'adore travis at hard rock new kelly cardenas salon always fan of great blowout stranger to chain offer service but travis take flawless blowout to whole new level travis greets with perfectly green swoosh in otherwise perfectly style black hair vega worthy rockstar outfit next come relaxing incredible shampoo get full head message could cure even bad migraine in minute scented shampoo room travis freakishly strong finger in good way use perfect amount of pressure superb start glorious blowout not one not two but three people involve in best round brush action hair ever see team of stylist clearly get along extremely well evident from way talk to help one another really genuine not corporate requirement much fun to next travis start with flat iron way flip wrist to get volume around without over make look like texas pagent girl admirable 

In [67]:
from autocorrect import spell

new_texts = ['']
for i in tqdm(range(len(texts))):
    new_texts.append([spell(j) for j in texts[i].split(' ')])

new_texts = new_texts[1:]

100%|██████████| 9956/9956 [09:23<00:00,  7.58it/s]


In [76]:
result = ['']
for i in range(len(new_texts)):
    result.append(' '.join(new_texts[i]))
    
new_texts = result[1:]

#### Bigrams for phrase

In [77]:
from gensim.models.phrases import Phrases, Phraser

In [78]:
sentence_stream = [sent.split(' ') for sent in new_texts]

In [79]:
bigram = Phraser(Phrases(sentence_stream, min_count=5, threshold=5)) #mincount越小识别出来的越少，threshold higher means fewer phrases

In [107]:
test_num = 100

In [108]:
new_texts[test_num]

'excited for happy hour heard great thing food less average charge of for bread apps greasy fry still spend 250 on happy hour menu for group of a first review on yelp cared enough to take time to write sorry wo not come back a'

In [109]:
print(sentence_stream[test_num])

['excited', 'for', 'happy', 'hour', 'heard', 'great', 'thing', 'food', 'less', 'average', 'charge', 'of', 'for', 'bread', 'apps', 'greasy', 'fry', 'still', 'spend', '250', 'on', 'happy', 'hour', 'menu', 'for', 'group', 'of', 'a', 'first', 'review', 'on', 'yelp', 'cared', 'enough', 'to', 'take', 'time', 'to', 'write', 'sorry', 'wo', 'not', 'come', 'back', 'a']


In [110]:
print(bigram[sentence_stream[test_num]])

['excited', 'for', 'happy_hour', 'heard', 'great', 'thing', 'food', 'less', 'average', 'charge', 'of', 'for', 'bread', 'apps', 'greasy', 'fry', 'still', 'spend', '250', 'on', 'happy_hour', 'menu', 'for', 'group_of', 'a', 'first', 'review', 'on_yelp', 'cared', 'enough', 'to', 'take', 'time', 'to', 'write', 'sorry', 'wo_not', 'come_back', 'a']


In [84]:
len(sentence_stream)

9956

In [85]:
sentence_with_phrase = bigram[sentence_stream]

In [193]:
result = ['']
for i in range(len(new_texts)):
    result.append(' '.join(bigram[sentence_stream[i]]))
    
new_texts = result[1:]

In [194]:
new_texts

['total_bill for horrible_service over gs crook actually nerve to charge_u of for a pill check online pill for of_cent avoid hospital er at cost a',
 'adore travis at hard_rock new kelly Cardenas salon always fan_of great blowout stranger to chain offer service but travis take flawless blowout to whole new level travis greets with perfectly green swoosh in otherwise perfectly style black hair vega worthy rockstar outfit next come relaxing incredible shampoo get full head message could cure even bad migraine in minute scented shampoo room travis freakishly strong finger in good way use perfect_amount of pressure superb start glorious blowout not one not two but three people involve in best round brush action hair ever_see team of stylist clearly get along extremely well evident from way talk_to help one another really genuine not corporate requirement much_fun to next travis start_with flat iron way flip wrist to get volume around without over make look_like texas agent girl admirable a

#### Most Frequent Words

In [195]:
type([sentence_with_phrase])

list

In [196]:
from collections import defaultdict
word_freq = defaultdict(int)
for sent in sentence_with_phrase:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

21756

In [197]:
sorted(word_freq, key=word_freq.get, reverse=True)[:100]

['to',
 'a',
 'of',
 'not',
 'for',
 'but',
 'in',
 'with',
 'get',
 'on',
 'place',
 'good',
 'food',
 'go',
 'great',
 'at',
 'one',
 'order',
 'would',
 'service',
 'like',
 'come',
 'time',
 'from',
 'try',
 'make',
 'out',
 'say',
 'also',
 'take',
 'well',
 'love',
 'about',
 'really',
 'after',
 'u',
 'best',
 'restaurant',
 'nice',
 'by',
 'give',
 'wait',
 'always',
 'know',
 'back',
 'up',
 'eat',
 'even',
 'work',
 'need',
 'find',
 'drink',
 'although',
 'tell',
 'people',
 'price',
 'see',
 'delicious',
 'never',
 'could',
 'little',
 'much',
 'day',
 'way',
 'thing',
 'over',
 'staff',
 'table',
 'experience',
 'use',
 'want_to',
 'call',
 'everything',
 'leave',
 'menu',
 'think',
 'bad',
 'look',
 'first',
 'location',
 'ask',
 'before',
 'taste',
 'two',
 'chicken',
 'definitely',
 'friend',
 'ever',
 'pizza',
 'right',
 'dish',
 'ca_not',
 'another',
 'still',
 'meal',
 'server',
 'room',
 'small',
 'friendly',
 'new']

### word2vec

In [198]:
import multiprocessing
cores = multiprocessing.cpu_count()
cores


4

In [199]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(min_count=1,
                     window=2,
                     size=200,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1,
                     seed = 123,
                     sg=0) # sg默认为0，对应CBOW算法；sg=1则采用skip-gram算法

In [200]:
import time

t = time.clock()

w2v_model.build_vocab(sentence_with_phrase, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time.clock() - t) / 60, 2)))

Time to build vocab: 0.03 mins


In [201]:
t = time.clock()

w2v_model.train(sentence_with_phrase, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time.clock() - t) / 60, 2)))

Time to train the model: 1.58 mins


In [202]:
w2v_model.wv.most_similar(positive=["awesome"])

[('great_atmosphere', 0.8750365972518921),
 ('fabulous', 0.8691188097000122),
 ('amaze', 0.8671000003814697),
 ('great', 0.8670890927314758),
 ('service_outstanding', 0.8658528327941895),
 ('attentive_friendly', 0.8656123876571655),
 ('terrific', 0.8578321933746338),
 ('fantastic', 0.8488726615905762),
 ('amazing', 0.8450025320053101),
 ('friendly_attentive', 0.8408991098403931)]

In [206]:
w2v_model.wv.most_similar(positive=["huge"])

[('large', 0.8283247351646423),
 ('tiny', 0.8128261566162109),
 ('gigantic', 0.8055514097213745),
 ('small', 0.7974947690963745),
 ('massive', 0.7930153608322144),
 ('ample', 0.7871751189231873),
 ('sized', 0.7848326563835144),
 ('teeny', 0.7839747667312622),
 ('decent_size', 0.7812745571136475),
 ('big', 0.7703492641448975)]

#### Sensitive Analysis

In [204]:
design_matrix = np.zeros([len(design_matrix),200])

In [205]:
for i in range(design_matrix.shape[0]):
    temp_sent = new_texts[i].split(' ')
    for j in range(len(temp_sent)):
        design_matrix[i] += w2v_model[temp_sent[j]]
    design_matrix[i] = design_matrix[i]/len(temp_sent)

  after removing the cwd from sys.path.


In [229]:
from sklearn.ensemble import RandomForestClassifier
y = train_10000_eng["stars"].values[:9000]
x = design_matrix[:9000,]
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [234]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
y_true = train_10000_eng["stars"].values[-1000:]
y_pred = clf.predict(design_matrix[-1000:,])
accuracy_score(y_true, y_pred)
np.sqrt(mean_squared_error(y_true, y_pred))

1.5316657598836634