## Read file

In [1]:
import pandas as pd
import tqdm
from tqdm import tqdm, tqdm_pandas
from multiprocessing import Pool
import multiprocessing
n_cpu = multiprocessing.cpu_count()
import re
import numpy as np
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.utils import lemmatize
import pyLDAvis.gensim
import pyLDAvis
from gensim.models import CoherenceModel
%matplotlib inline

### Business

In [2]:
chunk = pd.read_json('business_train.json',orient = 'records',lines=True,chunksize = 10000)
i=0
for c in chunk:
    if i == 0:
        business = c
    else:
        business = business.append(c)
    i = i +1

In [3]:
business = business.drop(business[business.categories.isna()].index)

In [4]:
def restaurant_index(categories):
    if re.findall('restaurants',categories.lower()) != []:
        return True
    else:
        return False

In [5]:
restaurant = business.categories.apply(restaurant_index)

restaurants = business[restaurant]

restaurants = restaurants.drop(['postal_code','state'],axis = 1)

In [6]:
restaurants = restaurants.set_index(np.arange(0,restaurants.shape[0]))

In [7]:
del business

In [8]:
def kill_restaurant(text, cores = n_cpu-1):
    with Pool(processes=cores) as pool:
        result = pool.map(kill,text)
    return result
def kill(text):
    return re.sub('restaurants','',text.lower())
restaurants.categories = kill_restaurant(restaurants.categories)

In [9]:
def tokenize(text):
    token = [re.sub(' ','_',x.strip()) for x in text.split(',')]
    token = [x for x in token if x != 'food' and x != '']
    return token

In [10]:
def token(text, cores=n_cpu-1):
    with Pool(processes=cores) as pool:
        result = pool.map(tokenize,text)
    return result

In [11]:
restaurants.categories = token(restaurants.categories)

In [12]:
restaurants_dictionary = Dictionary(restaurants.categories.values)

restaurants_corpus = [restaurants_dictionary.doc2bow(text) for text in restaurants.categories.values]

In [13]:
from gensim.models import ldamodel as LDA

In [14]:
lda_model = LDA.LdaModel(restaurants_corpus,
                         id2word=restaurants_dictionary,
                         num_topics = 5,
                         alpha='auto',
                         per_word_topics=True,
                         random_state = 123)

In [15]:
lda_model.print_topics()

[(0,
  '0.176*"fast_food" + 0.156*"burgers" + 0.116*"sandwiches" + 0.114*"american_(traditional)" + 0.050*"chicken_wings" + 0.043*"breakfast_&_brunch" + 0.042*"indian" + 0.036*"diners" + 0.036*"delis" + 0.023*"american_(new)"'),
 (1,
  '0.125*"japanese" + 0.102*"sushi_bars" + 0.089*"event_planning_&_services" + 0.078*"mediterranean" + 0.054*"middle_eastern" + 0.048*"caterers" + 0.046*"greek" + 0.028*"canadian_(new)" + 0.028*"caribbean" + 0.027*"venues_&_event_spaces"'),
 (2,
  '0.155*"pizza" + 0.108*"italian" + 0.075*"cafes" + 0.074*"sandwiches" + 0.072*"coffee_&_tea" + 0.070*"breakfast_&_brunch" + 0.057*"salad" + 0.044*"bakeries" + 0.038*"desserts" + 0.027*"barbeque"'),
 (3,
  '0.166*"nightlife" + 0.160*"bars" + 0.068*"american_(new)" + 0.060*"american_(traditional)" + 0.052*"seafood" + 0.036*"sports_bars" + 0.029*"pubs" + 0.024*"canadian_(new)" + 0.022*"wine_&_spirits" + 0.022*"beer"'),
 (4,
  '0.187*"mexican" + 0.182*"chinese" + 0.069*"asian_fusion" + 0.068*"thai" + 0.057*"specialty

In [16]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, restaurants_corpus, restaurants_dictionary)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [17]:
fastfood = [restaurants_dictionary[x[0]] for x in lda_model.get_topic_terms(topicid=0)]

In [18]:
japan = [restaurants_dictionary[x[0]] for x in lda_model.get_topic_terms(topicid=1)]

In [19]:
pizza = [restaurants_dictionary[x[0]] for x in lda_model.get_topic_terms(topicid=2)]
fastfood.extend(pizza)

In [20]:
bars = [restaurants_dictionary[x[0]] for x in lda_model.get_topic_terms(topicid=3)]

In [21]:
asian = [restaurants_dictionary[x[0]] for x in lda_model.get_topic_terms(topicid=4)]

In [24]:
def classify(text):
    x = [0,0,0,0]
    if len(set(text).intersection(fastfood)) != 0:
        x[0] = 1
    if len(set(text).intersection(bars)) != 0:
        x[1] = 1
    if len(set(text).intersection(japan)) != 0:
        x[2] = 1
    if len(set(text).intersection(asian)) != 0:
        x[3] = 1
    return x
def classifier(text,cores=n_cpu-1):
    with Pool(processes=cores) as pool:
        result = pool.map(classify,text)
    return result


In [25]:
types = np.array(classifier(restaurants.categories.values))

In [27]:
rest_types = pd.DataFrame(types,columns=['fastfood','bars','japan','asian'])

In [29]:
restaurants = restaurants.join(rest_types)

In [32]:
restaurants.to_csv('yaoshen.csv')

In [30]:
ids = restaurants.business_id.values

In [54]:
fastfood = restaurants[restaurants.fastfood == 1].business_id
bars = restaurants[restaurants.bars == 1].business_id
japan = restaurants[restaurants.japan == 1].business_id
asian = restaurants[restaurants.asian == 1].business_id

### Reviews

In [55]:
chunk = pd.read_json('review_train.json',orient = 'records',lines=True,chunksize = 10000)
i=0
for c in chunk:
    c = c.drop('date',axis=1)
    if i == 0:
        train = c
    else:
        train = train.append(c)
    i = i +1

In [56]:
train_restaurants = train[train.business_id.isin(restaurants.business_id)]
del train

In [57]:
train_restaurants[:500000].to_csv('1.csv')
train_restaurants[500000:1000000].to_csv('2.csv')
train_restaurants[1000000:1500000].to_csv('3.csv')
train_restaurants[1500000:2000000].to_csv('4.csv')
train_restaurants[2000000:2500000].to_csv('5.csv')
train_restaurants[2500000:3000000].to_csv('6.csv')
train_restaurants[3000000:].to_csv('7.csv')

In [58]:
train_restaurants.to_csv('restaurants')

In [2]:
train_restaurants = pd.read_csv('restaurants')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
train_restaurants.shape

(3377546, 4)

In [None]:
train_restaurants.stars.hist()

In [None]:
train_fastfood = train_restaurants[train_restaurants.business_id.isin(fastfood)]
train_japan = train_restaurants[train_restaurants.business_id.isin(japan)]
train_bars = train_restaurants[train_restaurants.business_id.isin(bars)]
train_asian = train_restaurants[train_restaurants.business_id.isin(asian)]

### Preprocessing

In [4]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.pop(stop.index('but'))
stop.pop(stop.index('not'))

'not'

#### convert not and but into bigrams and trigrams

In [5]:
def has_not(text):
    text = text.split(' ')
    for x in text:
        if x == 'not':
            return True
    return False

In [6]:
def convert_not_sentence(text):
    text = text + ' '
    nots = re.findall('not [a-z ]+',text)
    for n in nots:
        n = n.rstrip(',|.')
        words = n.split(' ')
        not_word = ''
        for word in words[1:]:
            if word != '':
                not_word = not_word + 'not_' + word.strip(' ') + ' '
        text = text + not_word
    return text.rstrip()

In [7]:
def has_but(text):
    text = text.split(' ')
    for x in text:
        if x == 'but':
            return True
    return False

In [8]:
def convert_but_sentence(text):
    text = re.sub('[^a-z0-9 _]','',text)
    but = text.split(' ')
    text = text + ' '
    index = but.index('but')
    left_words = []
    right_words = []
    for i,word in enumerate(but):
        if i < index:
            if word not in preposition and word != '':
                left_words.append(word)
        elif i == index:
            continue
        else:
            if word not in preposition and word != '':
                right_words.append(word)
    if len(left_words) != 0:
        for x in left_words:
            for y in right_words:
                text = text + x + '_but_' + y + ' '
    return text.rstrip()

In [9]:
def has_although(text):
    text = text.split(' ')
    for x in text:
        if x == 'though':
            return True
    return False

def convert_although_sentence(text):
    text = re.sub('[^a-z0-9 _]','',text)
    but = text.split(' ')
    text = text + ' '
    index = but.index('though')
    left_words = []
    right_words = []
    for i,word in enumerate(but):
        if i < index:
            if word not in preposition and word != '':
                left_words.append(word)
        elif i == index:
            continue
        else:
            if word not in preposition and word != '':
                right_words.append(word)
    if len(left_words) != 0:
        for x in left_words:
            for y in right_words:
                text = text + x + '_though_' + y + ' '
    return text.rstrip()

In [10]:
preposition = ['of','with','at','from','into','during',
               'including','until','till','against','among',
               'throughout','despite','towards','upon','concerning','to','in',
               'for','on','by','about','like','through','over',
               'before','between','after','since','without','under',
               'within','along','following','across','behind',
               'beyond','plus','except','but','up','out','around','down','off','above','near']
for prep in preposition:
    if prep in stop:
        stop.pop(stop.index(prep))

In [11]:
but = ['yet','however','nonetheless','whereas','nevertheless']
although = ['although','notwithstanding','albeit']
def change_but(text):
    for x in but:
        text = re.sub(x,'but',text)
    return text
def change_though(text):
    for x in although:
        text = re.sub(x,'though',text)
    return text
def change_adversatives(text):
    text = change_but(text)
    text = change_though(text)
    return text

In [12]:
def no_abbreviation(text):
    text = re.sub('can\'t','can not',text)
    text = re.sub('cannot','can not',text)
    text = re.sub('won\'t','will not',text)
    text = re.sub('([a-z]*)n\'t','\\1 not',text)
    return text

In [13]:
def utf8(text):
    x = str(text.encode('utf8'))
    x = re.sub('\\\\[a-zA-Z0-9]*','',x[2:(len(x)-1)])
    x = re.sub('\\n','',x)
        
    return x

#### Misspelling

In [14]:
def have_mis(text):
    return re.findall('([a-z])\\1\\1+',text) != []

In [15]:
def is_mis(word):
    return re.findall('([a-z])\\1\\1+',word) != []

In [16]:
from pattern.en import suggest

In [17]:
def correct_mis(word):
    tmpword = re.sub('([a-z])\\1\\1+','\\1\\1',word)
    word = suggest(tmpword)[0][0]
    return word

#### Split sentence

In [18]:
def split_sentences(text):
    pat = re.compile(r'([a-z][^\.!?]*[\.!?])', re.M)
    return pat.findall(text)


#### Lemmatization

In [19]:
def lemmatizer(text):
    tokens = lemmatize(text,stopwords=stop,allowed_tags=re.compile('.*'))
    t = ''
    for x in tokens:
        x = re.findall('\'([a-z]+)',str(x))[0]
        if x in stop:
            continue
        else:
            t = t + x + ' '
    t = t.rstrip()
    return t

#### Preprocessing

In [20]:
def preprocessing(text):
    text = utf8(str(text).lower())
    emoji = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    text = re.sub('(?::|;|=)(?:-)?(?:\)|\(|D|P)','',text)
    text = re.sub('[0-9]','',text)
    text = no_abbreviation(text)
    text = change_adversatives(text)
    text = text + '.'
    sentences = split_sentences(text)
    new_text = ''
    for sentence in sentences:
        sentence = re.sub('[^a-z ]*','',sentence)
        sen=''
        for word in sentence.split(' '):
            if word not in stop:
                sen = sen + word + ' '
        if has_not(sen):
            sen = convert_not_sentence(sen)
        if has_but(sen):
            sen = convert_but_sentence(sen)
        if has_although(sen):
            sen = convert_although_sentence(sen)
        new_text = new_text + sen + ' '
    new_text = re.sub('[^a-z0-9 _]*','',new_text)
    for emo in emoji:
        new_text = new_text + ' ' + emo
    new_text = re.sub('  ',' ',new_text)
    return new_text.rstrip()

  emoji = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
  text = re.sub('(?::|;|=)(?:-)?(?:\)|\(|D|P)','',text)


In [21]:
train_restaurants = train_restaurants.dropna()

In [24]:
X_new = train_restaurants.text.apply(preprocessing)
train_restaurants.text = X_new

In [None]:
train_restaurants.text.progress_apply(lambda x: x.lstrip(' '))

In [None]:
train_restaurants.to_csv('restaurants_preprocessed.csv')

#### Bigrams for phrase

In [None]:
train_restaurants = pd.read_csv('restaurants_preprocessed.csv')

In [None]:
from gensim.models.phrases import Phrases, Phraser

sentence_stream = [str(x).split(' ') for x in train_restaurants.text.values]

bigram = Phraser(Phrases(sentence_stream, min_count=10, threshold=10))

biX_new = bigram[sentence_stream]

bigramx = [x for x in biX_new]

In [None]:
i = 0
for x in trigramx[:2]:
    z = ''
    for y in x:
        z = z + ' ' + y
    train_restaurants.text.iloc[i] = z.rstrip()
    i = i + 1

In [None]:
train_restaurants.text = bigramx

In [None]:
train_restaurants.to_csv('data_preprocessed.csv',index = False,index_label=False)

#### Features

Most of the works here were done on server, for simplicity I just post part of them.

In [None]:
import re
import pandas as pd
features = []
for i in range(1,9):
    with open('%d.txt'%i) as f:
        for x in range(5000):
            features.append(re.sub('\n','',f.readline()))

with open('id.txt') as f:
    ids = f.readlines()

featrues = features[:37775]

X = pd.read_csv('bifinal.csv')

X = X[['business_id','stars','reviews']]

X.reviews = featrues

X.reviews = X.reviews.apply(lambda x: x.split(',')[:5])

chunk = pd.read_json('../../business_train.json',orient = 'records',lines=True,chunksize = 10000)
i=0
for c in chunk:
    if i == 0:
        business = c
    else:
        business = business.append(c)
    i = i +1

X = X.set_index('business_id').join(business[['business_id','city','latitude','longitude','name']].set_index('business_id'))
X.to_csv('finaldata.csv')

In [None]:
data1 = pd.read_csv('./01.csv',)
data1 = data1.loc[:,data1.columns[2:]]

i = 2
while i <= 5:
    data = pd.read_csv('./0%d.csv'%i)
    data = data.loc[:,data.columns[2:]]
    data1 = pd.concat([data1,data])
    i = i +1

del data
ids = data1.business_id.unique()
np.savetxt('id.txt',ids,delimiter='\n')

stars = []
reviews = []
for i in tqdm(range(len(ids))):
    j = ids[i]
    tmp = data1[data1.business_id == j]
    star = [x for x in tmp.stars.values]
    review = ''
    for y in tmp.text.values:
        review = review + str(y) + ' '
    stars.append(star)
    reviews.append(review)

final = pd.DataFrame({'business_id':ids,'stars':stars,'reviews':reviews})
del data1

del ids,stars,reviews

final.to_csv('final.csv')
final = pd.read_csv('final.csv')

from gensim.models.phrases import Phrases, Phraser

phrases = Phrases(final.reviews.values, min_count=10, threshold=10)
bigram = Phraser(phrases)

bireviews = bigram[final.reviews.values]
final.reviews = bireviews

final.to_csv('bifinal.csv')

In [None]:
import re
features = []
for i in range(1,9):
    with open('%d.txt'%i) as f:
        for x in range(5000):
            features.append(re.sub('\n','',f.readline()))

with open('id.txt') as f:
    ids = f.readlines()

featrues = features[:37775]

X = pd.read_csv('bifinal.csv')

X = X[['business_id','stars','reviews']]

X.reviews = featrues

X.reviews = X.reviews.apply(lambda x: x.split(',')[:5])

chunk = pd.read_json('../../business_train.json',orient = 'records',lines=True,chunksize = 10000)
i=0
for c in chunk:
    if i == 0:
        business = c
    else:
        business = business.append(c)
    i = i +1

X = X.set_index('business_id').join(business[['business_id','city','latitude','longitude','name']].set_index('business_id'))
X.to_csv('finaldata.csv')