# Importing Neccesary Libraries

In [4]:
import nltk, re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
import gensim
import pandas as pd
import numpy as np

## Reading Data

In [5]:
fobj=open('Restrain2014.txt')
fobj.readlines()

['But the staff was so horrible to us#service\n',
 'To be completely fair the only redeeming factor was the food which was above average but couldnt make up for all the other deficiencies of Teodora#food\n',
 '#anecdotes/miscellaneous\n',
 'The food is uniformly exceptional with a very capable kitchen which will proudly whip up whatever you feel like eating whether its on the menu or not#food\n',
 'Where Gabriela personaly greets you and recommends you what to eat#service\n',
 'For those that go once and dont enjoy it all I can say is that they just dont get it#anecdotes/miscellaneous\n',
 'Not only was the food outstanding but the little perks were great#food\n',
 'It is very overpriced and not very tasty#food\n',
 'Our agreed favorite is the orrechiete with sausage and chicken (usually the waiters are kind enough to split the dish in half so you get to sample both meats)#food\n',
 'The Bagels have an outstanding taste with a terrific texture both chewy yet not gummy#food\n',
 'Nevert

## Converting text Data to DataFrame

In [6]:
fobj=open('Restrain2014.txt')
d={'text':[],'aspects':[]}
c=0
for line in fobj.readlines():
    res=line.split('#', maxsplit=1)
    if len(res[0])!=0:
        txt=re.sub(r"[^A-Za-z]+"," ",res[0]).lower()
        asp=re.sub(r"[^A-Za-z]+",",",res[1]).lower()
        d['text'].append(txt)
        d['aspects'].append(asp)
    else:
        txt=d['text'][-1]
        d['text'].append(txt)
        asp=re.sub(r"[^A-Za-z]+",",",res[1])
        d['aspects'].append(d['aspects'][-1]+asp)
        d['text'].remove(d['text'][-2])
        d['aspects'].remove(d['aspects'][-2])

In [7]:
df=pd.DataFrame(d)
df['aspects']=df['aspects'].apply(lambda x:x[:-1])
df['aspects']=df['aspects'].apply(lambda x:x.split(','))
df

Unnamed: 0,text,aspects
0,but the staff was so horrible to us,"[price, anecdotes, miscellaneous]"
1,to be completely fair the only redeeming facto...,"[food, anecdotes, miscellaneous]"
2,the food is uniformly exceptional with a very ...,"[food, service, ambience]"
3,where gabriela personaly greets you and recomm...,"[food, price]"
4,for those that go once and dont enjoy it all i...,"[food, anecdotes, miscellaneous]"
...,...,...
2642,but that is highly forgivable,"[anecdotes, miscellaneous]"
2643,from the appetizers we ate the dim sum and oth...,[food]
2644,when we arrived at pm the restaurant was pract...,"[anecdotes, miscellaneous]"
2645,each table has a pot of boiling water sunken i...,[food]


In [8]:
df['text'][2645]

'each table has a pot of boiling water sunken into its surface and you get platters of thin sliced meats various vegetables and rice and glass noodles'

## Sentence Tokenization and Word Tokenization

In [9]:
import gensim

In [10]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [11]:
aspect = []
for doc in df['text']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        aspect.append(simple_preprocess(sent))
aspect

[['but', 'the', 'staff', 'was', 'so', 'horrible', 'to', 'us'],
 ['to',
  'be',
  'completely',
  'fair',
  'the',
  'only',
  'redeeming',
  'factor',
  'was',
  'the',
  'food',
  'which',
  'was',
  'above',
  'average',
  'but',
  'couldnt',
  'make',
  'up',
  'for',
  'all',
  'the',
  'other',
  'deficiencies',
  'of',
  'teodora'],
 ['the',
  'food',
  'is',
  'uniformly',
  'exceptional',
  'with',
  'very',
  'capable',
  'kitchen',
  'which',
  'will',
  'proudly',
  'whip',
  'up',
  'whatever',
  'you',
  'feel',
  'like',
  'eating',
  'whether',
  'its',
  'on',
  'the',
  'menu',
  'or',
  'not'],
 ['where',
  'gabriela',
  'personaly',
  'greets',
  'you',
  'and',
  'recommends',
  'you',
  'what',
  'to',
  'eat'],
 ['for',
  'those',
  'that',
  'go',
  'once',
  'and',
  'dont',
  'enjoy',
  'it',
  'all',
  'can',
  'say',
  'is',
  'that',
  'they',
  'just',
  'dont',
  'get',
  'it'],
 ['not',
  'only',
  'was',
  'the',
  'food',
  'outstanding',
  'but',
  'th

In [12]:
test_text=df.text.apply(gensim.utils.simple_preprocess)
test_text

0            [but, the, staff, was, so, horrible, to, us]
1       [to, be, completely, fair, the, only, redeemin...
2       [the, food, is, uniformly, exceptional, with, ...
3       [where, gabriela, personaly, greets, you, and,...
4       [for, those, that, go, once, and, dont, enjoy,...
                              ...                        
2642                  [but, that, is, highly, forgivable]
2643    [from, the, appetizers, we, ate, the, dim, sum...
2644    [when, we, arrived, at, pm, the, restaurant, w...
2645    [each, table, has, pot, of, boiling, water, su...
2646      [am, going, to, the, mid, town, location, next]
Name: text, Length: 2647, dtype: object

## Vectorization

In [14]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=1
)

In [15]:
model.build_vocab(test_text)

In [16]:
model.epochs

5

In [17]:
model.train(test_text, total_examples=model.corpus_count, epochs=model.epochs)

(125947, 169600)

In [18]:
len(model.wv.index_to_key)

4079

In [19]:
model.wv.most_similar('food')

[('the', 0.9999086260795593),
 ('and', 0.9998703598976135),
 ('are', 0.9998617768287659),
 ('an', 0.9998600482940674),
 ('with', 0.9998466968536377),
 ('of', 0.999843418598175),
 ('in', 0.999842643737793),
 ('very', 0.9998416304588318),
 ('who', 0.9998390078544617),
 ('for', 0.9998382925987244)]

In [20]:
def document_vector(doc):
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

In [21]:
document_vector(df['text'].values[0])

array([-0.36160696,  0.5314048 ,  0.26274228, -0.11244705,  0.21805914,
       -0.9324635 ,  0.41429636,  1.1901984 , -0.4476379 , -0.40333408,
       -0.22556183, -1.0025945 , -0.13970987,  0.34207508,  0.3105646 ,
       -0.41513583,  0.3681407 , -0.7274432 , -0.38276562, -1.3963569 ,
        0.45808578,  0.33757195,  0.8485298 , -0.24808286, -0.15240002,
        0.04876441, -0.2327685 , -0.17892344, -0.6633315 ,  0.10455442,
        0.5784243 ,  0.00877731,  0.28554347, -0.8248194 , -0.22548223,
        0.64533925,  0.26828626, -0.42731008, -0.21844049, -1.191736  ,
        0.05945011, -0.73263335, -0.25783983,  0.10110155,  0.5363096 ,
       -0.22567295, -0.48358628, -0.08117303,  0.09663084,  0.40850174,
        0.29241624, -0.55260843, -0.21894196, -0.12690507, -0.37691048,
       -0.06191508,  0.41477555, -0.08681939, -0.75781566,  0.0293078 ,
        0.17679223,  0.16877861,  0.26889628, -0.0915066 , -0.41236764,
        0.7413683 ,  0.22382069,  0.7424465 , -1.0207762 ,  0.85

In [22]:
from tqdm import tqdm

In [23]:
X = []
for doc in tqdm(df['text'].values):
    X.append(document_vector(doc))

100%|████████████████████████████████████████████████████████████████████████████| 2647/2647 [00:00<00:00, 6677.16it/s]


In [24]:
X = np.array(X)

In [26]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

### Onehot Encoding Multilabel classes

In [27]:
multilabel=MultiLabelBinarizer()

In [28]:
y=multilabel.fit_transform(df['aspects'])

In [29]:
df_y=pd.DataFrame(y,columns=multilabel.classes_)

In [30]:
df_y

Unnamed: 0,Unnamed: 1,NAME,ambience,anecdotes,food,miscellaneou,miscellaneous,price,service,way
0,0,0,0,1,0,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0,0
2,0,0,1,0,1,0,0,0,1,0
3,0,0,0,0,1,0,0,1,0,0
4,0,0,0,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2642,0,0,0,1,0,0,1,0,0,0
2643,0,0,0,0,1,0,0,0,0,0
2644,0,0,0,1,0,0,1,0,0,0
2645,0,0,0,0,1,0,0,0,0,0


In [31]:
multilabel.classes_

array(['', 'NAME', 'ambience', 'anecdotes', 'food', 'miscellaneou',
       'miscellaneous', 'price', 'service', 'way'], dtype=object)

## splitting data into train and test

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [34]:
X_train.shape

(2117, 100)

In [35]:
X_test.shape

(530, 100)

## Training Multilabel Data using SVC and OneVsRestClassifier

In [36]:
svc=LinearSVC(penalty='l2')
clf=OneVsRestClassifier(svc)
clf.fit(X_train,y_train)

In [56]:
y_pred=clf.predict(X_test)

In [57]:
y_pred[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Importing Test Data

In [37]:
fobj1=open('Restest2014.txt')
d={'text':[]}
c=0
for line in fobj1.readlines():
    res=line.split('#', maxsplit=1)
    if len(res[0])!=0:
        txt=res[0].lower()
        d['text'].append(txt)
d

{'text': ['the bread is top notch as well',
  'i have to say they have one of the fastest delivery times in the city',
  'food is always fresh and hot- ready to eat!',
  'did i mention that the coffee is outstanding?',
  'certainly not the best sushi in new york however it is always fresh and the place is very clean sterile',
  'i trust the people at go sushi it never disappoints',
  'straight-forward no surprises very decent japanese food',
  'best spicy tuna roll great asian salad',
  'try the rose roll (not on menu)',
  'i love the drinks esp lychee martini and the food is also very good',
  'in fact this was not a nicoise salad and was barely eatable',
  'while theres a decent menu it shouldnt take ten minutes to get your drinks and 45 for a dessert pizza',
  'once we sailed the top-notch food and live entertainment sold us on a unforgettable evening',
  'our waiter was horrible; so rude and disinterested',
  'the sangrias - watered down',
  'menu - uneventful small',
  'anytime an

In [38]:
df_test=pd.DataFrame(d)
df_test

Unnamed: 0,text
0,the bread is top notch as well
1,i have to say they have one of the fastest del...
2,food is always fresh and hot- ready to eat!
3,did i mention that the coffee is outstanding?
4,certainly not the best sushi in new york howev...
...,...
795,anyway the owner was fake
796,owner is pleasant and entertaining
797,i have never in my life sent back food before ...
798,although the restaurant itself is nice i prefe...


In [39]:
X_test[1]

array([-0.23141088,  0.3421176 ,  0.16562635, -0.06768682,  0.14078602,
       -0.5964318 ,  0.26650944,  0.7618366 , -0.28442845, -0.25668037,
       -0.1446979 , -0.6406588 , -0.09054816,  0.21612708,  0.1970188 ,
       -0.26435032,  0.23827092, -0.46868005, -0.24603336, -0.8894189 ,
        0.292004  ,  0.21688439,  0.54461503, -0.158009  , -0.0976833 ,
        0.0315368 , -0.15157539, -0.1186325 , -0.4224369 ,  0.06518491,
        0.3731261 ,  0.00532021,  0.1807126 , -0.52522916, -0.14379434,
        0.40988487,  0.1682568 , -0.27136728, -0.13705392, -0.75986254,
        0.03758232, -0.47187996, -0.16701208,  0.06310588,  0.33415666,
       -0.14536573, -0.30407625, -0.0545873 ,  0.06213973,  0.2599572 ,
        0.18738092, -0.35491168, -0.13946825, -0.07833893, -0.2409206 ,
       -0.03306282,  0.2693216 , -0.05467593, -0.48677525,  0.01790345,
        0.11237085,  0.10723128,  0.17047459, -0.06202318, -0.26727024,
        0.47350815,  0.14239483,  0.4736662 , -0.6544519 ,  0.54

# Prediction on Test Data

In [40]:
X_test = np.array(X_test)

In [41]:
y_pred=clf.predict(X_test)

In [42]:
y_pred

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [64]:
y_pred[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [65]:
y_test

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Training Data using DecisionTreeClassifier and OVR

In [43]:
from sklearn.tree import DecisionTreeClassifier

In [44]:
dt=DecisionTreeClassifier()
os=OneVsRestClassifier(dt)
os.fit(X_train,y_train)

# Training Data Using LogisticRegression and OVR

In [45]:
lr=LogisticRegression()
lr1=OneVsRestClassifier(lr)
lr1.fit(X_train,y_train)

In [46]:
y_pred_dt=os.predict(X_test)

In [47]:
y_pred_dt[0]

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

In [48]:
y_test[0]

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [None]:
#Testing

In [79]:
y1=os.predict(X_test[0].reshape(1,-1))

In [80]:
y1

array([[0, 0, 0, 0, 1, 0, 0, 0, 1, 0]])

## Original Y_test

In [51]:
multilabel.inverse_transform(y_test)

[('food',),
 ('anecdotes', 'miscellaneous'),
 ('ambience',),
 ('anecdotes', 'miscellaneous'),
 ('food',),
 ('anecdotes', 'miscellaneous', 'service'),
 ('anecdotes', 'miscellaneous'),
 ('food',),
 ('price',),
 ('anecdotes', 'miscellaneous'),
 ('anecdotes', 'miscellaneous'),
 ('service',),
 ('anecdotes', 'miscellaneous'),
 ('anecdotes', 'miscellaneous'),
 ('service',),
 ('food',),
 ('food',),
 ('anecdotes', 'miscellaneous'),
 ('service',),
 ('service',),
 ('food', 'price'),
 ('anecdotes', 'miscellaneous'),
 ('ambience',),
 ('food', 'service'),
 ('service',),
 ('food',),
 ('anecdotes', 'miscellaneous'),
 ('service',),
 ('food',),
 ('food', 'price'),
 ('food', 'service'),
 ('food', 'service'),
 ('food',),
 ('price',),
 ('service',),
 ('food',),
 ('anecdotes', 'miscellaneous'),
 ('food',),
 ('anecdotes', 'miscellaneous'),
 ('food',),
 ('anecdotes', 'miscellaneous'),
 ('anecdotes', 'miscellaneous'),
 ('anecdotes', 'miscellaneous'),
 ('anecdotes', 'miscellaneous'),
 ('food',),
 ('service',),


## Aspect Prediction Using DecisionTreeclassifier on Validation data

In [49]:
multilabel.inverse_transform(os.predict(X_test))

[('miscellaneous',),
 ('ambience', 'anecdotes', 'food', 'miscellaneous'),
 ('food', 'service'),
 ('anecdotes', 'food'),
 ('service',),
 ('ambience', 'anecdotes', 'miscellaneous'),
 (),
 ('ambience', 'anecdotes', 'miscellaneous', 'service'),
 ('anecdotes', 'miscellaneous', 'service'),
 (),
 ('anecdotes', 'miscellaneous'),
 ('ambience', 'anecdotes'),
 ('anecdotes', 'miscellaneous', 'service'),
 (),
 ('food', 'miscellaneous'),
 ('service',),
 ('anecdotes', 'food', 'miscellaneous'),
 ('food',),
 ('ambience', 'anecdotes', 'miscellaneous'),
 ('anecdotes', 'food', 'miscellaneous'),
 (),
 ('miscellaneous',),
 ('anecdotes',),
 ('anecdotes', 'miscellaneous'),
 ('anecdotes', 'miscellaneous'),
 ('anecdotes',),
 ('ambience', 'food', 'miscellaneous', 'price'),
 ('anecdotes',),
 (),
 ('food', 'service'),
 (),
 ('anecdotes', 'food'),
 ('anecdotes', 'food', 'miscellaneous', 'price'),
 ('anecdotes', 'food', 'price'),
 (),
 ('miscellaneous',),
 ('food', 'miscellaneous'),
 (),
 (),
 ('anecdotes', 'food'),

In [81]:
multilabel.inverse_transform(y1)

[('food', 'service')]