# Preprocessing and classifier (Kamran Ashraf)

In [2]:
# importing all necessary libraries
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
import string, os, re

In [3]:
# Path for file reading and aspects files.
dataDirPath = os.path.join(os.getcwd() ,'data')
dataFilesPath = os.path.join(dataDirPath , 'dataDir')

aspectsDirPath = os.path.join(os.getcwd() , 'aspectDir')
preprocessFilesPath = os.path.join(aspectsDirPath , 'preprocessDataDir')
unprocessFilesPath = os.path.join(aspectsDirPath , 'unprocessDataDir')

# creating aspectDir and its subfolders preprocessFilesPath, unprocessFilesPath
try:
    os.mkdir('aspectDir')
    os.makedirs(preprocessFilesPath)
    os.makedirs(unprocessFilesPath)
except:
    print('Either dir already exits or somthing goes wrong')
    
# No of comments file
NO_OF_FILES = len(os.listdir(dataFilesPath))
print('No of files found :: ' ,NO_OF_FILES)

No of files found ::  955


In [4]:
# Manually Defined Aspects and their frequently occurring keywords

aspects = ['Value', 'Location', 'Service', 'Meal', 'Facility', 'Room', 'Quality', 'Staff', 'Surrounding', 'Others']

aspectsKeywords = [['Value', 'Price', 'Amount', 'Rate', 'Cheap', 'Worth', 'Low', 'Money', 'Economical', 'Reasonable', 'Fee', 'Expensive'],
 ['Location', 'Railway', 'View', 'Station', 'Airport', 'Distance', 'Far', 'Close', 'Convenient', 'Train', 'Metro'],
 ['Service', 'Desk', 'Check-in', 'Checkout', 'Reliable', 'Fast', 'Convenient', 'Pick-up'],
 ['Meal', 'Drink', 'Breakfast', 'Spicy', 'Food', 'Tasty', 'Tea', 'Buffet', 'Bar', 'Restaurant', 'Dinner', 'Lunch', 'Brunch', 'Delicious'],
 ['Facility', 'Pool', 'Spa', 'Wifi', 'Gymnasium', 'Gym', 'Internet', 'Ample', 'Parking', 'Wireless', 'Broken'],
 ['Room', 'Bed', 'Dirty', 'Clean', 'Toilet', 'Bathroom', 'Shower', 'Dryer', 'Fridge', 'View'],
 ['Quality', 'Satisfactory', 'Ample', 'Hygienic', 'Proper', 'Ambience', 'Odour', 'Smell'],
 ['Staff', 'Good', 'Polite', 'Helpful', 'Friendly', 'Reliable', 'Quick', 'Reception', 'Manager'],
 ['Surrounding', 'Landmark', 'Monument', 'Temple', 'Mosque', 'Church', 'Restaurant', 'Diner', 'Mall', 'Market'],
 ['Paharganj', 'Delhi', 'Stay', 'Visit', 'Back', 'Driver', 'Booked', 'Agent', 'Trip']]

# creating empty aspects files
for aspect in aspects:
    try:
        with open(os.path.join(preprocessFilesPath, f'{aspect}.txt'), 'w', encoding="utf-8") as file:
            file.write('')
        with open(os.path.join(unprocessFilesPath, f'{aspect}.txt'), 'w', encoding="utf-8") as file:
            file.write('')
    except:
        print("Could not write file. Try again")

In [5]:
'''
    Classifier :: uses wup_similarity 
    Wu-Palmer Similarity, which is a scoring method based on how similar the word senses are and where
    the Synsets occur relative to each other in the hypernym tree.
'''
    
def classifier(processedTokenizedSentance):
    aspectsScore = []
    for i in range(0,10):
        aspectsScore.append(0)
#   Calculate scores of aspects for the input sentance
    for word in processedTokenizedSentance:
        for i in range(0,10):
            score = calWordScore(word, i)
            aspectsScore[i] = aspectsScore[i] + score
#   Pick the aspects with max score and returns its id
    maxScoreIndx = 0
    for i in range(1, 10):
        if aspectsScore[maxScoreIndx] < aspectsScore[i]:
            maxScoreIndx = i 
    return maxScoreIndx

'''
    Calculate score after comparing word with each word in aspectsKeywords[i] ( 'i' passed as parameter )
    if similarity is 'none' it returns 0 as score.
'''
def calWordScore(word, i):
    score = 0
    for listWord in aspectsKeywords[i]:
        try:
            a = wordnet.synsets(listWord)[0]
            b = wordnet.synsets(word)[0]
        except:
            # If error occurs (word spelling is incorrect or return list is empty)
            continue
        s = b.wup_similarity(a)
        p = a.wup_similarity(b)
        similarityScore = (s or p)
    #   in case s = none or p = none , it takes the one with score
        if similarityScore:
            score = score + similarityScore
    #   in case score = none  
    if not score:
        score = 0
    return score/len(aspectsKeywords[i])

In [6]:
'''
    Preprocessing :: Stopwords, pos taging, adjective removal
'''
stopWords = set(stopwords.words('english'))

def preprocessComment(sentance):
#     removes stopwords 
    sentance = [word for word in word_tokenize(sentance) if word not in stopWords]
#     removes punctuations 
    sentance = [word for word in sentance if word not in string.punctuation]
#     pos taging
    sentance = pos_tag(sentance)
#     removes general adjective 
    return [word for (word, tag) in sentance if tag != 'JJ']

In [7]:
'''
    Read files and for each file it preprocess comment( use preprocessComment func ),
    classify it (use classifier func) and finally write it in the correct aspect file 
'''
for review in range(1, NO_OF_FILES+1):
    with open(os.path.join(dataFilesPath, f'file{review}.txt'), 'r', encoding="utf-8") as file:
        # Read the file and lowercase the sentances at the same time and sentance tokenize it
        comment = sent_tokenize(file.read().lower())
        for sentance in comment:
            processedSentance = preprocessComment(sentance)
            aspectId = classifier(processedSentance)
            try:
                with open(os.path.join(preprocessFilesPath, f'{aspects[aspectId]}.txt'), 'a', encoding="utf-8") as file:
                    file.writelines(" ".join(processedSentance)+"\n")
                with open(os.path.join(unprocessFilesPath, f'{aspects[aspectId]}.txt'), 'a', encoding="utf-8") as file:
                    file.writelines(sentance+"\n")
                if(review%25 == 0):
                    print(review, end =" ")        #Show no. of reviews processed
            except:
                print("Someting goes wrong while writing in file")
print("\nPROCESS COMPLETE")

25 25 25 25 25 50 50 50 50 75 75 75 75 75 75 100 100 125 125 125 125 125 125 150 150 150 150 150 150 150 175 175 175 175 175 175 175 175 200 200 200 200 200 225 225 225 225 225 225 225 250 250 250 250 250 250 250 275 275 275 275 275 275 275 300 300 300 300 300 300 300 325 325 325 325 350 350 350 350 350 375 375 375 375 375 375 400 400 400 400 400 400 400 425 425 425 425 425 425 425 450 450 450 450 475 475 475 475 475 475 475 475 475 500 500 500 500 500 525 525 525 525 525 525 525 550 550 550 550 550 550 550 550 575 575 575 575 575 575 575 600 600 600 600 600 625 625 625 625 625 625 650 650 650 650 650 650 675 675 675 675 700 700 700 700 725 725 725 725 725 750 750 750 750 750 750 750 750 775 775 775 775 775 775 775 800 800 800 800 800 800 800 825 825 825 825 825 825 825 825 825 825 850 850 850 850 850 850 850 850 875 875 875 875 875 900 900 900 900 900 925 925 925 925 925 925 950 950 
PROCESS COMPLETE


In [8]:
# listWord = 'travel'#'Bazaar'
# word = 'trip'
# a = wordnet.synsets(listWord)[0]
# b = wordnet.synsets(word)[0]
# print('a',a,' b',b)
# s = b.wup_similarity(a)
# p = a.wup_similarity(b)
# print(s or p)

In [9]:
# # Manually Defined Aspects and their frequently occurring keywords

# aspects = ['Value', 'Location', 'Service', 'Meal', 'Facility', 'Room', 'Quality', 'Staff', 'Surrounding', 'Others']

# aspectsKeywords = [['Value', 'Price', 'Amount', 'Rate', 'Cheap', 'Worth', 'Low', 'Money', 'Economical', 'Reasonable', 'Fee', 'Expensive'],
#  ['Location', 'Railway', 'View', 'Station', 'Airport', 'Distance', 'Far', 'Close', 'Convenient', 'Train', 'Metro', 'spot', 'Bazaar'],
#  ['Service', 'Desk', 'Check-in', 'Checkout', 'Reliable', 'Fast', 'Convenient', 'Pick-up'],
#  ['Meal', 'Drink', 'Breakfast', 'Spicy', 'Food', 'Tasty', 'Tea', 'Buffet', 'Bar', 'Restaurant', 'Dinner', 'Lunch', 'Brunch', 'Delicious'],
#  ['Facility', 'Pool', 'Spa', 'Wifi', 'Gymnasium', 'Gym', 'Internet', 'Ample', 'Parking', 'Wireless', 'Broken'],
#  ['Room', 'Bed', 'Dirty', 'Clean', 'Toilet', 'Bathroom', 'Shower', 'Dryer', 'Fridge', 'View'],
#  ['Quality', 'Satisfactory', 'Ample', 'Hygienic', 'Proper', 'Ambience', 'Odour', 'Smell'],
#  ['Staff', 'Good', 'Polite', 'Helpful', 'Friendly', 'Reliable', 'Quick', 'reception'],
#  ['Surrounding', 'Landmark', 'Monument', 'Temple', 'Mosque', 'Church', 'Restaurant', 'Diner', 'Mall', 'Market', 'spot'],
#  ['Paharganj', 'Delhi', 'Stay', 'Visit', 'Back', 'Driver', 'Booked', 'Agent', 'Trip']]
# #  'India', 'Taxis', 'Drive', 'Foreigner', 'Landed', 
# # creating empty aspects files
# for aspect in aspects:
#     try:
#         with open(os.path.join(preprocessFilesPath, f'{aspect}.txt'), 'w', encoding="utf-8") as file:
#             file.write('')
#         with open(os.path.join(unprocessFilesPath, f'{aspect}.txt'), 'w', encoding="utf-8") as file:
#             file.write('')
#     except:
#         print("Could not write file. Try again")