# Analysis Utils

In [3]:
def return_oov_tokens(cleaned_reviews, lang_model=None):
    """Returns a set of out-of-vocabulary tokens.

    Use it to check which words in our review content are flagged as
    out-of-vocabulary (``token.is_oov``) by the spaCy pipeline.
    Whitespace-only tokens are ignored.

    Args:
        cleaned_reviews: iterable of review strings.
        lang_model: pipeline object used to process the reviews; it must
            offer a ``pipe(texts)`` method yielding one document per text.
            When omitted, the notebook-level ``loaded_lang_model`` is used,
            so that variable must exist by the time this function is called.

    Returns:
        A set with the text of every distinct out-of-vocabulary token.
    """
    if lang_model is None:
        # Looked up at call time, so the model may be loaded in a later cell
        # than the one defining this function.
        lang_model = loaded_lang_model

    oov_tokens = set()

    # pipe() streams the texts through the pipeline in batches instead of
    # paying the per-call overhead once for every single review.
    for review_doc in lang_model.pipe(cleaned_reviews):
        for token in review_doc:
            if token.is_oov and not token.is_space:
                oov_tokens.add(token.text)
    return oov_tokens

# Running Utils

In [4]:
import spacy
import pandas as pd
import os

def remove_punctuation(text):
    """Lower-cases ``text`` and removes punctuation from it.

    Line breaks and tabs are first turned into a single space, so two words
    separated only by such a character are not fused into one. After that,
    every character is dropped unless it is alphanumeric, whitespace, or one
    of the explicitly kept symbols ('£' and '%').

    Args:
        text: the string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    acceptable = ('£', '%')
    # Collapse a Windows line ending first so it yields one space, not two.
    text = text.replace('\r\n', ' ')
    for separator in ('\n', '\t', '\r'):
        text = text.replace(separator, ' ')
    clean_chars = [
        char.lower()
        for char in text
        if char.isalnum() or char.isspace() or char in acceptable
    ]
    return "".join(clean_chars)



def all_files_in_folder(current_folder_path):
    """Gets the paths of all regular files directly inside a folder.

    Sub-folders are skipped and not descended into. Each returned path is
    ``current_folder_path`` joined with the file name, so it is absolute
    only if ``current_folder_path`` is.

    Args:
        current_folder_path: path of the folder to list.

    Returns:
        A list of file paths, sorted by file name.
    """
    files_list = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # result (and everything built from it) reproducible between runs.
    for entry_name in sorted(os.listdir(current_folder_path)):
        path_to_file = os.path.join(current_folder_path, entry_name)
        if os.path.isfile(path_to_file):
            files_list.append(path_to_file)
    return files_list



def clean_all_files(files_list):
    """Reads every CSV file and strips punctuation from its 'content' column.

    Returns a list with one entry per file, in the order given; each entry
    is the list of cleaned values from that file's 'content' column.
    """
    def _cleaned_contents(csv_path):
        # Series of the raw 'content' values -> cleaned values -> plain list.
        content_column = pd.read_csv(csv_path)['content']
        return content_column.apply(remove_punctuation).tolist()

    return [_cleaned_contents(csv_path) for csv_path in files_list]




def extend_review_list(cleaned_reviews):
    """Flattens a list of lists into a single list.

    The elements of every inner list are placed, in their original order,
    into one new list, which is returned.
    """
    return [element for inner_list in cleaned_reviews for element in inner_list]




def lemmatise_and_rmv_stopwords(review):
    """Returns the lemmas of all non-stopword tokens, joined by single spaces.

    ``review`` is iterated token by token; each token must expose the
    ``is_stop`` and ``lemma_`` attributes.
    """
    kept_lemmas = []
    for token in review:
        if token.is_stop:
            continue  # stopwords contribute nothing to the output
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

# Main body

In [5]:
# Folder holding the review CSV files. Set the REVIEWS_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded location is used.
folder = os.environ.get('REVIEWS_DATA_DIR', '/home/mrfox/Desktop/bt_group_project/data')
list_of_review_files = all_files_in_folder(folder)
# clean_all_files gives one list per file; flatten them into one list of reviews.
cleaned_reviews = extend_review_list(clean_all_files(list_of_review_files))


# DATA PROFILING (notes on the data set, before flattening):
# list of 80 lists, each is one page worth of reviews
# there are almost exactly 40 reviews on each page, incl. the last one
# 80*40 = 3200, we are 8 reviews short of that number, this is probably
# due to 8 people not submitting a review, just the ratings


loaded_lang_model = spacy.load("en_core_web_lg") # use lg to be able to use word vectors


In [None]:
# Collect the distinct tokens of the cleaned reviews that return_oov_tokens
# reports as out-of-vocabulary (uses the loaded_lang_model loaded above).
oov_tokens = return_oov_tokens(cleaned_reviews)
#print(oov_tokens)