# Collection Utils

In [None]:
import os

def all_files_in_folder(current_folder_path, max_entries=4):
    """Return the paths of regular files found directly inside a folder.

    Only the first ``max_entries`` directory entries (files *and*
    sub-folders, in whatever order ``os.listdir`` yields them) are examined,
    so the result may hold fewer paths than ``max_entries`` when some of the
    examined entries are not regular files. Sub-folders are not descended
    into.

    Args:
        current_folder_path: Folder to scan.
        max_entries: Number of directory entries to examine. Defaults to 4,
            the cap this function previously hard-coded. Pass ``None`` to
            examine every entry in the folder.

    Returns:
        A list of paths, each one ``current_folder_path`` joined with the
        entry name.

    Raises:
        ValueError: If ``max_entries`` is negative.
    """
    if max_entries is not None and max_entries < 0:
        raise ValueError("max_entries must be None or a non-negative integer")

    entries = os.listdir(current_folder_path)  # names of files and folders
    if max_entries is not None:
        # The cap counts examined entries, not files found.
        entries = entries[:max_entries]

    candidates = (os.path.join(current_folder_path, name) for name in entries)
    return [path for path in candidates if os.path.isfile(path)]


# Analysis Utils

In [1]:
def return_oov_tokens(cleaned_reviews):
    """Collect the distinct out-of-vocabulary token texts found in the reviews.

    Every review is passed through the module-level ``loaded_lang_model``;
    the text of each resulting token whose ``is_oov`` flag is set is kept,
    except for whitespace tokens. Handy for spotting words in the review
    content that the language model does not recognise.

    Returns a set of token strings (empty when there are no reviews).
    """
    return {
        token.text
        for review in cleaned_reviews
        for token in loaded_lang_model(review)
        if token.is_oov and not token.is_space
    }

# Running Utils

In [19]:
import spacy

def remove_punctuation(text):
    """Strip punctuation from ``text`` and lowercase what is left.

    Newlines become a single space, while tabs and carriage returns are
    deleted outright. After that only alphanumeric characters, whitespace
    and the symbols '£' and '%' survive; each kept character is lowercased.
    """
    keep_symbols = ['£', '%']

    flattened = text.replace('\n', " ")
    flattened = flattened.replace('\t', '')
    flattened = flattened.replace('\r', '')

    kept = []
    for ch in flattened:
        # The test is made on the original character, before lowercasing.
        if ch.isalnum() or ch.isspace() or ch in keep_symbols:
            kept.append(ch.lower())
    return "".join(kept)

_STOPWORD_PARSER = None  # shared English() pipeline, built on first use


def _get_stopword_parser():
    """Return the shared ``English()`` pipeline, creating it on first use."""
    global _STOPWORD_PARSER
    if _STOPWORD_PARSER is None:
        # Built lazily so that ``English`` only has to be importable by the
        # time the first review is processed, not when this cell is defined.
        _STOPWORD_PARSER = English()
    return _STOPWORD_PARSER


def lemmatise_and_rmv_stopwords(review):
    """Return the lemmatised review content without stopwords.

    Args:
        review: Review text to process.

    Returns:
        A single string: the ``lemma_`` of every token that is neither a
        stop word nor whitespace, joined with single spaces.
    """
    # Reuse one pipeline for all reviews instead of building a fresh
    # English() object on every call.
    doc = _get_stopword_parser()(review)
    return " ".join(
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_space
    )

def prepare_txt_for_lda(text):
    """Filter out words shorter than 5 characters and return the rest.

    Args:
        text: Either a string of whitespace-separated words or an iterable
            of token strings.

    Returns:
        A list of the tokens longer than 4 characters, in their original
        order.
    """
    # Iterating a str yields single characters, none of which can pass the
    # length filter, so a string is split into words first.
    words = text.split() if isinstance(text, str) else text
    return [word for word in words if len(word) > 4]

# Main body

## Read the reviews into a list

In [20]:
import csv
from spacy.lang.en import English

folder = '/home/mrfox/Desktop/bt_group_project/data'
list_of_review_files = all_files_in_folder(folder) # return a list of all files in folder

# Read each review file and collect the text held in the first column of
# every row.
list_of_reviews = []
for file in list_of_review_files:
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for quoted fields containing newlines.
    with open(file, newline='') as csv_f:
        file_reader = csv.reader(csv_f)

        for row in file_reader:
            if not row:
                # Blank lines are returned as empty lists; row[0] would
                # raise IndexError on them.
                continue
            content = row[0][:-3] # drops the last three characters (the trailing tildes)
            list_of_reviews.append(content)


## Clean the reviews in the list for NLP analysis

In [21]:
# Run every raw review through the cleaning steps in order:
# punctuation stripping -> lemmatisation / stop-word removal -> short-token
# filter. One result per review is appended to clean_reviews.
clean_reviews = []
for review in list_of_reviews:
    no_punct_review = remove_punctuation(review)
    lemmatised_no_stop_review = lemmatise_and_rmv_stopwords(no_punct_review)
    # NOTE(review): lemmatise_and_rmv_stopwords returns one space-joined
    # string, so a str (not a token list) is passed on the next line —
    # confirm prepare_txt_for_lda handles that as intended.
    no_short_tokens_review = prepare_txt_for_lda(lemmatised_no_stop_review) # remove tokens that are too short
    clean_reviews.append(no_short_tokens_review)
    

# Topic Modelling