This script restores punctuation in the text of the 'Review' column of a CSV file and writes the result to a new 'Punctuated Review' column. Because of limitations in the model, any review that the model fails to process is instead handled by a rule-based fallback that uses POS tagging.

The model used for this is Silero's text-enhancement model, available at https://github.com/snakers4/silero-models

In [None]:
import os
import yaml
import pandas as pd
import torch
from torch import package
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
import re

# Contraction suffixes that tokenizers which split contractions ("it's" ->
# "it" + "'s") emit as separate tokens. Their leading apostrophe is part of
# the contraction, not an opening quote that needs closing.
_CONTRACTION_SUFFIXES = frozenset({"'s", "'re", "'ve", "'ll", "'d", "'m"})
_SENTENCE_END_MARKS = ('.', '!', '?')
_REPEATED_END_MARK = re.compile(r'([.!?])\1+')


def _punctuate_sentence(tagged_words):
    """Apply the punctuation and case rules to one sentence.

    Args:
        tagged_words: Non-empty list of ``(word, pos_tag)`` pairs for a
            single sentence.

    Returns:
        The processed words joined by single spaces.
    """
    pieces = []
    last_index = len(tagged_words) - 1

    for i, (word, pos) in enumerate(tagged_words):
        word = str(word)
        is_contraction_suffix = word.lower() in _CONTRACTION_SUFFIXES

        if i == last_index:
            # Close the sentence, keeping an existing terminal mark instead
            # of producing combinations such as "!.".
            if not word.endswith(_SENTENCE_END_MARKS):
                word += '.'
            # Collapse runs of the same mark ("!!" -> "!").
            word = _REPEATED_END_MARK.sub(r'\1', word)
        elif pos.startswith('JJ'):
            # Comma after adjectives.
            word += ','
        elif pos == 'CC' and word != ',':
            # Comma *before* a coordinating conjunction: attach it to the
            # previous word unless that word already ends a clause.
            if pieces and not pieces[-1].endswith((',', ';', ':')):
                pieces[-1] += ','

        # Capitalize the first word of the sentence.
        if i == 0:
            word = word.capitalize()

        # Close an opening parenthesis or curly quote left open on this word.
        if word.startswith('(') and not word.endswith(')'):
            word += ')'
        elif word.startswith('“') and not word.endswith('”'):
            word += '”'

        # Close straight quotes, but leave contraction suffixes alone.
        if not is_contraction_suffix:
            if word.startswith("'") and not word.endswith("'"):
                word += "'"
            elif word.startswith('"') and not word.endswith('"'):
                word += '"'

        pieces.append(word)

    return ' '.join(pieces)


def correct_review(review):
    """Restore basic punctuation and capitalization using POS-tag rules.

    The review is split into sentences with ``sent_tokenize``; each sentence
    is POS-tagged with TextBlob and processed on its own, so every sentence
    starts with a capital letter and ends with a terminal mark.

    Args:
        review: The review text. Anything that is not a non-blank string
            (``None``, NaN, ``''``, whitespace) is treated as empty.

    Returns:
        The corrected review, or ``''`` when there is nothing to process.
    """
    # Non-string values would make the tokenizer raise; there is nothing to
    # punctuate in them anyway.
    if not isinstance(review, str) or not review.strip():
        return ''

    punctuated_sentences = []
    for sentence in sent_tokenize(review):
        tagged_words = list(TextBlob(sentence).tags)
        # Skip sentences for which the tagger returned no words.
        if tagged_words:
            punctuated_sentences.append(_punctuate_sentence(tagged_words))

    return ' '.join(punctuated_sentences)

# Read the reviews to process (the file name is hard-coded; point it at your own CSV)
data = pd.read_csv('Homedepot_Review data.csv')

# Lowercase every review before punctuation and case are restored
data['Review'] = data['Review'].str.lower()

# Fetch the current Silero model index
torch.hub.download_url_to_file(
    'https://raw.githubusercontent.com/snakers4/silero-models/master/models.yml',
    'latest_silero_models.yml',
    progress=False,
)

with open('latest_silero_models.yml', 'r') as yaml_file:
    models = yaml.safe_load(yaml_file)

# Package URL of the 'latest' entry under 'te_models'
model_conf = models.get('te_models').get('latest')
model_url = model_conf.get('package')

# Keep the package in a local directory so later runs can skip the download
model_dir = "downloaded_model"
model_path = os.path.join(model_dir, os.path.basename(model_url))
os.makedirs(model_dir, exist_ok=True)

if not os.path.isfile(model_path):
    torch.hub.download_url_to_file(model_url, model_path, progress=True)

# Load the pickled model object out of the torch package
imp = package.PackageImporter(model_path)
model = imp.load_pickle("te_model", "model")
example_texts = model.examples

def apply_te(text, lan='en'):
    """Return ``text`` as rewritten by the loaded model's ``enhance_text``.

    Args:
        text: The text to pass to the model.
        lan: Language code forwarded to the model (defaults to ``'en'``).
    """
    enhanced = model.enhance_text(text, lan)
    return enhanced

def process_row(row):
    """Punctuate one review, falling back to the rule-based corrector.

    Args:
        row: An ``(index, review)`` pair.

    Returns:
        ``(index, punctuated_review)``. Reviews that are not non-blank
        strings yield ``''``.
    """
    idx, review = row

    # A missing review (e.g. the NaN pandas uses for an empty CSV cell) would
    # fail in the model and then again in the fallback, and that second
    # exception would abort the whole pool. There is nothing to punctuate, so
    # return an empty result instead.
    if not isinstance(review, str) or not review.strip():
        return idx, ''

    try:
        punctuated_review = apply_te(review, lan='en')
    except Exception as e:
        # Deliberately broad: any model failure should fall back to the
        # rule-based corrector rather than stop the run.
        print(f"Error processing row {idx} with the model: {e}")
        punctuated_review = correct_review(review)
    return idx, punctuated_review

# Create a new column "Punctuated Review"
data['Punctuated Review'] = ''

# Define the number of processes to use
num_processes = cpu_count()

# Only the parent process may create the pool. Under the "spawn" start method
# child processes re-import this module, and without this guard each child
# would try to start its own pool.
if __name__ == "__main__":
    # Process and punctuate reviews using multiprocessing and tqdm
    with Pool(num_processes) as pool:
        rows = data[['Review']].itertuples(index=True, name=None)
        results = list(tqdm(pool.imap(process_row, rows),
                            total=len(data), desc="Processing"))

    # Update the DataFrame with the punctuated reviews
    for idx, punctuated_review in results:
        data.at[idx, 'Punctuated Review'] = punctuated_review

    # Save the updated DataFrame with the punctuated reviews
    data.to_csv('processed_data.csv', index=False)  # Update with desired output file name
