In [9]:
import pandas as pd
import re
from langdetect import detect, DetectorFactory

In [10]:
# Keep only English-language reviews

# Per the langdetect README, detection is non-deterministic unless a seed is set
# before the first detect() call; pinning it makes language labels repeatable across runs.
DetectorFactory.seed = 0 # Fixed seed for reproducible detection

def is_english(text):
    """
    Report whether ``text`` is detected as English.

    Parameters
    ----------
    text : object
        Value to classify. Anything that is not a non-blank ``str``
        (e.g. ``None``, ``NaN``, numbers, empty/whitespace-only strings)
        is treated as not English.

    Returns
    -------
    bool
        True when ``detect`` labels the text ``"en"``; False for unusable
        input or when detection fails.
    """
    # Missing values and blank strings carry nothing to detect; skip the detector call.
    if not isinstance(text, str) or not text.strip():
        return False

    try:
        return detect(text) == "en"
    except Exception:
        # Best-effort: a detection failure counts as "not English".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long .apply() can still be interrupted.
        return False

def filter_english(dataframe, text_col):
    """
    Return only the rows of ``dataframe`` whose ``text_col`` value is English.

    The input frame is left untouched: no helper column is written to it.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the text to classify.
    text_col : str
        Name of the column passed row by row to ``is_english``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``dataframe``, containing only
        the rows for which ``is_english`` returned True. The original index
        labels are kept.
    """
    # Build the mask as a standalone Series instead of a temporary column, so
    # the caller's frame is not mutated. astype(bool) keeps the mask boolean
    # even when the frame is empty (apply would otherwise yield a non-bool dtype).
    english_mask = dataframe[text_col].apply(is_english).astype(bool)

    # .copy() hands back an independent frame, so later column assignments on
    # the result do not trigger SettingWithCopyWarning.
    return dataframe.loc[english_mask].copy()

In [7]:
# Text Cleaning and Regular Expression
def regex(text):
    """
    Strip punctuation from ``text`` and normalise its whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        ``text`` with every character that is neither a word character nor
        whitespace removed, each run of whitespace (spaces, tabs, newlines)
        collapsed to a single space, and leading/trailing whitespace trimmed.
        Underscores are kept because ``\\w`` matches them.
    """
    text = re.sub(r'[^\w\s]', "", text) # Drop anything that is not a word character or whitespace
    # r'\s+' matches a whole run of whitespace. The previous r'[\s+]' was a
    # character class (one whitespace char or a literal '+'), so consecutive
    # spaces were never collapsed.
    text = re.sub(r'\s+', " ", text)

    return text.strip()
    
def clean_text(dataframe, text_col):
    """
    Load (if needed), de-duplicate and clean the text column of a dataset.

    Parameters
    ----------
    dataframe : str, path-like, file-like or pandas.DataFrame
        Either something ``pd.read_csv`` can read (the original behaviour) or
        an already-loaded DataFrame, which is used as-is and not modified.
    text_col : str
        Name of the column holding the text to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with fully duplicated rows removed, rows whose
        ``text_col`` is missing removed, and ``text_col`` passed through
        ``regex``. The original index labels are kept.
    """
    # Accept a ready-made frame as well as a CSV source; the parameter name
    # suggests a DataFrame, but previously only a path worked.
    if isinstance(dataframe, pd.DataFrame):
        df = dataframe
    else:
        df = pd.read_csv(dataframe)

    # drop_duplicates() removes repeated rows. The previous
    # df.drop(df.duplicated) passed a bound method as labels and raised.
    df = df.drop_duplicates()
    df = df.dropna(subset=[text_col]) # Remove rows with missing values in text column

    # Cast to str so stray numeric cells do not make re.sub raise TypeError.
    # assign() builds a new frame instead of writing into a slice.
    cleaned_text = df[text_col].astype(str).apply(regex)
    df = df.assign(**{text_col: cleaned_text})

    return df

In [None]:
# Tokenization

In [None]:
# Stop Word Removal

In [None]:
# Lemmatization

In [None]:
# Vectorization