In [1]:
import pandas as pd
import spacy

In [27]:
# Load data
df = pd.read_csv('../data/new_tokenized_data.csv')
print("code is running")

code is running


In [28]:
# Print the column names
print("Column Names in the Dataset:::::::")
print(df.columns)

Column Names in the Dataset:::::::
Index(['Area_tokens', 'Popular Facilities_tokens', 'Description_tokens',
       'Facilities_tokens', 'Surroundings_tokens', 'Bathroom Features_tokens',
       'Bedroom Features_tokens', 'Outdoors_tokens', 'Room Amenities_tokens',
       'Activities_tokens', 'Living Area_tokens', 'Media & Technology_tokens',
       'Food & Drink_tokens', 'Parking_tokens', 'Reception services_tokens',
       'Entertainment and family services_tokens', 'Cleaning services_tokens',
       'Safety & security_tokens', 'General_tokens', 'Accessibility_tokens',
       'Wellness_tokens', 'Languages spoken_tokens',
       'Restaurants & cafes_tokens', 'Top attractions_tokens',
       'Natural beauty_tokens', 'Beaches in the neighbourhood_tokens',
       'Public transport_tokens', 'Closest airports_tokens',
       'Cancellation/Prepayment_tokens', 'Children and Bed Policies_tokens',
       'Age Restriction_tokens', 'Pets_tokens',
       'Accepted Payment Methods_tokens', 'Reviews

In [29]:
# Steps for Text Normalization
# Lemmatization: Converts each token into its base or dictionary form.
# Case Normalization: Typically converts all text to lowercase to standardize the data.
# Removing Stop Words: Filters out common words that might not be useful in the analysis.

In [30]:
#Lemmatization

In [31]:
import os
import ast  # To safely evaluate strings that look like lists

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

def parse_list_from_string(list_string):
    try:
        # Safely evaluate strings that represent lists
        return ast.literal_eval(list_string)
    except:
        # Return the original string if it's not a list-like string
        return list_string.split() if isinstance(list_string, str) else list_string

def lemmatize_tokens(tokens):
    """
    Lemmatize a list of tokens using spaCy.
    Args:
    tokens (list of str): The tokens to lemmatize.
    Returns:
    list of str: The lemmatized tokens.
    """
    # Convert list of tokens back to text
    text = ' '.join(tokens)
    
    # Define the maximum chunk size
    max_chunk_size = 1000000  # Maximum number of characters per chunk
    
    # Split the text into manageable chunks
    chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    
    lemmatized_tokens = []
    try:
        for chunk in chunks:
            doc = nlp(chunk)
            # Extract lemmas for each token in the chunk
            lemmatized_tokens.extend([token.lemma_ for token in doc if not token.is_punct and not token.is_space])
    except Exception as e:
        print(f"An error occurred during lemmatization: {e}")
    
    return lemmatized_tokens

def apply_lemmatization(data, token_columns):
    """
    Apply lemmatization to tokenized columns of the dataframe and return only lemmatized columns.
    Args:
    data (DataFrame): The DataFrame containing the tokenized data.
    token_columns (list): A list of column names containing tokenized data.
    Returns:
    DataFrame: A new DataFrame containing only the lemmatized columns.
    """
    lemmatized_data = pd.DataFrame()
    for column in token_columns:
        if column in data.columns:
            # Ensure that the tokens are parsed as lists
            data[column] = data[column].apply(parse_list_from_string)
            lemmatized_data[column + '_lemmatized'] = data[column].apply(lemmatize_tokens)
    return lemmatized_data

# Load your data (ensure this path matches where your data is stored)
# df = pd.read_csv('../data/your_tokenized_data.csv')

# Identify tokenized columns (assuming these are named with a '_tokens' suffix)
token_columns = [col for col in df.columns if '_tokens' in col]

# Apply lemmatization and retrieve only lemmatized columns
lemmatized_data = apply_lemmatization(df, token_columns)

# Save the lemmatized data
save_path = '../data/lemmatized_tokenized_data.csv'
directory = os.path.dirname(save_path)
if not os.path.exists(directory):
    os.makedirs(directory)

try:
    lemmatized_data.to_csv(save_path, index=False)
    print(f"Lemmatized data successfully saved to {save_path}")
except Exception as e:
    print(f"An error occurred while saving the lemmatized data: {e}")

# Preview the lemmatized data
print("Columns lemmatized and saved:", lemmatized_data.columns)
print(lemmatized_data.head())


Lemmatized data successfully saved to ../data/lemmatized_tokenized_data.csv
Columns lemmatized and saved: Index(['Area_tokens_lemmatized', 'Popular Facilities_tokens_lemmatized',
       'Description_tokens_lemmatized', 'Facilities_tokens_lemmatized',
       'Surroundings_tokens_lemmatized', 'Bathroom Features_tokens_lemmatized',
       'Bedroom Features_tokens_lemmatized', 'Outdoors_tokens_lemmatized',
       'Room Amenities_tokens_lemmatized', 'Activities_tokens_lemmatized',
       'Living Area_tokens_lemmatized', 'Media & Technology_tokens_lemmatized',
       'Food & Drink_tokens_lemmatized', 'Parking_tokens_lemmatized',
       'Reception services_tokens_lemmatized',
       'Entertainment and family services_tokens_lemmatized',
       'Cleaning services_tokens_lemmatized',
       'Safety & security_tokens_lemmatized', 'General_tokens_lemmatized',
       'Accessibility_tokens_lemmatized', 'Wellness_tokens_lemmatized',
       'Languages spoken_tokens_lemmatized',
       'Restaurants & 

In [None]:
# Step-by-Step Automated Testing Code
# Setup: Load your dataset and the spaCy language model.
# Validation Function: Create a function to compare the lemmatized tokens in the dataset against the lemmas generated by spaCy from the original text.
# Applying the Validation: Apply this function across your DataFrame to check each row.
# Report Results: Summarize and report any discrepancies.

In [2]:
import pandas as pd
import spacy

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

# Load your lemmatized dataset
data = pd.read_csv('../data/lemmatized_tokenized_data.csv')

def validate_lemmatization(original_text, lemmatized_tokens):
    """
    Validate lemmatization by comparing the dataset's lemmatized tokens against spaCy's output.
    
    Args:
    original_text (str): The original text before tokenization and lemmatization.
    lemmatized_tokens (list of str): Lemmatized tokens from the dataset.

    Returns:
    bool: True if the validation passes, False otherwise.
    """
    # Process the original text with spaCy
    doc = nlp(original_text)
    # Generate expected lemmas, filtering out punctuation and spaces
    expected_lemmas = [token.lemma_ for token in doc if not token.is_punct and not token.is_space]

    # Compare the expected lemmas with the lemmatized tokens from the dataset
    return expected_lemmas == lemmatized_tokens

# Assuming your DataFrame has a column 'Original Text' and 'Lemmatized Tokens' that needs to be validated
# Convert lemmatized tokens stored as string back to list
data['Lemmatized Tokens'] = data['Lemmatized Tokens'].apply(eval)

# Apply the validation function
data['Validation Result'] = data.apply(lambda row: validate_lemmatization(row['Original Text'], row['Lemmatized Tokens']), axis=1)

# Check for rows where validation failed
invalid_rows = data[data['Validation Result'] == False]
print(f"Number of rows with invalid lemmatization: {len(invalid_rows)}")

if len(invalid_rows) > 0:
    print(invalid_rows[['Original Text', 'Lemmatized Tokens']])


KeyError: 'Lemmatized Tokens'

In [None]:
# Python Code to Apply Case Normalization and Stop Word Removal


In [5]:
import pandas as pd
import spacy
import ast

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

def parse_tokens_from_string(list_string):
    """
    Parse and safely evaluate strings that look like list of tokens.
    Args:
    list_string (str): String representation of a list of tokens.
    
    Returns:
    list: Evaluated list of tokens if valid, otherwise an empty list.
    """
    try:
        # Safely evaluate string that represents a list
        tokens = ast.literal_eval(list_string)
        if isinstance(tokens, list):
            return tokens
        else:
            return []
    except:
        # In case of any error, return an empty list
        return []

def normalize_and_remove_stopwords(tokens):
    """
    Normalize case and remove stop words from a list of tokens.
    Args:
    tokens (list of str): The tokens to process.
    
    Returns:
    list of str: Tokens after converting to lowercase and removing stop words.
    """
    # Convert list of tokens back to text for processing
    text = ' '.join(tokens)
    
    # Process the text with spaCy
    doc = nlp(text)
    
    # Apply case normalization and filter out stop words and punctuation
    filtered_tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
    
    return filtered_tokens

def apply_text_processing(data, token_columns):
    """
    Apply text processing including normalization and stop word removal to specified token columns of the dataframe.
    Args:
    data (DataFrame): The DataFrame containing the tokenized data.
    token_columns (list): A list of column names containing tokenized data.
    
    Returns:
    DataFrame: The DataFrame with text processed columns.
    """
    for column in token_columns:
        if column in data.columns:
            # Parse tokens if they are stored as string representations of lists
            data[column] = data[column].apply(parse_tokens_from_string)
            data[column + '_clean'] = data[column].apply(normalize_and_remove_stopwords)
    return data

# Load your lemmatized data
df = pd.read_csv('../data/lemmatized_tokenized_data.csv')

# Identify lemmatized columns (assuming these are named with a '_lemmatized' suffix)
lemmatized_columns = [col for col in df.columns if '_lemmatized' in col]

# Apply text processing
df = apply_text_processing(df, lemmatized_columns)

# Save the processed data
#df.to_csv('../data/cleaned_data.csv', index=False)

# Preview the cleaned data
print("Columns processed and cleaned:", lemmatized_columns)
print(df.head())


Columns processed and cleaned: ['Area_tokens_lemmatized', 'Popular Facilities_tokens_lemmatized', 'Description_tokens_lemmatized', 'Facilities_tokens_lemmatized', 'Surroundings_tokens_lemmatized', 'Bathroom Features_tokens_lemmatized', 'Bedroom Features_tokens_lemmatized', 'Outdoors_tokens_lemmatized', 'Room Amenities_tokens_lemmatized', 'Activities_tokens_lemmatized', 'Living Area_tokens_lemmatized', 'Media & Technology_tokens_lemmatized', 'Food & Drink_tokens_lemmatized', 'Parking_tokens_lemmatized', 'Reception services_tokens_lemmatized', 'Entertainment and family services_tokens_lemmatized', 'Cleaning services_tokens_lemmatized', 'Safety & security_tokens_lemmatized', 'General_tokens_lemmatized', 'Accessibility_tokens_lemmatized', 'Wellness_tokens_lemmatized', 'Languages spoken_tokens_lemmatized', 'Restaurants & cafes_tokens_lemmatized', 'Top attractions_tokens_lemmatized', 'Natural beauty_tokens_lemmatized', 'Beaches in the neighbourhood_tokens_lemmatized', 'Public transport_token

In [None]:

# To update the code so that you can save two different datasets—one with all the data and another with only the cleaned data columns—follow the steps 
# below. This approach will let you maintain a complete set with the original and processed data and a separate file that contains only the cleaned 
# columns.

In [6]:
import pandas as pd
import spacy
import ast

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

def parse_tokens_from_string(list_string):
    """
    Parse and safely evaluate strings that look like list of tokens.
    Args:
    list_string (str): String representation of a list of tokens.
    
    Returns:
    list: Evaluated list of tokens if valid, otherwise an empty list.
    """
    try:
        # Safely evaluate string that represents a list
        tokens = ast.literal_eval(list_string)
        if isinstance(tokens, list):
            return tokens
        else:
            return []
    except:
        # In case of any error, return an empty list
        return []

def normalize_and_remove_stopwords(tokens):
    """
    Normalize case and remove stop words from a list of tokens.
    Args:
    tokens (list of str): The tokens to process.
    
    Returns:
    list of str: Tokens after converting to lowercase and removing stop words.
    """
    # Convert list of tokens back to text for processing
    text = ' '.join(tokens)
    
    # Process the text with spaCy
    doc = nlp(text)
    
    # Apply case normalization and filter out stop words and punctuation
    filtered_tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space]
    
    return filtered_tokens

def apply_text_processing(data, token_columns):
    """
    Apply text processing including normalization and stop word removal to specified token columns of the dataframe.
    Args:
    data (DataFrame): The DataFrame containing the tokenized data.
    token_columns (list): A list of column names containing tokenized data.
    
    Returns:
    DataFrame: The DataFrame with text processed columns.
    """
    for column in token_columns:
        if column in data.columns:
            # Parse tokens if they are stored as string representations of lists
            data[column] = data[column].apply(parse_tokens_from_string)
            data[column + '_clean'] = data[column].apply(normalize_and_remove_stopwords)
    return data

# Load your lemmatized data
df = pd.read_csv('../data/lemmatized_tokenized_data.csv')

# Identify lemmatized columns (assuming these are named with a '_lemmatized' suffix)
lemmatized_columns = [col for col in df.columns if '_lemmatized' in col]

# Apply text processing
df = apply_text_processing(df, lemmatized_columns)

# Save the complete dataset with both original and cleaned data
df.to_csv('../data/complete_processed_stopword_data.csv', index=False)

# Create and save a dataset with only the cleaned columns
cleaned_columns = [col for col in df.columns if '_clean' in col]
df_cleaned = df[cleaned_columns]
df_cleaned.to_csv('../data/only_cleaned_data.csv', index=False)

# Preview the cleaned data
print("Columns processed and cleaned:", lemmatized_columns)
print(df[cleaned_columns].head())


Columns processed and cleaned: ['Area_tokens_lemmatized', 'Popular Facilities_tokens_lemmatized', 'Description_tokens_lemmatized', 'Facilities_tokens_lemmatized', 'Surroundings_tokens_lemmatized', 'Bathroom Features_tokens_lemmatized', 'Bedroom Features_tokens_lemmatized', 'Outdoors_tokens_lemmatized', 'Room Amenities_tokens_lemmatized', 'Activities_tokens_lemmatized', 'Living Area_tokens_lemmatized', 'Media & Technology_tokens_lemmatized', 'Food & Drink_tokens_lemmatized', 'Parking_tokens_lemmatized', 'Reception services_tokens_lemmatized', 'Entertainment and family services_tokens_lemmatized', 'Cleaning services_tokens_lemmatized', 'Safety & security_tokens_lemmatized', 'General_tokens_lemmatized', 'Accessibility_tokens_lemmatized', 'Wellness_tokens_lemmatized', 'Languages spoken_tokens_lemmatized', 'Restaurants & cafes_tokens_lemmatized', 'Top attractions_tokens_lemmatized', 'Natural beauty_tokens_lemmatized', 'Beaches in the neighbourhood_tokens_lemmatized', 'Public transport_token