In [7]:
import pandas as pd
import spacy

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

def pos_tagging(text):
    """
    Apply POS tagging to a given text using spaCy and return the tags.
    Args:
    text (str): A string of text that has been cleaned and normalized.

    Returns:
    list: A list of tuples containing the word and its corresponding POS tag.
    """
    # Process the text through spaCy NLP pipeline
    doc = nlp(text)
    # Create a list of tuples for each token and its POS tag
    return [(token.text, token.pos_) for token in doc]

# Load your dataset
df = pd.read_csv('../data/only_cleaned_data.csv')

# Identify columns to apply POS tagging based on a suffix "_clean"
columns_to_tag = [col for col in df.columns if col.endswith('_clean')]

# Apply POS tagging to each relevant column
for column in columns_to_tag:
    df[column + '_pos'] = df[column].apply(lambda x: pos_tagging(str(x)) if pd.notnull(x) else [])

# Save the POS tagged data to a new CSV file
#df.to_csv('../data/pos_tagged_data.csv', index=False)

# Print some examples to verify
print(df[[column + '_pos' for column in columns_to_tag]].head())


                    Area_tokens_lemmatized_clean_pos  \
0  [([, X), (', PUNCT), (unknown, ADJ), (', PUNCT...   
1  [([, X), (', X), (ella, X), (', PUNCT), (], PU...   
2  [([, X), (', NOUN), (hikkaduwa, ADJ), (', PUNC...   
3  [([, X), (', PUNCT), (kandy, ADJ), (', PUNCT),...   
4  [([, X), (', PUNCT), (unknown, ADJ), (', PUNCT...   

      Popular Facilities_tokens_lemmatized_clean_pos  \
0  [([, X), (', PUNCT), (private, ADJ), (', PUNCT...   
1  [([, X), (', PUNCT), (airport, NOUN), (', PUNC...   
2  [([, X), (', PUNCT), (outdoor, ADJ), (', PUNCT...   
3  [([, X), (', PUNCT), (free, ADJ), (', PUNCT), ...   
4  [([, X), (', PUNCT), (outdoor, ADJ), (', PUNCT...   

             Description_tokens_lemmatized_clean_pos  \
0  [([, X), (', NUM), (108, NUM), (', NUM), (,, P...   
1  [([, X), (', NOUN), (3, NUM), (', NUM), (,, PU...   
2  [([, X), (', NUM), (33, NUM), (', NUM), (,, PU...   
3  [([, X), (', NUM), (360, NUM), (', NUM), (,, P...   
4  [([, X), (', PUNCT), (3r, NUM), (', PUNCT),

In [8]:
import pandas as pd
import spacy

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

def pos_tagging(text):
    """
    Apply POS tagging to a given text using spaCy and return the tags.
    This function filters out punctuation, placeholder tokens like 'unknown', and other non-informative tags.
    Args:
    text (str): A string of text that has been cleaned and normalized.

    Returns:
    list: A list of tuples containing the word and its corresponding POS tag, excluding unwanted tokens.
    """
    # Process the text through spaCy NLP pipeline
    doc = nlp(text)
    # Create a list of tuples for each token and its POS tag, applying filters
    return [(token.text, token.pos_) for token in doc if token.pos_ not in ['PUNCT', 'X'] and not token.text in ['unknown', '[', ']']]

# Load your dataset
df = pd.read_csv('../data/only_cleaned_data.csv')

# Identify columns to apply POS tagging based on a suffix "_clean"
columns_to_tag = [col for col in df.columns if col.endswith('_clean')]

# Apply POS tagging to each relevant column
for column in columns_to_tag:
    df[column + '_pos'] = df[column].apply(lambda x: pos_tagging(str(x)) if pd.notnull(x) else [])

# Save the POS tagged data to a new CSV file
df.to_csv('../data/pos_tagged_data.csv', index=False)

# Print some examples to verify
print(df[[column + '_pos' for column in columns_to_tag]].head())


  Area_tokens_lemmatized_clean_pos  \
0                               []   
1                               []   
2    [(', NOUN), (hikkaduwa, ADJ)]   
3                   [(kandy, ADJ)]   
4                               []   

      Popular Facilities_tokens_lemmatized_clean_pos  \
0  [(private, ADJ), (beach, NOUN), (outdoor, ADJ)...   
1  [(airport, NOUN), (shuttle, NOUN), (', NOUN), ...   
2  [(outdoor, ADJ), (swimming, VERB), (pool, NOUN...   
3  [(free, ADJ), (wifi, NOUN), (family, NOUN), (r...   
4  [(outdoor, ADJ), (swimming, VERB), (pool, NOUN...   

             Description_tokens_lemmatized_clean_pos  \
0  [(', NUM), (108, NUM), (', NUM), (108, NUM), (...   
1  [(', NOUN), (3, NUM), (', NUM), (2.6, NUM), ('...   
2  [(', NUM), (33, NUM), (', NUM), (1.8, NUM), (k...   
3  [(', NUM), (360, NUM), (', NUM), (3, NUM), (',...   
4  [(3r, NUM), (1, NUM), (', NUM), (km, VERB), ('...   

              Facilities_tokens_lemmatized_clean_pos  \
0  [(', NOUN), (bathroom, NOUN), (', PROP

In [9]:
import pandas as pd
import spacy

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")

def pos_tagging(text):
    """
    Apply POS tagging to a given text using spaCy and return the tags.
    This function filters out punctuation, placeholder tokens like 'unknown', and other non-informative tags.
    Args:
    text (str): A string of text that has been cleaned and normalized.

    Returns:
    list: A list of tuples containing the word and its corresponding POS tag, excluding unwanted tokens.
    """
    # Process the text through spaCy NLP pipeline
    doc = nlp(text)
    # Create a list of tuples for each token and its POS tag, applying filters
    return [(token.text, token.pos_) for token in doc if token.pos_ not in ['PUNCT', 'X'] and not token.text in ['unknown', '[', ']']]

# Load your dataset
df = pd.read_csv('../data/only_cleaned_data.csv')

# Identify columns to apply POS tagging based on a suffix "_clean"
columns_to_tag = [col for col in df.columns if col.endswith('_clean')]

# Initialize a new DataFrame to store only POS tagged columns
pos_tagged_df = pd.DataFrame()

# Apply POS tagging to each relevant column and store it in the new DataFrame
for column in columns_to_tag:
    # Apply POS tagging
    df[column + '_pos'] = df[column].apply(lambda x: pos_tagging(str(x)) if pd.notnull(x) else [])
    # Copy the POS tagged data to the new DataFrame
    pos_tagged_df[column + '_pos'] = df[column + '_pos']

# Optionally, include any identifiers or relevant metadata from the original data
if 'id' in df.columns:
    pos_tagged_df['id'] = df['id']  # Assuming 'id' is an identifier column in the original data

# Save the original POS tagged data to a new CSV file
df.to_csv('../data/pos_tagged_data.csv', index=False)

# Save the POS-only tagged data to a separate CSV file
pos_tagged_df.to_csv('../data/pos_only_tagged_data.csv', index=False)

# Print some examples to verify
print(pos_tagged_df.head())


  Area_tokens_lemmatized_clean_pos  \
0                               []   
1                               []   
2    [(', NOUN), (hikkaduwa, ADJ)]   
3                   [(kandy, ADJ)]   
4                               []   

      Popular Facilities_tokens_lemmatized_clean_pos  \
0  [(private, ADJ), (beach, NOUN), (outdoor, ADJ)...   
1  [(airport, NOUN), (shuttle, NOUN), (', NOUN), ...   
2  [(outdoor, ADJ), (swimming, VERB), (pool, NOUN...   
3  [(free, ADJ), (wifi, NOUN), (family, NOUN), (r...   
4  [(outdoor, ADJ), (swimming, VERB), (pool, NOUN...   

             Description_tokens_lemmatized_clean_pos  \
0  [(', NUM), (108, NUM), (', NUM), (108, NUM), (...   
1  [(', NOUN), (3, NUM), (', NUM), (2.6, NUM), ('...   
2  [(', NUM), (33, NUM), (', NUM), (1.8, NUM), (k...   
3  [(', NUM), (360, NUM), (', NUM), (3, NUM), (',...   
4  [(3r, NUM), (1, NUM), (', NUM), (km, VERB), ('...   

              Facilities_tokens_lemmatized_clean_pos  \
0  [(', NOUN), (bathroom, NOUN), (', PROP

In [12]:
import pandas as pd

# Load your dataset
df = pd.read_csv('../data/pos_only_tagged_data.csv')  # Adjust the file path to your dataset

# Specify the column name you want to display
column_name = 'Popular Facilities_tokens_lemmatized_clean_pos'  # Change this to the actual column name

# Check if the column exists in the DataFrame
if column_name in df.columns:
    # Print all data in the specified column
   # Print the count of non-null rows in the specified column
    print(f"Count of non-null rows in '{column_name}': {df[column_name].sum()}")
else:
    print(f"Column '{column_name}' not found in the dataset.")


Count of non-null rows in 'Popular Facilities_tokens_lemmatized_clean_pos': [('private', 'ADJ'), ('beach', 'NOUN'), ('outdoor', 'ADJ'), ('swimming', 'VERB'), ('pool', 'NOUN'), ('airport', 'NOUN'), ('shuttle', 'NOUN'), ("'", 'NOUN'), ('non', 'NOUN'), ("'", 'VERB'), ('smoking', 'VERB'), ('room', 'NOUN'), ('room', 'NOUN'), ('service', 'NOUN'), ('free', 'ADJ'), ('parking', 'NOUN'), ('free', 'ADJ'), ('wifi', 'NOUN'), ('facility', 'NOUN'), ('disabled', 'ADJ'), ('guest', 'NOUN'), ('beachfront', 'VERB'), ("'", 'PROPN'), ('area', 'NOUN'), ('breakfast', 'NOUN')][('airport', 'NOUN'), ('shuttle', 'NOUN'), ("'", 'NOUN'), ('non', 'NOUN'), ("'", 'VERB'), ('smoking', 'VERB'), ('room', 'NOUN'), ('room', 'NOUN'), ('service', 'NOUN'), ('free', 'ADJ'), ('parking', 'NOUN'), ('free', 'ADJ'), ('wifi', 'NOUN'), ('restaurant', 'NOUN'), ('family', 'NOUN'), ('room', 'NOUN'), ('teacoffee', 'NOUN'), ('maker', 'NOUN'), ('room', 'NOUN'), ('bar', 'NOUN'), ('breakfast', 'NOUN')][('outdoor', 'ADJ'), ('swimming', 'VERB'