# Load the dictionary

In [None]:
import json

# Load the category -> keyword-list dictionary from a JSON file.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open('/content/drive/MyDrive/WM_Project/sustainability_dict.json', 'r', encoding='utf-8') as json_file:
    sustainability_dict = json.load(json_file)

print(sustainability_dict)


{'Nature & Environment': ['nature', 'biodiversity', 'conservation', 'wildlife', 'eco-lodge', 'sustainable', 'green', 'forest', 'national park', 'wildlife reserve', 'eco-friendly', 'recycling', 'renewable energy', 'carbon-neutral', 'flora', 'fauna', 'climate action', 'marine reserve', 'environmental protection', 'ecotourism', 'natures', 'Purists_insist', 'inherently', 'inter_relatedness', 'Kaminak_disclosure', 'intangibility', 'interiority', 'impermanent', 'CN_cautions', 'anthropocentrism'], 'Culture & Heritage': ['culture', 'heritage', 'festival', 'folk dance', 'traditional', 'art', 'museum', 'historic site', 'indigenous', 'UNESCO', 'architecture', 'craftsmanship', 'monument', 'ritual', 'archaeological site', 'custom', 'oral tradition', 'ancestral', 'ethnic', 'cultural preservation', 'cultures', 'cultural', 'Alan_Hevesi_presided', 'traditions', 'cultural_milieu', 'indigenous_cultures', 'ethos', 'HOMESTEAD_MUSEUM_Southern', 'cultural_diversity', 'hyphenated_identities'], 'Adventure & Ac

# Load the corpora

In [None]:
import os
import pickle

# Folder that holds the .pkl corpus files
folder_path = "/content/drive/MyDrive/WM_Project/Corpus_states"


def load_pkl_file(file_path):
    """Open ``file_path`` in binary mode and return the object unpickled from it."""
    with open(file_path, 'rb') as handle:
        payload = pickle.load(handle)
    return payload

# Names of every entry in the folder that ends with ".pkl"
pkl_files = [entry for entry in os.listdir(folder_path) if entry.endswith('.pkl')]

# Map each file name (used as the state key) to the object unpickled from that file
state_texts = {
    pkl_name: load_pkl_file(os.path.join(folder_path, pkl_name))
    for pkl_name in pkl_files
}


# Counting occurrences of dictionary keywords in corpus

Loading files

In [None]:
import pandas as pd

# Function to count occurrences of keywords in text
def count_keywords_in_text(text, keywords, whole_words=False):
    """
    Count how often each keyword occurs in a text, ignoring case.

    :param text: A string, or a list of strings that is joined with single spaces first
    :param keywords: Iterable of keyword strings to look for
    :param whole_words: If False (default), count non-overlapping substring matches,
        so 'art' is also counted inside 'party'. If True, a match only counts when it
        is not directly preceded or followed by a word character.
    :return: Dictionary mapping each keyword (as given) to its number of occurrences
    """
    import re  # local import keeps this cell self-contained

    # Ensure text is a single string
    if isinstance(text, list):
        text = " ".join(text)
    text = text.lower()  # Case-insensitive matching

    word_count = {}
    for keyword in keywords:
        # The text was lowercased above, so the keyword has to be lowercased too;
        # otherwise entries such as 'UNESCO' could never match.
        needle = keyword.lower()
        if whole_words:
            # Lookarounds rather than \b so keywords that start or end with a
            # non-word character are still handled.
            pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
            word_count[keyword] = len(re.findall(pattern, text))
        else:
            word_count[keyword] = text.count(needle)
    return word_count

# Per-state keyword totals: {state key: {category: count, ..., 'Total': count}}
results = {}

for state, text in state_texts.items():
    # Sum the per-keyword counts of every category for this state's text
    per_category = {
        category: sum(count_keywords_in_text(text, keywords).values())
        for category, keywords in sustainability_dict.items()
    }

    # Grand total across all categories for the state
    per_category['Total'] = sum(per_category.values())

    results[state] = per_category

# One row per state, one column per category (plus 'Total'); missing cells become 0
df = pd.DataFrame(results).T.fillna(0)

# Display the resulting DataFrame
print(df)


                                            Nature & Environment  \
Corpus_states_Arunachal_Pradesh_corpus.pkl                  2792   
Corpus_states_Assam_corpus.pkl                                29   
Corpus_states_Goa_corpus.pkl                                 966   
Corpus_states_Jammu_and_Kashmir_corpus.pkl                   357   
Corpus_states_Karnataka_corpus.pkl                          1558   
Corpus_states_Kerala_corpus.pkl                               72   
Corpus_states_Maharashtra_corpus.pkl                        4125   
merged_corpus.pkl                                           9899   

                                            Culture & Heritage  \
Corpus_states_Arunachal_Pradesh_corpus.pkl                8304   
Corpus_states_Assam_corpus.pkl                             765   
Corpus_states_Goa_corpus.pkl                              4258   
Corpus_states_Jammu_and_Kashmir_corpus.pkl                2485   
Corpus_states_Karnataka_corpus.pkl                       

# Saving the dataframe

In [None]:
# Destination files for the keyword-count table
counts_csv_path = '/content/drive/MyDrive/WM_Project/state_keyword_counts.csv'
counts_xlsx_path = '/content/drive/MyDrive/WM_Project/state_keyword_counts.xlsx'

# Write the same DataFrame in both formats
df.to_csv(counts_csv_path)
df.to_excel(counts_xlsx_path)

print("Results saved to CSV and Excel.")


Results saved to CSV and Excel.


# Calculating Scores

In [None]:
import os
import pickle

# Path to the folder containing the .pkl files
folder_path = "/content/drive/MyDrive/WM_Project/Corpus_states"

# File name of the merged corpus inside that folder
merged_corpus_filename = "merged_corpus.pkl"

# List of all .pkl files in the folder, sorted so the processing order is reproducible
pkl_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pkl'))

# Select the merged corpus by name: os.listdir() returns entries in arbitrary
# order, so "the last file" is not guaranteed to be the merged corpus.
if merged_corpus_filename not in pkl_files:
    raise FileNotFoundError(f"{merged_corpus_filename} not found in {folder_path}")
merged_corpus_path = os.path.join(folder_path, merged_corpus_filename)

# Function to load a pickle file
def load_pkl_file(file_path):
    """Return the object unpickled from ``file_path``.

    NOTE(review): pickle.load can execute arbitrary code; only use on trusted files.
    """
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Load the merged corpus
merged_corpus = load_pkl_file(merged_corpus_path)

# Load the per-state corpora, skipping the merged corpus by name
state_texts = {}
for file_name in pkl_files:
    if file_name == merged_corpus_filename:
        continue
    file_path = os.path.join(folder_path, file_name)
    state_texts[file_name] = load_pkl_file(file_path)

# Print the name of the merged corpus
print(f"Merged corpus loaded from: {merged_corpus_filename}")


Merged corpus loaded from: merged_corpus.pkl


In [None]:
# Total keyword hits per category in the merged corpus:
# for each category, add up the per-keyword counts returned by count_keywords_in_text
merged_counts = {
    category: sum(count_keywords_in_text(merged_corpus, keywords).values())
    for category, keywords in sustainability_dict.items()
}

# Show the per-category totals for the merged corpus
print(merged_counts)


{'Nature & Environment': 9899, 'Culture & Heritage': 54345, 'Adventure & Activities': 13347, 'Local Cuisine & Dining': 21127, 'Affordability & Value': 6592}


In [None]:
def calculate_scores(state, category, category_total, total_words_in_state, merged_counts):
    """
    Calculate two scores for a given state and category.

    Both scores are plain ratios (they are not multiplied by 100).

    :param state: The name of the state being evaluated (not used in the calculation)
    :param category: The current category (key from the sustainability dictionary)
    :param category_total: Total number of related words in the state corpus for this category
    :param total_words_in_state: Total number of words in the entire state corpus
    :param merged_counts: Dictionary with the total count of related words in the merged corpus for each category
    :return: Tuple ``(score_1, score_2)``; a score is 0 when its denominator is not positive
    """
    # Score 1: share of category-related words among all words of the state corpus
    score_1 = (category_total / total_words_in_state) if total_words_in_state > 0 else 0

    # Score 2: category-related words in the state corpus relative to the merged corpus.
    # A category missing from merged_counts is treated as count 0 (score 0); a default
    # of 1 would return the raw count as if it were a ratio.
    total_in_merged = merged_counts.get(category, 0)
    score_2 = (category_total / total_in_merged) if total_in_merged > 0 else 0

    return score_1, score_2


In [None]:
# Function to count occurrences of keywords in text (already defined earlier)
def count_keywords_in_text(text, keywords, whole_words=False):
    """
    Count how often each keyword occurs in a text, ignoring case.

    :param text: A string, or a list of strings that is joined with single spaces first
    :param keywords: Iterable of keyword strings to look for
    :param whole_words: If False (default), count non-overlapping substring matches,
        so 'art' is also counted inside 'party'. If True, a match only counts when it
        is not directly preceded or followed by a word character.
    :return: Dictionary mapping each keyword (as given) to its number of occurrences
    """
    import re  # local import keeps this cell self-contained

    # Ensure that the text is a single string
    if isinstance(text, list):
        text = " ".join(text)
    text = text.lower()  # Case-insensitive matching

    word_count = {}
    for keyword in keywords:
        # The text was lowercased above, so the keyword has to be lowercased too;
        # otherwise entries such as 'UNESCO' could never match.
        needle = keyword.lower()
        if whole_words:
            # Lookarounds rather than \b so keywords that start or end with a
            # non-word character are still handled.
            pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
            word_count[keyword] = len(re.findall(pattern, text))
        else:
            word_count[keyword] = text.count(needle)
    return word_count

# Per-state scores: {state key: {column name: value}}
state_scores = {}

for state, text in state_texts.items():
    # Work on a single string so the word count and the keyword counts use the same text
    if isinstance(text, list):
        text = " ".join(text)

    # Whitespace-separated token count, the denominator of Score 1
    total_words_in_state = len(text.split())

    row = {}
    for category, keywords in sustainability_dict.items():
        # Total hits of this category's keywords in the state text
        category_total = sum(count_keywords_in_text(text, keywords).values())

        score_1, score_2 = calculate_scores(state, category, category_total, total_words_in_state, merged_counts)

        # Two score columns plus the raw count, in this column order
        row.update({
            f'{category}_Score_1': score_1,
            f'{category}_Score_2': score_2,
            category: category_total,
        })

    state_scores[state] = row

# States become rows after the transpose
scores_df = pd.DataFrame(state_scores).T

# Display the resulting DataFrame
print(scores_df)


                                            Nature & Environment_Score_1  \
Corpus_states_Arunachal_Pradesh_corpus.pkl                      0.212771   
Corpus_states_Assam_corpus.pkl                                  0.100877   
Corpus_states_Goa_corpus.pkl                                    0.167811   
Corpus_states_Jammu_and_Kashmir_corpus.pkl                      0.011844   
Corpus_states_Karnataka_corpus.pkl                              0.154239   
Corpus_states_Kerala_corpus.pkl                                 0.854701   
Corpus_states_Maharashtra_corpus.pkl                            0.639659   

                                            Nature & Environment_Score_2  \
Corpus_states_Arunachal_Pradesh_corpus.pkl                     28.204869   
Corpus_states_Assam_corpus.pkl                                  0.292959   
Corpus_states_Goa_corpus.pkl                                    9.758561   
Corpus_states_Jammu_and_Kashmir_corpus.pkl                      3.606425   
Corpus_stat

In [None]:
# Destination files for the score table
scores_csv_path = '/content/drive/MyDrive/WM_Project/state_scores.csv'
scores_xlsx_path = '/content/drive/MyDrive/WM_Project/state_scores.xlsx'

# Write the same DataFrame as CSV and as Excel
scores_df.to_csv(scores_csv_path)
scores_df.to_excel(scores_xlsx_path)

print("Scores saved to CSV/Excel.")


Scores saved to CSV/Excel.


# Query Search and Ranking

In [None]:
def query_search(query_key, scores_df, score_name="Score_1"):
    """
    Search for a specific key (query) and return states ranked by the score related to that key.

    :param query_key: The key (category) from the sustainability dictionary (e.g., 'Nature & Environment')
    :param scores_df: The DataFrame containing the calculated scores for each state
    :param score_name: Suffix of the score column to rank by; the column looked up is
        ``f"{query_key}_{score_name}"`` (default 'Score_1', pass 'Score_2' for the other score)
    :return: One-column DataFrame sorted by descending score, or None (after printing an
        error message) when the score column does not exist
    """
    # Check if the score column for this query exists in the DataFrame
    score_column = f"{query_key}_{score_name}"
    if score_column not in scores_df.columns:
        print(f"Error: The query '{query_key}' does not exist in the data.")
        return None

    # Sort the DataFrame by the selected score, highest first
    ranked_states = scores_df[[score_column]].sort_values(by=score_column, ascending=False)

    # Display the ranked states
    print(f"States ranked by {query_key}:")
    print(ranked_states)

    # Hand the ranking back so callers can use it, as the docstring promises
    return ranked_states

# Rank the states for one category; any key of the sustainability dictionary can be used here
query = 'Nature & Environment'
ranked_for_query = query_search(query, scores_df)


States ranked by Nature & Environment:
                                            Nature & Environment_Score_1
Corpus_states_Kerala_corpus.pkl                                 0.854701
Corpus_states_Maharashtra_corpus.pkl                            0.639659
Corpus_states_Arunachal_Pradesh_corpus.pkl                      0.212771
Corpus_states_Goa_corpus.pkl                                    0.167811
Corpus_states_Karnataka_corpus.pkl                              0.154239
Corpus_states_Assam_corpus.pkl                                  0.100877
Corpus_states_Jammu_and_Kashmir_corpus.pkl                      0.011844
