### Keyword_Analysis_Housing_Association_Reviews_

"""

This script analyzes reviews from Housing Associations, specifically those with a 'RatingValue' of 1, 
to identify relevant keywords associated with different classes of social housing issues. 

It follows a multi-step process:
1. Loads and filters review data from a CSV file, retaining only reviews with a 'RatingValue' of 1.
2. Loads theme data from an Excel file, mapping each theme to a list of associated keywords.
3. Uses TF-IDF (Term Frequency-Inverse Document Frequency) to vectorize both the reviews and the keywords.
4. Calculates cosine similarity between the reviews and keywords, identifying relevant matches above a set threshold.
5. Counts the occurrences of each keyword class per review and generates a distribution.
6. Visualizes the keyword distribution across Housing Associations using a heatmap.

The final output includes a keyword distribution table and a visual representation of the keyword occurrences.

To run the script, update the file paths (`reviews_file_path` and `themes_file_path`) to point to your local data files, then execute the `main()` function.

Dependencies:
- pandas
- scikit-learn
- matplotlib
- seaborn

"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_filter_reviews(file_path):
    """
    Loads Housing Association reviews and keeps only the 1-star reviews.

    Args:
        file_path: Path to a CSV file containing at least the columns
            'HousingAssociation', 'ReviewBody' and 'RatingValue'.

    Returns:
        A DataFrame with the columns 'HousingAssociation' and 'ReviewBody',
        restricted to rows whose 'RatingValue' equals 1 and whose
        'ReviewBody' is present, re-indexed from 0.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(file_path)
    # Keep only reviews with RatingValue == 1
    one_star = df.loc[df['RatingValue'] == 1, ['HousingAssociation', 'ReviewBody']]
    # A review without text is read as NaN, which is not a string and cannot be
    # vectorized as text later on, so such rows are dropped here.
    one_star = one_star.dropna(subset=['ReviewBody'])
    # Renumber rows 0..n-1 so downstream code can address reviews by position.
    return one_star.reset_index(drop=True)

def load_and_process_themes(file_path):
    """
    Reads the themes Excel file and keeps only the class and keyword columns.

    Args:
        file_path: Path to the Excel file holding the themes.

    Returns:
        A DataFrame containing only the 'Main_Class' and 'Key_words' columns.
    """
    wanted_columns = ['Main_Class', 'Key_words']
    themes = pd.read_excel(file_path)
    return themes[wanted_columns]

def create_keywords_dict(df):
    """
    Builds a mapping from each 'Main_Class' to its list of keywords.

    The 'Key_words' cell of every row is split on commas; each keyword is
    stripped of surrounding whitespace and lowercased. Empty fragments (for
    example from a trailing comma) are discarded.

    Args:
        df: Themes DataFrame with the columns 'Main_Class' and 'Key_words'.

    Returns:
        A dict whose keys are the 'Main_Class' values, in order of first
        appearance, and whose values are lists of cleaned keywords. Rows
        sharing the same 'Main_Class' contribute to one combined list, and a
        class whose keyword cell is empty maps to an empty list.
    """
    keywords_dict = {}
    for class_name, raw_keywords in zip(df['Main_Class'], df['Key_words']):
        # setdefault lets repeated classes accumulate their keywords instead
        # of the last row overwriting the earlier ones.
        class_keywords = keywords_dict.setdefault(class_name, [])
        if pd.isna(raw_keywords):
            # An empty cell has no keywords to split.
            continue
        cleaned = (part.strip().lower() for part in str(raw_keywords).split(','))
        class_keywords.extend(keyword for keyword in cleaned if keyword)
    return keywords_dict

def prepare_keywords_and_classes(keywords_dict):
    """
    Flattens the class -> keywords mapping into two parallel lists.

    Returns a tuple (keywords_list, keyword_classes) where keyword_classes[i]
    is the class that keywords_list[i] belongs to. Order follows the dict's
    iteration order, then the order of keywords within each class.
    """
    # One (keyword, class) pair per keyword, then unzip into the two lists.
    pairs = [
        (keyword, class_name)
        for class_name, keywords in keywords_dict.items()
        for keyword in keywords
    ]
    keywords_list = [keyword for keyword, _ in pairs]
    keyword_classes = [class_name for _, class_name in pairs]
    return keywords_list, keyword_classes

def vectorize_texts(keywords_list, reviews_list):
    """
    Fits a single TF-IDF model on the keywords and reviews together.

    Both inputs are lists of strings. They are concatenated (keywords first)
    and vectorized in one pass so that keywords and reviews share one
    vocabulary; English stop words are removed.

    Returns a tuple (reviews_tfidf, keywords_tfidf, vectorizer): the TF-IDF
    rows of the reviews, the TF-IDF rows of the keywords, and the fitted
    TfidfVectorizer.
    """
    n_keywords = len(keywords_list)
    vectorizer = TfidfVectorizer(stop_words='english')
    combined_matrix = vectorizer.fit_transform(keywords_list + reviews_list)

    # The first n_keywords rows are the keywords; everything after is reviews.
    keywords_tfidf = combined_matrix[:n_keywords]
    reviews_tfidf = combined_matrix[n_keywords:]
    return reviews_tfidf, keywords_tfidf, vectorizer

def calculate_cosine_similarity(reviews_tfidf, keywords_tfidf):
    """
    Computes the cosine similarity of every review against every keyword.

    Returns the score matrix produced by sklearn's cosine_similarity, with
    one row per review and one column per keyword.
    """
    similarity_scores = cosine_similarity(reviews_tfidf, keywords_tfidf)
    return similarity_scores

def match_keywords_to_reviews(similarity_scores, keyword_classes, df_reviews, threshold=0.2):
    """
    Counts, for every review, how many keywords of each class exceed the threshold.

    Args:
        similarity_scores: 2-D array-like with one row per review and one
            column per keyword.
        keyword_classes: Class name of each keyword, in the same order as the
            columns of ``similarity_scores``.
        df_reviews: DataFrame with a 'HousingAssociation' column, one row per
            review, in the same order as the rows of ``similarity_scores``.
            Its index labels are irrelevant.
        threshold: A keyword counts as a match only if its score is strictly
            greater than this value.

    Returns:
        A list of dicts with the keys 'HousingAssociation', 'Class' and
        'Count'; one dict per (review, class) pair with at least one match.

    Raises:
        ValueError: If the number of score rows differs from the number of reviews.
    """
    if len(similarity_scores) != len(df_reviews):
        raise ValueError(
            "similarity_scores has {} rows but df_reviews has {} reviews".format(
                len(similarity_scores), len(df_reviews)
            )
        )

    keyword_counts = []
    # Pair reviews with score rows by position rather than by index label, so
    # a DataFrame with a non-default index still lines up with the matrix.
    for ha_name, review_scores in zip(df_reviews['HousingAssociation'], similarity_scores):
        matches = {}

        # Tally the class of every keyword whose score clears the threshold
        for keyword_idx, score in enumerate(review_scores):
            if score > threshold:
                class_name = keyword_classes[keyword_idx]
                matches[class_name] = matches.get(class_name, 0) + 1

        # One output record per class matched in this review
        keyword_counts.extend(
            {'HousingAssociation': ha_name, 'Class': class_name, 'Count': count}
            for class_name, count in matches.items()
        )

    return keyword_counts

def visualize_keyword_distribution(keyword_df):
    """
    Visualizes the keyword distribution using a heatmap.

    Args:
        keyword_df: DataFrame with the columns 'HousingAssociation', 'Class'
            and 'Count'. Counts are summed per (HousingAssociation, Class)
            pair; combinations that never occur are shown as 0.

    Returns:
        None. The heatmap is displayed with ``plt.show()``.
    """
    # Create a pivot table for the heatmap
    distribution = keyword_df.pivot_table(index='HousingAssociation', columns='Class', values='Count', aggfunc='sum', fill_value=0)

    # Grow the figure with the number of housing associations so rows stay
    # legible; a single row keeps the original 1-inch height.
    fig_height = max(1.0, 0.5 * len(distribution))
    plt.figure(figsize=(15, fig_height))
    # fmt='g' prints counts in full; seaborn's default '.2g' would render
    # values of 100 or more in scientific notation.
    sns.heatmap(distribution, annot=True, fmt='g', cmap='YlGnBu', cbar=True)
    plt.title("Keyword Distribution for Housing Association Reviews")
    plt.xlabel("Social Housing Issues (Classes)")
    plt.ylabel("Housing Association")
    plt.show()

def main(reviews_file_path="YOURpath/HousingAssociationNAME.csv",
         themes_file_path="YOURpath/Themes2D.xlsx"):
    """
    Runs the full keyword analysis: load data, vectorize, match, report, plot.

    Args:
        reviews_file_path: Path to the reviews CSV file. The default is a
            placeholder; replace 'YOURpath' with an actual path.
        themes_file_path: Path to the themes Excel file. The default is a
            placeholder; replace 'YOURpath' with an actual path.

    Returns:
        None. The keyword distribution table is printed and the heatmap shown.
    """
    # Step 1: Load and filter reviews data
    df_reviews = load_and_filter_reviews(reviews_file_path)
    if df_reviews.empty:
        # Without any review there is nothing to compare against the keywords,
        # and the similarity step cannot run on an empty matrix.
        print("No reviews with RatingValue == 1 found.")
        return

    # Step 2: Load and process themes data
    df_themes = load_and_process_themes(themes_file_path)

    # Step 3: Create a dictionary of keywords grouped by class
    keywords_dict = create_keywords_dict(df_themes)

    # Step 4: Prepare lists of keywords and their corresponding classes
    keywords_list, keyword_classes = prepare_keywords_and_classes(keywords_dict)
    if not keywords_list:
        print("No keywords found in the themes file.")
        return

    # Step 5: Vectorize the reviews and keywords using TF-IDF
    # (the fitted vectorizer is returned as well but is not needed here)
    reviews_tfidf, keywords_tfidf, _ = vectorize_texts(keywords_list, df_reviews['ReviewBody'].tolist())

    # Step 6: Calculate cosine similarity between reviews and keywords
    similarity_scores = calculate_cosine_similarity(reviews_tfidf, keywords_tfidf)

    # Step 7: Match keywords to reviews and count occurrences by class
    keyword_counts = match_keywords_to_reviews(similarity_scores, keyword_classes, df_reviews)

    # Step 8: Convert the counts into a DataFrame
    keyword_df = pd.DataFrame(keyword_counts)

    # Step 9: Check if the DataFrame is empty, else proceed with visualization
    if keyword_df.empty:
        print("No keywords found in any review.")
        return

    print("Keyword Distribution for Housing Association Reviews:")
    distribution = keyword_df.pivot_table(index='HousingAssociation', columns='Class', values='Count', aggfunc='sum', fill_value=0)
    print(distribution)
    visualize_keyword_distribution(keyword_df)

# Run the analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
