### Housing Association Review Classification and Theme Visualization

#### Introduction

This Python script processes Housing Association reviews and classifies them based on predefined themes and keywords. The process involves:

1. **Loading and Filtering Data:** Only reviews with a rating of 1 (`RatingValue == 1`) are kept from the provided dataset; all other reviews are discarded.
2. **Loading Themes:** The script loads themes and associated keywords, which represent different classes of social housing issues.
3. **Keyword Matching:** Using TF-IDF (Term Frequency-Inverse Document Frequency), the script calculates the cosine similarity between keywords and review content to identify relevant themes in each review.
4. **Visualization:** The results are visualized in the form of a heatmap to show the distribution of identified themes (keyword classes) across Housing Associations.

#### Requirements

- **Pandas**: For data manipulation and analysis. Reading the `.xlsx` themes file additionally requires an Excel engine such as `openpyxl`.
- **Scikit-learn**: For vectorizing text data and calculating cosine similarity.
- **Matplotlib & Seaborn**: For generating the heatmap visualizations.

To run the script, replace the `YOURpath` placeholders in `main()` with the locations of your input data files (`combined_df.csv` for the reviews and `Themes2D.xlsx` for the themes) and execute the code.


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_filter_reviews(file_path):
    """
    Loads Housing Association reviews from a CSV file and keeps only the reviews with RatingValue == 1.

    Rows whose 'ReviewBody' is missing are dropped as well: a missing cell is read as NaN,
    which is not a valid text document for TF-IDF vectorization.

    Parameters:
        file_path (str): The file path to the reviews data CSV. The file must contain the
            columns 'RatingValue', 'HousingAssociation' and 'ReviewBody'.

    Returns:
        pd.DataFrame: Rating-1 reviews with columns 'HousingAssociation' and 'ReviewBody',
            re-indexed from 0.

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
    """
    df = pd.read_csv(file_path)

    # Keep (not remove) the rating-1 rows and only the two columns used later on
    one_star = df.loc[df['RatingValue'] == 1, ['HousingAssociation', 'ReviewBody']]

    # Empty review bodies cannot be vectorized, so discard them before re-indexing
    one_star = one_star.dropna(subset=['ReviewBody'])

    # A fresh 0..n-1 index keeps row positions aligned with matrices built from these reviews
    return one_star.reset_index(drop=True)

def load_and_process_themes(file_path):
    """
    Reads the themes workbook and keeps only the two columns the pipeline uses.

    Parameters:
        file_path (str): The file path to the themes Excel file.

    Returns:
        pd.DataFrame: DataFrame restricted to the 'Main_Class' and 'Key_words' columns.
    """
    wanted_columns = ['Main_Class', 'Key_words']
    themes = pd.read_excel(file_path)
    return themes[wanted_columns]

def create_keywords_dict(df):
    """
    Converts the 'Key_words' from the themes DataFrame into a dictionary with 'Main_Class' as keys
    and lists of keywords as values. Keywords are lowercased and stripped of extra spaces.

    Rows with a missing 'Main_Class' or 'Key_words' cell are skipped. Empty keywords (for
    example from a trailing comma) are discarded. When the same 'Main_Class' appears on
    several rows, its keywords are merged instead of the last row replacing the earlier
    ones, and a keyword is stored only once per class.

    Parameters:
        df (pd.DataFrame): DataFrame containing 'Main_Class' and 'Key_words' columns, where
            'Key_words' holds comma-separated keywords.

    Returns:
        dict: A dictionary where keys are 'Main_Class' and values are lists of keywords,
            in first-seen order.
    """
    keywords_dict = {}
    for class_name, raw_keywords in zip(df['Main_Class'], df['Key_words']):
        # Blank spreadsheet cells arrive as NaN, which has no .split()
        if pd.isna(class_name) or pd.isna(raw_keywords):
            continue

        # setdefault lets repeated classes accumulate keywords rather than overwrite them
        class_keywords = keywords_dict.setdefault(class_name, [])
        for keyword in str(raw_keywords).split(','):
            keyword = keyword.strip().lower()
            # Skip blanks and repeats so a keyword cannot be counted twice for one class
            if keyword and keyword not in class_keywords:
                class_keywords.append(keyword)
    return keywords_dict

def prepare_keywords_and_classes(keywords_dict):
    """
    Flattens the class-to-keywords mapping into two parallel lists.

    Parameters:
        keywords_dict (dict): A dictionary with 'Main_Class' as keys and lists of keywords as values.

    Returns:
        tuple: A tuple containing:
            - List of all keywords, in dictionary order.
            - List with the class of each keyword at the same position.
    """
    # One (keyword, class) pair per keyword, walking classes in dictionary order
    pairs = [
        (term, label)
        for label, terms in keywords_dict.items()
        for term in terms
    ]
    flat_keywords = [term for term, _ in pairs]
    flat_classes = [label for _, label in pairs]
    return flat_keywords, flat_classes

def vectorize_texts(keywords_list, reviews_list):
    """
    Builds TF-IDF vectors for keywords and reviews over one shared vocabulary.

    Both lists are concatenated (keywords first) and fitted with a single vectorizer,
    so the resulting keyword and review rows live in the same feature space.

    Parameters:
        keywords_list (list): List of keywords.
        reviews_list (list): List of reviews (strings).

    Returns:
        tuple: A tuple containing:
            - TF-IDF matrix for reviews.
            - TF-IDF matrix for keywords.
            - The fitted vectorizer.
    """
    n_keywords = len(keywords_list)

    tfidf = TfidfVectorizer(stop_words='english')
    stacked = tfidf.fit_transform(keywords_list + reviews_list)

    # Rows [0, n_keywords) are the keywords; everything after belongs to the reviews
    return stacked[n_keywords:], stacked[:n_keywords], tfidf

def calculate_cosine_similarity(reviews_tfidf, keywords_tfidf):
    """
    Computes the pairwise cosine similarity of every review against every keyword.

    Parameters:
        reviews_tfidf (scipy.sparse.csr_matrix): TF-IDF matrix for reviews.
        keywords_tfidf (scipy.sparse.csr_matrix): TF-IDF matrix for keywords.

    Returns:
        numpy.ndarray: Similarity scores with one row per review and one column per keyword.
    """
    scores = cosine_similarity(reviews_tfidf, keywords_tfidf)
    return scores

def match_keywords_to_reviews(similarity_scores, keyword_classes, df_reviews, threshold=0.2):
    """
    Matches keywords to reviews based on cosine similarity and counts the occurrences of each keyword class in each review.

    Reviews and score rows are paired by position, so the result does not depend on the
    index labels of df_reviews (a filtered DataFrame whose index was not reset still works).

    Parameters:
        similarity_scores (numpy.ndarray): Matrix of cosine similarity scores between reviews and keywords
            (one row per review, one column per keyword).
        keyword_classes (list): List of classes corresponding to the keywords (one per column).
        df_reviews (pd.DataFrame): DataFrame containing the review data; must have a
            'HousingAssociation' column and one row per row of similarity_scores.
        threshold (float): Cosine similarity threshold for considering a match.

    Returns:
        list: A list of dictionaries with 'HousingAssociation', 'Class', and 'Count' for each matched class.

    Raises:
        ValueError: If df_reviews and similarity_scores do not have the same number of rows.
    """
    if len(df_reviews) != len(similarity_scores):
        raise ValueError(
            "df_reviews and similarity_scores must have the same number of rows "
            f"({len(df_reviews)} != {len(similarity_scores)})"
        )

    keyword_counts = []
    # zip pairs the i-th review with the i-th score row, regardless of index labels
    for ha_name, review_scores in zip(df_reviews['HousingAssociation'], similarity_scores):
        matches = {}
        for class_name, score in zip(keyword_classes, review_scores):
            if score > threshold:  # Only count matches above the threshold
                matches[class_name] = matches.get(class_name, 0) + 1

        # One output record per class that matched this review
        keyword_counts.extend(
            {'HousingAssociation': ha_name, 'Class': class_name, 'Count': count}
            for class_name, count in matches.items()
        )

    return keyword_counts

def visualize_keyword_distribution(keyword_df):
    """
    Visualizes the keyword distribution using a heatmap.

    The counts are summed per Housing Association (rows) and class (columns) and drawn
    as an annotated heatmap in a new matplotlib figure, which is then shown.

    Parameters:
        keyword_df (pd.DataFrame): DataFrame containing the keyword count distribution,
            with columns 'HousingAssociation', 'Class' and 'Count'.
    """
    # Create a pivot table for the heatmap
    distribution = keyword_df.pivot_table(index='HousingAssociation', columns='Class', values='Count', aggfunc='sum', fill_value=0)

    # Grow the figure with the number of associations so rows stay readable;
    # small tables keep the original 2-inch height.
    fig_height = max(2, 0.5 * len(distribution))

    # Plotting the heatmap
    plt.figure(figsize=(15, fig_height))
    # fmt='.0f' prints whole counts; the default annotation format would show
    # values of 100 or more in scientific notation.
    sns.heatmap(distribution, annot=True, fmt='.0f', cmap='YlGnBu', cbar=True)
    plt.title("Keyword Distribution for Housing Association Reviews")
    plt.xlabel("Social Housing Issues (Classes)")
    plt.ylabel("Housing Association")
    plt.show()

def main(reviews_file_path="YOURpath/combined_df.csv", themes_file_path="YOURpath/Themes2D.xlsx"):
    """
    Main function to execute the entire process of analyzing Housing Association reviews,
    matching keywords based on similarity, and visualizing the results.

    Parameters:
        reviews_file_path (str): Path to the reviews CSV. The default is a placeholder;
            replace 'YOURpath' or pass the real path.
        themes_file_path (str): Path to the themes Excel file. The default is a placeholder;
            replace 'YOURpath' or pass the real path.
    """
    # Step 1: Load and filter reviews data
    df_reviews = load_and_filter_reviews(reviews_file_path)

    # Nothing to classify: stop before computing similarities against an empty review set
    if df_reviews.empty:
        print("No reviews with RatingValue == 1 found.")
        return

    # Step 2: Load and process themes data
    df_themes = load_and_process_themes(themes_file_path)

    # Step 3: Create a dictionary of keywords grouped by class
    keywords_dict = create_keywords_dict(df_themes)

    # Step 4: Prepare lists of keywords and their corresponding classes
    keywords_list, keyword_classes = prepare_keywords_and_classes(keywords_dict)

    # Step 5: Vectorize the reviews and keywords using TF-IDF
    # (the fitted vectorizer is returned as well but is not needed here)
    reviews_tfidf, keywords_tfidf, _ = vectorize_texts(keywords_list, df_reviews['ReviewBody'].tolist())

    # Step 6: Calculate cosine similarity between reviews and keywords
    similarity_scores = calculate_cosine_similarity(reviews_tfidf, keywords_tfidf)

    # Step 7: Match keywords to reviews and count occurrences by class
    keyword_counts = match_keywords_to_reviews(similarity_scores, keyword_classes, df_reviews)

    # Step 8: Convert the counts into a DataFrame
    keyword_df = pd.DataFrame(keyword_counts)

    # Step 9: Check if the DataFrame is empty, else proceed with visualization
    if keyword_df.empty:
        print("No keywords found in any review.")
        return

    print("Keyword Distribution for Housing Association:")
    distribution = keyword_df.pivot_table(index='HousingAssociation', columns='Class', values='Count', aggfunc='sum', fill_value=0)
    print(distribution)
    visualize_keyword_distribution(keyword_df)

# Run the pipeline only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
