### Difference in keyword matches for different providers with MainClassCode

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

def load_data(review_file: str, keyword_file: str):
    """
    Read the two input tables used by the analysis.

    The review file is parsed as a header-less CSV whose two columns are
    labelled 'HA' and 'Review'. The keyword file is read as an Excel sheet.

    Returns:
        A ``(reviews, keywords)`` tuple of DataFrames.
    """
    review_columns = ['HA', 'Review']
    reviews_df = pd.read_csv(review_file, header=None, names=review_columns)
    keywords_df = pd.read_excel(keyword_file)
    return reviews_df, keywords_df

def process_keywords(keywords_df: pd.DataFrame):
    """
    Build a mapping from each class to its list of unique keywords.

    Every row supplies a class name in the 'Class' column and a
    comma-separated keyword string in the 'Key_words' column. Keywords are
    stripped and lower-cased, and rows that share a class are merged.

    Rows whose 'Class' or 'Key_words' cell is missing are skipped, blank
    keywords (e.g. from a trailing comma) are dropped, and duplicates are
    removed while keeping first-seen order so the result is deterministic.
    A class that ends up with no usable keywords is omitted.

    Returns:
        dict mapping class name -> list of keyword strings.
    """
    keywords_dict = {}
    for _, row in keywords_df.iterrows():
        class_name = row['Class']
        raw_keywords = row['Key_words']
        # Empty spreadsheet cells arrive as NaN, which has no .split().
        if pd.isna(class_name) or pd.isna(raw_keywords):
            continue

        cleaned = [
            keyword.strip().lower()
            for keyword in str(raw_keywords).split(',')
            if keyword.strip()
        ]
        if cleaned:
            keywords_dict.setdefault(class_name, []).extend(cleaned)

    # dict.fromkeys de-duplicates while preserving insertion order
    # (set() would make the keyword order vary between runs).
    return {cls: list(dict.fromkeys(kws)) for cls, kws in keywords_dict.items()}

def vectorize_text(reviews: pd.Series, keywords_dict: dict):
    """
    Vectorize reviews and keywords with one shared TF-IDF vocabulary.

    The keywords of all classes are flattened into a single list, and a
    parallel list records which class each keyword belongs to. One
    ``TfidfVectorizer`` (English stop words removed) is fitted on the
    keywords plus the reviews, then used to transform both.

    Args:
        reviews: Series of review texts. Missing values are treated as
            empty strings so that row positions stay aligned with the
            caller's DataFrame.
        keywords_dict: Mapping of class name -> list of keyword strings.

    Returns:
        ``(keywords_tfidf, reviews_tfidf, keyword_classes)`` where row ``i``
        of ``keywords_tfidf`` belongs to class ``keyword_classes[i]``.
    """
    keywords_list = []
    keyword_classes = []
    for cls, kws in keywords_dict.items():
        keywords_list.extend(kws)
        keyword_classes.extend([cls] * len(kws))

    # TfidfVectorizer raises on NaN documents (e.g. an empty review cell),
    # so coerce every review to a string without dropping any row.
    review_texts = reviews.fillna('').astype(str).tolist()

    vectorizer = TfidfVectorizer(stop_words='english').fit(keywords_list + review_texts)

    keywords_tfidf = vectorizer.transform(keywords_list)
    reviews_tfidf = vectorizer.transform(review_texts)

    return keywords_tfidf, reviews_tfidf, keyword_classes

def match_keywords(reviews_df: pd.DataFrame, keywords_tfidf, reviews_tfidf, keyword_classes, threshold=0.2):
    """
    Count, per review, how many keywords of each class match it.

    A keyword matches a review when the cosine similarity between their
    TF-IDF vectors is strictly greater than ``threshold``. One output row
    is produced per (review, class) pair that has at least one match.

    Returns:
        DataFrame built from records with keys 'HA', 'Class' and 'Count'.
    """
    sim = cosine_similarity(reviews_tfidf, keywords_tfidf)

    records = []
    for row_idx, ha_name in enumerate(reviews_df['HA']):
        # Classes of every keyword whose score clears the threshold.
        hit_classes = [
            keyword_classes[col_idx]
            for col_idx, score in enumerate(sim[row_idx])
            if score > threshold
        ]

        per_class = {}
        for cls in hit_classes:
            per_class[cls] = per_class.get(cls, 0) + 1

        records.extend(
            {'HA': ha_name, 'Class': cls, 'Count': hits}
            for cls, hits in per_class.items()
        )

    return pd.DataFrame(records)

def plot_distribution(distribution: pd.DataFrame):
    """
    Show the keyword counts as an annotated heatmap.

    Rows of ``distribution`` are drawn on the y-axis and columns on the
    x-axis. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(12, 8))
    # sns.heatmap returns the Axes it drew on; label that Axes directly.
    axes = sns.heatmap(distribution, annot=True, cmap='YlGnBu', cbar=True)
    axes.set_title("Keyword Distribution by Housing Association")
    axes.set_xlabel("Social Housing Issues (Classes)")
    axes.set_ylabel("Housing Association")
    plt.show()

def chi_square_test(distribution: pd.DataFrame, alpha: float = 0.05):
    """
    Run a Chi-Square test of independence on the count table and print the result.

    Args:
        distribution: Table of counts (one row per Housing Association, one
            column per class, as built by the caller).
        alpha: Significance level the p-value is compared against.

    The test needs at least a 2x2 table. With fewer rows or columns there
    is nothing to compare, so the test is skipped with a message instead
    of raising or reporting a meaningless result.
    """
    n_rows, n_cols = distribution.shape
    if n_rows < 2 or n_cols < 2:
        print("\nChi-Square Test skipped: need at least two Housing Associations and two classes.")
        return

    # The test of independence is symmetric in rows and columns, so the
    # transpose does not change the outcome; it is kept as in the original.
    contingency_table = distribution.T
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    print("\nChi-Square Test Result:")
    print(f"Chi2 Statistic: {chi2}, p-value: {p_value}")

    if p_value < alpha:
        print("The keyword distributions are significantly different across Housing Associations.")
    else:
        print("The keyword distributions are not significantly different across Housing Associations.")

def plot_frequency(distribution: pd.DataFrame):
    """
    Show each row's share of counts per column as an annotated heatmap.

    Every row of ``distribution`` is divided by its own row total, so the
    plotted values in a row sum to 1. Values are annotated with two
    decimals and the figure is displayed with ``plt.show()``.
    """
    row_totals = distribution.sum(axis=1)
    frequency_distribution = distribution.div(row_totals, axis=0)

    plt.figure(figsize=(16, 8))
    # sns.heatmap returns the Axes it drew on; label that Axes directly.
    axes = sns.heatmap(frequency_distribution, annot=True, cmap='YlGnBu', fmt='.2f', cbar=True)
    axes.set_title("Frequency Distribution of Keywords by Housing Association")
    axes.set_xlabel("Social Housing Issues (Classes)")
    axes.set_ylabel("Housing Association")
    plt.show()

def main(review_file: str = "YOUR_PATH/file_name1.csv",
         keyword_file: str = "YOUR_PATH/file_name2.xlsx"):
    """
    Run the full analysis pipeline.

    Args:
        review_file: Path of the review CSV passed to ``load_data``.
        keyword_file: Path of the keyword Excel file passed to ``load_data``.

    The defaults are placeholder paths; pass real paths or edit the
    defaults before running. The steps are: load the data, build the
    class -> keywords mapping, TF-IDF vectorize, match keywords to reviews,
    pivot the matches into an HA x Class count table, then plot it, run the
    Chi-Square test and plot the row-normalized frequencies. If no keyword
    matches any review, a message is printed and the function returns early.
    """
    # Step 1: Load data
    reviews, keywords = load_data(review_file, keyword_file)

    # Step 2: Process keywords
    keywords_dict = process_keywords(keywords)

    # Step 3: Vectorize text
    keywords_tfidf, reviews_tfidf, keyword_classes = vectorize_text(reviews['Review'], keywords_dict)

    # Step 4: Match keywords to reviews
    keyword_counts_df = match_keywords(reviews, keywords_tfidf, reviews_tfidf, keyword_classes)

    # Nothing to pivot or plot when there are no matches at all.
    if keyword_counts_df.empty:
        print("No keywords found in any review.")
        return

    # Step 5: Create and analyze distribution
    # Sum per-review counts into one cell per (HA, Class); absent pairs become 0.
    distribution = keyword_counts_df.pivot_table(
        index='HA', columns='Class', values='Count', aggfunc='sum', fill_value=0
    )
    print("Keyword Distribution by Housing Association:")
    print(distribution)

    # Plot keyword distribution
    plot_distribution(distribution)

    # Perform Chi-Square test
    chi_square_test(distribution)

    # Plot frequency distribution
    plot_frequency(distribution)

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
