<a href="https://colab.research.google.com/github/Lexi-Zhou/stats201-project-zzz/blob/main/Code/W4_2_Model1_RandomForest_%26_LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import os
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Mount Google Drive at /content/drive; the dataset paths configured in this
# notebook live under that mount point. This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Loop Random Forest & Logistic Regression processing

In [3]:
# Folder on the mounted Drive that holds the Week 4 input CSVs.
base_path = '/content/drive/MyDrive/Colab Notebooks/stats201_final_project/Week_4/'

# Dataset label -> CSV file name (relative to base_path).
# Insertion order is preserved, so datasets are processed in the order listed.
file_dict = dict(
    df1_stem_poor='18_RMP_stem_poor.csv',
    df2_stem_average='18_RMP_stem_average.csv',
    df3_stem_good='18_RMP_stem_good.csv',
    df4_humanities_poor='18_RMP_humanities_poor.csv',
    df5_humanities_average='18_RMP_humanities_average.csv',
    df6_humanities_good='18_RMP_humanities_good.csv',
)

# Filled in later, one entry per dataset label.
all_results = dict()

In [8]:
def process_and_store_results(df, comment_col):
    """Train Random Forest and Logistic Regression classifiers on one comment column.

    The text in ``comment_col`` is turned into TF-IDF features and used to
    predict ``df['prof_gender_finalized']``. Performance is printed, and the
    per-word Random Forest importances are joined with the Logistic Regression
    coefficients.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``comment_col`` and ``'prof_gender_finalized'``.
    comment_col : str
        Name of the text column to vectorize. Missing values are treated as
        empty strings.

    Returns
    -------
    tuple
        ``(merged_df_sorted, classification_report_rf, classification_report_lr,
        accuracy_rf, oob_score_rf, accuracy_lr)`` where ``merged_df_sorted`` has
        the columns ``Word``, ``Feature Importance``, ``Coefficient`` and
        ``Direction``, sorted by descending ``Feature Importance``. The reports
        are the text reports returned by ``classification_report``.
    """
    # 1. Extract X and y (cast to str so stray non-text cells cannot break tokenizing)
    texts = df[comment_col].fillna('').astype(str)
    labels = df['prof_gender_finalized']

    # 2. Split the RAW text first. With the same random_state this selects the
    #    same rows as splitting the vectorized matrix would.
    texts_train, texts_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # 3. Fit the vectorizer on the training text only, then apply it to the test
    #    text. Fitting on the full corpus would let test-set vocabulary and IDF
    #    statistics leak into the features the models are evaluated on.
    vectorizer = TfidfVectorizer(max_features=2000)
    X_train = vectorizer.fit_transform(texts_train)
    X_test = vectorizer.transform(texts_test)

    # 4. Train RandomForestClassifier
    rf_model = RandomForestClassifier(random_state=42, oob_score=True)
    rf_model.fit(X_train, y_train)

    # 5. Train LogisticRegression model
    lr_model = LogisticRegression(random_state=42, solver='liblinear')
    lr_model.fit(X_train, y_train)

    # 6. Report model performance and capture metrics
    y_pred_rf = rf_model.predict(X_test)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    oob_score_rf = rf_model.oob_score_
    classification_report_rf = classification_report(y_test, y_pred_rf)

    y_pred_lr = lr_model.predict(X_test)
    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    classification_report_lr = classification_report(y_test, y_pred_lr)

    # Print reports (for visibility during execution)
    print(f"--- RandomForestClassifier Performance for {comment_col} ---")
    print(f"Random Forest Accuracy: {accuracy_rf:.2%}")
    print(f"Random Forest OOB Score: {oob_score_rf:.2f}")
    print("Random Forest Classification Report:")
    print(classification_report_rf)

    print(f"\n--- LogisticRegression Performance for {comment_col} ---")
    print(f"Logistic Regression Accuracy: {accuracy_lr:.2%}")
    print("Logistic Regression Classification Report:")
    print(classification_report_lr)

    # 7. Extract feature importances from Random Forest
    feature_names = vectorizer.get_feature_names_out()
    rf_importance_df = pd.DataFrame({
        'Word': feature_names,
        'Feature Importance': rf_model.feature_importances_
    }).sort_values(by='Feature Importance', ascending=False)

    # 8. Extract coefficients from Logistic Regression.
    #    NOTE(review): coef_[0] / classes_[0..1] are read as a binary problem
    #    (positive coefficient -> classes_[1]); confirm the target has exactly
    #    two classes in every dataset.
    coefficients = lr_model.coef_[0]
    # str() keeps this working when the class labels are not strings.
    positive_label = str(lr_model.classes_[1]).capitalize()
    negative_label = str(lr_model.classes_[0]).capitalize()

    directions = [
        positive_label if coef > 0 else negative_label if coef < 0 else "Neutral"
        for coef in coefficients
    ]

    lr_direction_df = pd.DataFrame({
        'Word': feature_names,
        'Coefficient': coefficients,
        'Direction': directions
    }).sort_values(by='Coefficient', ascending=False)

    # 9. Merge and sort DataFrames
    merged_df = pd.merge(rf_importance_df, lr_direction_df, on='Word', how='inner')
    merged_df_sorted = merged_df.sort_values(by='Feature Importance', ascending=False)

    return merged_df_sorted, classification_report_rf, classification_report_lr, accuracy_rf, oob_score_rf, accuracy_lr

print("The 'process_and_store_results' function has been redefined to return additional metrics.")

The 'process_and_store_results' function has been redefined to return additional metrics.


In [None]:
def process_individual_dataframe(file_path, dataset_name):
    """Run the RF/LR analysis on both comment variants of one dataset CSV.

    Parameters
    ----------
    file_path : str
        Path of the CSV to load. It must provide the ``comments_original`` and
        ``comments_scrubbed`` columns, plus whatever
        ``process_and_store_results`` reads.
    dataset_name : str
        Label used in the printed progress messages.

    Returns
    -------
    tuple
        ``(merged_original_df, merged_scrubbed_df, metrics)`` where ``metrics``
        is ``{'original': {...}, 'scrubbed': {...}}`` and each inner dict has
        the keys ``rf_report``, ``lr_report``, ``rf_accuracy``,
        ``rf_oob_score`` and ``lr_accuracy``.
    """
    # Load the CSV into a DataFrame (pandas is imported at the top of the notebook)
    df = pd.read_csv(file_path)

    merged_frames = {}
    metrics = {}
    # Same steps for both variants; 'original' runs first, as before.
    for variant in ('original', 'scrubbed'):
        print(f"\n--- Processing {dataset_name} ({variant.capitalize()} Comments) ---")
        merged_df, rf_report, lr_report, rf_acc, rf_oob, lr_acc = process_and_store_results(
            df, f'comments_{variant}'
        )
        print(f"\nTop 20 Words for {dataset_name} ({variant} comments):")
        print(merged_df.head(20))

        merged_frames[variant] = merged_df
        metrics[variant] = {
            'rf_report': rf_report,
            'lr_report': lr_report,
            'rf_accuracy': rf_acc,
            'rf_oob_score': rf_oob,
            'lr_accuracy': lr_acc,
        }

    return merged_frames['original'], merged_frames['scrubbed'], metrics

print("The 'process_individual_dataframe' function has been redefined to handle additional metrics.")

**Reasoning**:
Both `process_and_store_results` and `process_individual_dataframe` now return the additional metrics, so the loop below must be re-run over every dataset to repopulate `all_results` with the new structure (the top-word tables plus the performance metrics).



In [None]:
# Run the full analysis for every configured dataset and collect the outputs
# in all_results, keyed by dataset label.
banner = "======================================================="

for label, filename in file_dict.items():
    full_file_path = os.path.join(base_path, filename)

    print("\n\n" + banner)
    print(f"Processing dataset: {label} (File: {filename})")
    print(banner)

    merged_original_df, merged_scrubbed_df, metrics_dict = process_individual_dataframe(
        full_file_path, label
    )

    # One entry per dataset: both word tables plus the metrics for each variant.
    all_results[label] = dict(
        original=merged_original_df,
        scrubbed=merged_scrubbed_df,
        metrics=metrics_dict,
    )

print("All datasets processed and results stored in 'all_results'.")

# Output: txt



In [None]:
# Write one plain-text report covering every dataset in all_results: for each
# comment variant, the model metrics, both classification reports and the top
# 20 words. The file is placed next to the input CSVs, so the location is
# derived from base_path rather than repeated as a second hard-coded literal.
output_file_name = os.path.join(base_path, '19_combined_top_20_words_RFanalysis.txt')

dataset_rule = "========================================================\n"
section_rule = "--------------------------------------\n"
table_columns = ['Word', 'Feature Importance', 'Coefficient', 'Direction']

# Explicit UTF-8 so tokens containing non-ASCII characters are written the same
# way regardless of the platform's default encoding.
with open(output_file_name, 'w', encoding='utf-8') as f:
    for label, results_dict in all_results.items():
        f.write(dataset_rule)
        f.write(f"--- Analysis for {label} ---\n")
        f.write(dataset_rule)
        f.write("\n")

        for comment_type in ['original', 'scrubbed']:
            f.write(section_rule)
            f.write(f"--- {comment_type.capitalize()} Comments ---\n")
            f.write(section_rule)

            # Extract and write performance metrics
            metrics = results_dict['metrics'][comment_type]
            f.write(f"Random Forest Accuracy: {metrics['rf_accuracy']:.2%}\n")
            f.write(f"Random Forest OOB Score: {metrics['rf_oob_score']:.2f}\n")
            f.write(f"Logistic Regression Accuracy: {metrics['lr_accuracy']:.2%}\n\n")

            f.write("Random Forest Classification Report:\n")
            f.write(metrics['rf_report'])
            f.write("\n")

            f.write("Logistic Regression Classification Report:\n")
            f.write(metrics['lr_report'])
            f.write("\n")

            # Extract and write top 20 words
            df_to_write = results_dict[comment_type].head(20)

            f.write("Top 20 Words:\n")
            f.write(f"{'Word':<20} {'Feature Importance':<20} {'Coefficient':<15} {'Direction':<10}\n")
            f.write(f"{'':-<20} {'':-<20} {'':-<15} {'':-<10}\n")
            # Plain tuples avoid building a Series per row and drop the unused index.
            for word, importance, coefficient, direction in df_to_write[table_columns].itertuples(index=False, name=None):
                f.write(f"{word:<20} {importance:<20.6f} {coefficient:<15.6f} {direction:<10}\n")
            f.write("\n")

print(f"Comprehensive report written to {output_file_name}")

# Output: Heatmap


In [None]:
# For each dataset, keep the 'Word' and 'Coefficient' columns of the first 20
# rows of its scrubbed-comment table (the tables are ordered by descending
# Random Forest feature importance).
top_scrubbed_words_coefficients = {
    label: results_dict['scrubbed'].head(20)[['Word', 'Coefficient']]
    for label, results_dict in all_results.items()
}

In [None]:
# One single-column frame per dataset: indexed by word, with the coefficient
# column renamed to the dataset label so the frames can sit side by side.
all_words_coeffs_dfs = [
    top_words.set_index('Word').rename(columns={'Coefficient': label})
    for label, top_words in top_scrubbed_words_coefficients.items()
]

# Outer-join on the word index to keep every word that is in any dataset's
# top 20, then:
#   * fill the gaps with 0 -- a 0 here means "not in that dataset's top 20"
#     (it renders as the neutral colour), not a fitted coefficient of zero;
#   * order the words alphabetically for a stable presentation.
heatmap_data_coeffs = (
    pd.concat(all_words_coeffs_dfs, axis=1, join='outer')
    .fillna(0)
    .sort_index()
)

print("Prepared heatmap_data_coeffs DataFrame:")
print(heatmap_data_coeffs.head())

In [None]:
# For each dataset, keep the 'Word' and 'Feature Importance' columns of the
# first 20 rows of its scrubbed-comment table.
top_scrubbed_words_importances = {
    label: results_dict['scrubbed'].head(20)[['Word', 'Feature Importance']]
    for label, results_dict in all_results.items()
}

# One single-column frame per dataset: indexed by word, with the importance
# column renamed to the dataset label.
all_words_importances_dfs = [
    top_words.set_index('Word').rename(columns={'Feature Importance': label})
    for label, top_words in top_scrubbed_words_importances.items()
]

# Outer-join on the word index to keep every word that is in any dataset's
# top 20. Gaps become 0 ("not in that dataset's top 20", not a measured
# importance of zero), and the words are ordered alphabetically.
heatmap_data_importances = (
    pd.concat(all_words_importances_dfs, axis=1, join='outer')
    .fillna(0)
    .sort_index()
)

print("Prepared heatmap_data_importances DataFrame:")
print(heatmap_data_importances.head())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Score every word by its Random Forest importance summed over all datasets
# (row-wise sum), and keep the 30 highest-scoring words.
combined_importances = heatmap_data_importances.sum(axis=1)
top_30_overall_words = combined_importances.nlargest(30).index

# Restrict both matrices to those words, in the same (descending score) order,
# so the colours and the annotations line up cell for cell.
filtered_heatmap_data_coeffs = heatmap_data_coeffs.loc[top_30_overall_words]
filtered_heatmap_data_importances = heatmap_data_importances.loc[top_30_overall_words]

# Colour = Logistic Regression coefficient (diverging map centred on 0);
# cell text = Random Forest feature importance.
fig, ax = plt.subplots(figsize=(14, 12))  # large canvas so 30 word rows stay readable
sns.heatmap(
    filtered_heatmap_data_coeffs,
    cmap='RdBu_r',
    center=0,
    annot=filtered_heatmap_data_importances,
    fmt='.3f',
    linewidths=.5,
    linecolor='lightgray',
    ax=ax,
)
# NOTE(review): the colour legend in the title assumes positive coefficients
# point to the male class -- confirm against the label ordering of the target.
ax.set_title('Top 30 Overall Scrubbed Words: Coefficients (Color) and Feature Importances (Annotation)\n Red: Male-leaning, Blue: Female-leaning', fontsize=16)
ax.set_xlabel('Dataset', fontsize=12)
ax.set_ylabel('Words', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

print("Heatmap displayed with overall top 30 scrubbed words, coefficients as colors and feature importances as annotations.")

## Report on Current Models

### How the Models Work

- **1. Data Preparation and Feature Extraction (TF-IDF Vectorization)**:
    - **Word Counting & Vectorization**: Each dataset's original and scrubbed comments are processed. A `TfidfVectorizer` is initialized to convert text comments into a matrix of TF-IDF features. This process counts the words in each comment and down-weights words that appear in many comments across the corpus (the inverse-document-frequency part), which helps prevent very common words from dominating the analysis. The `max_features` setting (2000 here) limits the vocabulary to the most frequent terms in the corpus; it ranks terms by frequency, not by how well they discriminate between classes.

- **2. Model Training (Random Forest & Logistic Regression)**:
    - **Data Splitting**: The vectorized data is split into training and testing sets (e.g., 80% train, 20% test) to evaluate model performance on unseen data.
    - **Random Forest Classifier**: A `RandomForestClassifier` is trained on the training data. This ensemble method builds multiple decision trees and merges their predictions to improve accuracy and control overfitting.
    - **Logistic Regression**: A `LogisticRegression` model is also trained. This is a linear model used for binary classification, which models the probability of a binary outcome.

- **3. Performance Evaluation**:
    - **Accuracy Score**: The accuracy of both models is calculated on the test set, showing the proportion of correctly classified instances.
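    - **OOB Score (Random Forest)**: For Random Forest, the Out-Of-Bag (OOB) score is computed: each training sample is scored using only the trees whose bootstrap sample did not contain it. This provides an internal estimate of generalization accuracy without the need for a separate validation set.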
    - **Classification Reports**: Detailed classification reports are generated for both models, including precision, recall, f1-score, and support for each class.

- **4. Feature Importance and Coefficients Extraction**:
    - **Random Forest Feature Importance**: The `feature_importances_` attribute from the trained Random Forest model is extracted. This provides a score for each word (feature) indicating its relative importance in predicting the gender of the professor. Higher values mean more importance.
    - **Logistic Regression Coefficients**: The `coef_[0]` attribute from the trained Logistic Regression model is extracted. These coefficients represent the log-odds change in the dependent variable for a one-unit change in the independent variable (word presence/TF-IDF score). The sign of the coefficient indicates the direction of influence:
        - **Positive Coefficient**: Word is associated with the positive class, i.e. the second entry of the model's `classes_` (e.g., 'Male' leaning).
        - **Negative Coefficient**: Word is associated with the negative class (e.g., 'Female' leaning).

- **5. Heatmap Visualization**:
    - **Combined Importance for Top Words**: For heatmap visualization, the `Feature Importance` from Random Forest and `Coefficient` from Logistic Regression are extracted for the top words from each dataset. An overall importance score for each word is calculated by summing its Random Forest importances across all datasets.
    - **Top 30 Overall Words Selection**: The top 30 words based on this combined overall importance are selected.
    - **Heatmap Generation**: A heatmap is generated where:
        - **Color**: Represents the Logistic Regression `Coefficient` for the selected words across different datasets. A diverging colormap (e.g., 'RdBu_r' centered at 0) clearly shows positive (Red: Male-leaning) and negative (Blue: Female-leaning) associations.
        - **Annotation**: The Random Forest `Feature Importance` values are annotated within each cell, providing a numerical measure of how impactful that word was in the Random Forest model for that specific dataset.

### Key Findings (from the heatmap visualization):

The heatmap displays the top 30 overall scrubbed words. The color of each cell indicates the direction of the word's association (red for male-leaning, blue for female-leaning) and the intensity of the color indicates the strength of this association, according to the Logistic Regression model. The numerical annotations within the cells show the Random Forest feature importance, indicating how much that word contributed to the predictive power of the Random Forest model. Note that a cell showing a neutral color and an importance of 0.000 means the word was not among the top 20 words for that dataset, not that the model assigned it a value of exactly zero. This allows for a comprehensive view of how different words are associated with professor gender across various academic disciplines and performance categories.