# Modeling Examples

In the examples below we use a dataframe (the table that holds our data) that includes the number of unifying and polarizing words. How these counts are produced depends on how you run your sentiment analysis.

**Main point:** we use the number of unifying and polarizing words (gathered through the sentiment analysis) to determine the political party of the president giving the speech.

## Point-Biserial Correlation

### Imports

In [None]:
import pandas as pd
from scipy.stats import pointbiserialr
from pathlib import Path
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


### Prep Data

In [None]:
# Encode 'Political Party' as a binary variable (Republican -> 0, Democrat -> 1)
# for the point-biserial correlation and the logistic regression.
# df2 is the dataframe that holds the data.
party_encoding = {'Republican': 0, 'Democrat': 1}

# Series.map() turns every value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# wipe out the whole column. Only encode while raw party labels are present.
if df2['Political Party'].isin(list(party_encoding)).any():
    df2['Political Party'] = df2['Political Party'].map(party_encoding)


### Point Biserial Correlation Analysis

In [None]:
# Point-biserial correlation between the binary party code and 'Overall Ratio'
party_column = df2['Political Party']
ratio_column = df2['Overall Ratio']
correlation, p_value = pointbiserialr(party_column, ratio_column)

# Report both statistics rounded to five decimal places
for label, value in (("Correlation:", correlation), ("p-value:", p_value)):
    print(label, round(value, 5))



## Logistic Regression

In [None]:
# Features and target
# X: the independent variables (every column of df2 except the target and
#    the "Name" / "Overall Language" columns dropped below)
# y: the dependent variable, i.e. what we are trying to predict from X
target_column = "Political Party"
excluded_columns = [target_column, "Name", "Overall Language"]
X = df2.drop(columns=excluded_columns)
y = df2[target_column]

# Split the data: 80% to train the model (teach it what to look for) and
# 20% held out to test how well it predicts.
# stratify=y keeps the party proportions the same in both splits;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standard scaling: the scaler is fit on the training rows only and the
# same fitted scaler is then applied to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the logistic regression model and predict the held-out rows
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
predictions = model.predict(X_test)

# Confusion matrix: rows are the actual classes, columns the predicted ones
cm = confusion_matrix(y_test, predictions)
tn, fp = cm[0, 0], cm[0, 1]
fn, tp = cm[1, 0], cm[1, 1]

# Evaluate the model
# (training accuracy, if wanted, is model.score(X_train, y_train))
test_accuracy = accuracy_score(y_test, predictions)
print("Testing Accuracy:", test_accuracy)
print("\nConfusion Matrix:")
print("                 Predicted Negative  Predicted Positive")
print("Actual Negative      TN = {:<5}        FP = {:<5}".format(tn, fp))
print("Actual Positive      FN = {:<5}        TP = {:<5}".format(fn, tp))
print("\nClassification Report:\n", classification_report(y_test, predictions))


## Bias And Uncertainty

### Calibration Curve

In [None]:
# Calibration curve (reliability curve)
# Shows how well the predicted probabilities match how often the positive
# class is actually observed.

import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

# Predicted probability of the positive class (second column of predict_proba)
y_prob = model.predict_proba(X_test)[:, 1]

# Group the test predictions into 10 probability bins
prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)

# Model curve first, then the diagonal a perfectly calibrated model would follow
plt.plot(prob_pred, prob_true, label="Calibration Curve", marker='o')
plt.plot([0, 1], [0, 1], label="Perfect Calibration", linestyle="--")
plt.ylabel("Observed Frequency")
plt.xlabel("Predicted Probability")
plt.legend()
plt.show()


### Brier Score

In [None]:
# Brier score
# Mean squared difference between the predicted probability and the actual
# 0/1 outcome, i.e. how accurate the predicted probabilities are.
from sklearn.metrics import brier_score_loss

# Lower is better; 0 would mean every predicted probability was exactly right
brier_score = brier_score_loss(y_test, y_prob)
brier_message = f"Brier Score: {brier_score:.4f}"
print(brier_message)


### Calculating Uncertainty (Entropy)

In [None]:
# Entropy
# Measures how uncertain each predicted probability is.

from scipy.stats import entropy

# One two-outcome distribution per test row: [P(class 1), P(class 0)].
# axis=0 computes the entropy of each row's distribution separately.
class_probabilities = [y_prob, 1 - y_prob]
entropies = entropy(class_probabilities, axis=0)

# Higher values = higher uncertainty. scipy uses the natural log by default,
# so a 50/50 prediction scores ln(2) ~ 0.693 and a certain one scores 0.
mean_entropy = np.mean(entropies)
print(f"Entropy (Mean): {mean_entropy:.4f}")


## View Feature Importance

In [None]:
# Extract feature coefficients
coefficients = model.coef_[0]
features = X.columns  # Fix: Use X instead of X_train to get feature names

# Create a DataFrame for better readability
feature_importance = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
# feature_importance['Absolute Coefficient'] = np.abs(feature_importance['Coefficient'])
# feature_importance = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

print("Feature Importance for Logistic Regression:")
print(feature_importance)
