In [51]:
import pandas as pd 
from scipy.stats import pointbiserialr


## Prep Data

In [52]:
df = pd.read_csv("./speech_data.csv")

# Subset data with Republican or Democratic Presidents 
df2 = df[(df["Political Party"] == "Republican")|(df["Political Party"] == "Democrat")].copy()

df2["Political Party"].unique()
df2.head()

Unnamed: 0,Name,Unifying Words Count,Polarizing Words Count,Total Words Count,Political Party,Overall Language,Unifying Words Ratio,Polarizing Words Ratio,Overall Ratio
0,Donald J. Trump (2nd Term),20,23,2905,Republican,polarizing,0.00688,0.00792,0.8687
1,"Joseph R. Biden, Jr.",31,6,2532,Democrat,unifying,0.01224,0.00237,5.1646
2,Donald J. Trump (1st Term),12,4,1455,Republican,unifying,0.00825,0.00275,3.0
3,Barack Obama,15,5,2090,Democrat,unifying,0.00718,0.00239,3.0042
4,Barack Obama,13,15,2391,Democrat,polarizing,0.00544,0.00627,0.8676


In [53]:
# Encode Political Party as a binary variable for point biserial correlation analysis 

df2['Political Party'] = df2['Political Party'].map({'Republican': 0, 'Democrat': 1})
df2.head(3)

Unnamed: 0,Name,Unifying Words Count,Polarizing Words Count,Total Words Count,Political Party,Overall Language,Unifying Words Ratio,Polarizing Words Ratio,Overall Ratio
0,Donald J. Trump (2nd Term),20,23,2905,0,polarizing,0.00688,0.00792,0.8687
1,"Joseph R. Biden, Jr.",31,6,2532,1,unifying,0.01224,0.00237,5.1646
2,Donald J. Trump (1st Term),12,4,1455,0,unifying,0.00825,0.00275,3.0


## Conduct Analysis: Point Biserial Correlation

In [58]:
correlation, p_value = pointbiserialr(df2['Political Party'], df2['Overall Ratio'])

print("Correlation:", round(correlation, 5))
print("p-value:", round(p_value, 5))

Correlation: 0.01333
p-value: 0.92681


This did not yield very statistically significant results nor any strong relationship between political party and primary type of language used. Let's try some other methods...

## Trying Logistic Regression

### Fit a Kitchen Sink Model (all features)

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Feature and target selection
X = df2.drop(columns=['Political Party', "Name", "Overall Language"]) # X = df2[["Political Party"]]
y = df2['Political Party']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
print("Training Accuracy:", model.score(X_train, y_train))
print("Testing Accuracy:", accuracy_score(y_test, predictions))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, predictions))
print("\nClassification Report:\n", classification_report(y_test, predictions))


Training Accuracy: 0.6
Testing Accuracy: 0.3

Confusion Matrix:
 [[1 4]
 [3 2]]

Classification Report:
               precision    recall  f1-score   support

           0       0.25      0.20      0.22         5
           1       0.33      0.40      0.36         5

    accuracy                           0.30        10
   macro avg       0.29      0.30      0.29        10
weighted avg       0.29      0.30      0.29        10



### Viewing Feature Importance 

In [60]:
# Extract feature coefficients
coefficients = model.coef_[0]
features = X_train.columns

# Create a DataFrame for better readability
feature_importance = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
# feature_importance['Absolute Coefficient'] = np.abs(feature_importance['Coefficient'])
# feature_importance = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

print("Feature Importance for Logistic Regression:")
print(feature_importance)


Feature Importance for Logistic Regression:
                  Feature  Coefficient
0    Unifying Words Count     0.078096
1  Polarizing Words Count     0.008431
2       Total Words Count    -0.000780
3    Unifying Words Ratio     0.002653
4  Polarizing Words Ratio     0.002131
5           Overall Ratio     0.019097


### Trying Decision Trees 

In [None]:
# DO LATER 