In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Toy corpus, one row per sentence: (text, gender, label).
# In this sample every "male" row carries label 1 and every "female" row label 0.
data = pd.DataFrame(
    [
        ("He is a great leader", "male", 1),
        ("She is emotional", "female", 0),
        ("He is decisive", "male", 1),
        ("She is caring", "female", 0),
        ("He is ambitious", "male", 1),
        ("She is beautiful", "female", 0),
    ],
    columns=["text", "gender", "label"],
)

# Split sentences and labels into train/test partitions; the fixed random_state
# makes the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.3,
    random_state=42,
)

# Learn the bag-of-words vocabulary from the training sentences only, then
# encode both partitions with that same vocabulary.
vectorizer = CountVectorizer().fit(X_train)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [2]:
# Fit a logistic-regression classifier (library defaults) on the vectorized
# training sentences.
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Score the held-out partition.
preds = model.predict(X_test_vec)
acc = accuracy_score(y_test, preds)

# Pin the label set to the classes the model was fitted on. Without `labels`,
# confusion_matrix only uses labels that occur in y_test or preds, so on a
# test split this small the matrix can silently lose a row/column when a
# class happens to be absent.
cm = confusion_matrix(y_test, preds, labels=model.classes_)

print("Accuracy:", acc)
print("Confusion Matrix:\n", cm)


Accuracy: 1.0
Confusion Matrix:
 [[1 0]
 [0 1]]


In [3]:
# Pair every vocabulary term with the coefficient the fitted model assigned
# to it, then order the table from the largest coefficient to the smallest.
weights = pd.DataFrame(
    dict(word=vectorizer.get_feature_names_out(), coef=model.coef_[0])
)
weights = weights.sort_values(by='coef', ascending=False)

# Show at most the ten highest-coefficient terms.
print(weights.head(10))


        word          coef
4         he  5.864743e-01
3   decisive  2.932372e-01
0  ambitious  2.932372e-01
5         is -7.186836e-17
1  beautiful -2.932372e-01
2     caring -2.932372e-01
6        she -5.864743e-01
