In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn import metrics


import warnings
warnings.filterwarnings("ignore")

In [9]:
# Read the bigram count table from disk.
bigrams = pd.read_csv("bigram_count.csv")

# Keep only the rows that have a score_0 value; `bigrams` itself is left
# untouched so later cells can still see every row.
has_score = bigrams["score_0"].notna()
df = bigrams[has_score]

# Column labels at positions 1..200 — these are what the later cells iterate
# over as the bigram columns.
bigs = df.columns[1:201]

In [10]:
# Build 0/1 "was this bigram used at all" versions of the bigram columns,
# one frame per party.
#
# The party subsets are taken as explicit copies: the loop below assigns
# columns on them, and assigning onto the result of a boolean filter is the
# chained-assignment pattern pandas reports with SettingWithCopyWarning (a
# warning that is silenced globally at the top of this notebook).
b2 = bigrams.copy()
b2r = b2[b2["party"] == "R"].copy()
b2d = b2[b2["party"] == "D"].copy()
for i, b in enumerate(bigs):
    # 1 where the value in this column is at least 1, otherwise 0.
    b2r[b] = np.where(b2r[b] >= 1, 1, 0)
    b2d[b] = np.where(b2d[b] >= 1, 1, 0)
    if i % 20 == 0:
        # Coarse progress marker: prints the column position every 20 columns.
        print(i)

0
20
40
60
80
100
120
140
160
180


In [11]:
# Column-wise totals over the bigram columns of each party frame.
# With the 0/1 columns produced by the preceding cell, each total is the
# number of rows in that party's frame whose indicator for the bigram is 1.
b2ds = b2d.loc[:, bigs].sum(axis=0)
b2rs = b2r.loc[:, bigs].sum(axis=0)

In [12]:
# Approach 1: Score by total frequency
#
# For every bigram column b:
#   rtotal = sum of b over rows with party == "R"
#   dtotal = sum of b over rows with party == "D"
#   rscore = rtotal / (rtotal + dtotal)
# Bigrams with rscore >= 0.60 are collected in `reps`, those with
# rscore <= 0.40 in `dems`; everything in between is left unclassified.
reps = []
dems = []

# Parallel lists, one entry per bigram, in the same order as `bigs`.
rt = []
dt = []
rscores = []

# The party subsets do not depend on the bigram, so build them once instead of
# re-filtering `df` twice on every loop iteration.
df_rep = df[df["party"] == "R"]
df_dem = df[df["party"] == "D"]

for b in bigs:
    rtotal = df_rep[b].sum()
    dtotal = df_dem[b].sum()
    total = rtotal + dtotal
    # A bigram with a zero combined total has no defined share. Record NaN
    # explicitly rather than relying on a (silenced) divide-by-zero warning;
    # NaN fails both comparisons below, so such a bigram stays unclassified.
    rscore = rtotal / total if total != 0 else np.nan
    if rscore >= 0.60:
        reps.append(b)
    elif rscore <= 0.40:
        dems.append(b)
    rt.append(rtotal)
    dt.append(dtotal)
    rscores.append(rscore)

In [13]:
# Approach 2: Score by Indicator
#
# Same ratio as a frequency-based share, but computed from the per-party
# indicator totals (`b2rs`, `b2ds`): for each bigram,
#   rscore2 = R total / (R total + D total).

# Per-bigram totals, in the same order as `bigs`.
rt2 = [b2rs[name] for name in bigs]
dt2 = [b2ds[name] for name in bigs]

# Republican share of the combined total for each bigram.
rscores2 = [r_count / (r_count + d_count) for r_count, d_count in zip(rt2, dt2)]

# Classification by share: >= 0.60 goes to reps2, <= 0.40 goes to dems2.
# The two conditions cannot both hold, so the lists are disjoint.
reps2 = [name for name, share in zip(bigs, rscores2) if share >= 0.60]
dems2 = [name for name, share in zip(bigs, rscores2) if share <= 0.40]

In [7]:
# Per-bigram summary table.
#   rtotal / dtotal : indicator-based totals (rt2 / dt2)
#   rscore          : share from Approach 1 (total frequency)
#   rscore2         : share from Approach 2 (indicator)
#   diff            : absolute difference between the two shares
bigsdf2 = pd.DataFrame({
    "bigram": bigs,
    "rtotal": rt2,
    "dtotal": dt2,
    "rscore": rscores,
    "rscore2": rscores2,
    "diff": np.abs(np.array(rscores) - np.array(rscores2)),
})

# Bucket each bigram by its indicator-based share (rscore2):
#   d_solid: share <= 0.25        d_lean: 0.25 < share <= 0.40
#   r_lean : 0.60 <= share < 0.75 r_solid: share >= 0.75
# Both bounds of the "lean" buckets are taken from rscore2. Mixing in rscore
# for the upper bound would let a bigram be flagged lean and solid at once
# (or fall into neither bucket) whenever the two shares disagree.
share = bigsdf2["rscore2"]
bigsdf2["r_lean"] = np.where((share >= 0.60) & (share < 0.75), 1, 0)
bigsdf2["r_solid"] = np.where(share >= 0.75, 1, 0)
bigsdf2["d_lean"] = np.where((share <= 0.40) & (share > 0.25), 1, 0)
bigsdf2["d_solid"] = np.where(share <= 0.25, 1, 0)
bigsdf2.head()

NameError: name 'rscores' is not defined

In [None]:
def _bigrams_flagged(flag):
    """Return the bigram names whose `flag` column in bigsdf2 equals 1."""
    return bigsdf2.loc[bigsdf2[flag] == 1, "bigram"].values


# Row-wise sums of the bigram columns belonging to each bucket.
r_lean = df[_bigrams_flagged("r_lean")].sum(axis=1)
r_solid = df[_bigrams_flagged("r_solid")].sum(axis=1)
d_lean = df[_bigrams_flagged("d_lean")].sum(axis=1)
d_solid = df[_bigrams_flagged("d_solid")].sum(axis=1)
total = r_lean + r_solid + d_lean + d_solid

# One row per row of `df`: identifying columns, the four bucket sums, the
# per-side totals (rt, dt), the grand total, and score_0 renamed to "score".
df_leans2 = pd.DataFrame({
    "custom_id": df["custom_id"],
    "name": df["name"],
    "is_rep": np.where(df["party"] == "R", 1, 0),
    "position": df["position"],
    "r_lean": r_lean,
    "r_solid": r_solid,
    "rt": r_lean + r_solid,
    "d_lean": d_lean,
    "d_solid": d_solid,
    "dt": d_lean + d_solid,
    "total": total,
    "score": df["score_0"],
})
print(df_leans2.shape)
print("\t\t\t Bigram Frequencies (Categorized)")
df_leans2.head()


In [None]:
def scale(x, min_num, max_num, scale_min=0, scale_max=1):
    """Linearly map x from [min_num, max_num] onto [scale_min, scale_max].

    min_num maps to scale_min and max_num maps to scale_max. No clipping is
    applied, so values of x outside [min_num, max_num] land outside the
    target range. min_num == max_num results in a division by zero.
    """
    offset = x - min_num
    target_width = scale_max - scale_min
    source_width = max_num - min_num
    return offset * target_width / source_width + scale_min
# (X_test["pred_score"] - min(X_test["pred_score"])) * (1 - 0) / (max(X_test["pred_score"]) - min(X_test["pred_score"])) + 0


In [None]:
# Final Logistic Regression Model (Predict Party)

# Fixed seed so the train/test split — and every number printed below — is
# the same on each re-run of the notebook.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    df_leans2, df_leans2["is_rep"], test_size=0.25, random_state=SPLIT_SEED
)
# Columns are added to both frames below, so work on independent copies
# rather than on the objects handed back by the split.
X_train = X_train.copy()
X_test = X_test.copy()

# Reciprocal of d_lean; not used by either model in this cell. Evaluates to
# inf wherever d_lean is 0.
X_train["dli"] = 1 / X_train["d_lean"]
cols = ["rt", "dt"]


predict_party_model = LogisticRegression()
predict_party_model.fit(X_train[cols], y_train)
# Training accuracy of the party classifier.
print(predict_party_model.score(X_train[cols], y_train))

X_train["pred_rep"] = predict_party_model.predict(X_train[cols])
X_test["pred_rep"] = predict_party_model.predict(X_test[cols])
# Held-out accuracy: fraction of test rows whose predicted party matches.
print((X_test["is_rep"] == X_test["pred_rep"]).mean())
# X_test.head()

# Final Linear Regression Model (Predict Trump Score)
cols = ["r_lean", "r_solid", "d_lean", "d_solid", "pred_rep"]
predict_ts_model = LinearRegression()

# The regression target is the "score" column. y_train holds the party label
# (is_rep), which is the target of the classifier above, not of this model.
score_train = X_train["score"]
predict_ts_model.fit(X_train[cols], score_train)

# R^2 of the score regression on the training rows.
print(predict_ts_model.score(X_train[cols], score_train))


# Raw predictions are min-max rescaled to [0, 1], separately for the test and
# the training frame (each uses its own min and max).
X_test["pred_score"] = predict_ts_model.predict(X_test[cols])
X_test["pred_score"] = scale(X_test["pred_score"], min(X_test["pred_score"]), max(X_test["pred_score"]), 0, 1)

X_train["pred_score"] = predict_ts_model.predict(X_train[cols])
X_train["pred_score"] = scale(X_train["pred_score"], min(X_train["pred_score"]), max(X_train["pred_score"]), 0, 1)
X_test.head()

In [None]:
# 2x2 correlation matrix between the actual and the predicted score on the
# held-out rows; the off-diagonal entry is the correlation coefficient.
score_corr = np.corrcoef(X_test["score"], X_test["pred_score"])
print(score_corr)

# Distribution of the rescaled predictions on the held-out rows.
fig, ax = plt.subplots()
ax.hist(X_test["pred_score"], bins=30)
# ax.hist(X_test["score"], bins=30)
plt.show()