In [1]:
import pandas as pd
import numpy as np
import os
from requests_oauthlib import OAuth1Session
import json
import constants
import re
import nltk
import joblib

nltk.download('stopwords')
nltk.download('punkt') 
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk import bigrams

# import stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.stem import PorterStemmer

ps = PorterStemmer()

from nltk.probability import FreqDist
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
analyser = SentimentIntensityAnalyzer()
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, GridSearchCV

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/costinsmilovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/costinsmilovici/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/costinsmilovici/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/costinsmilovici/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Contains the API key information that will be needed for querying the Twitter API
# The sample size for the number of tweets we will use
API_KEY_SEC = constants.API_KEY_SEC
API_KEY = constants.API_KEY

## Data
- https://ucsd.libguides.com/congress_twitter
- Twitter API
    - User lookup endpoint
    - Timelines endpoint

In [4]:
House = pd.read_csv("./Data/HouseTwitter.csv")
Senate = pd.read_csv("./Data/SenateTwitter.csv")

In [5]:
Politicians = House.append(Senate).reset_index(drop=True)
Politicians.head()

Unnamed: 0,Name,Link,State,Party
0,"Adams, Alma",https://twitter.com/RepAdams,NC,D
1,"Aderholt, Robert",https://twitter.com/Robert_Aderholt,AL,R
2,"Aguilar, Pete",https://twitter.com/RepPeteAguilar,CA,D
3,"Allen, Rick",https://twitter.com/RepRickAllen,GA,R
4,"Allred, Colin",https://twitter.com/RepColinAllred,TX,D


In [6]:
Politicians["Usernames"] = Politicians["Link"].str.replace("https://twitter.com/","",regex=False)

In [7]:
Politicians = Politicians.dropna(subset=["Usernames"])

In [8]:
Politicians = Politicians[Politicians["Party"] != "I"]

In [9]:
usernames = Politicians["Usernames"]

In [11]:
search_arr = []
for i in range((len(usernames) // 100)+1):
    l = i*100
    r = min(len(usernames),(i+1)*100)
    search_arr += [",".join(usernames[l:r].str.strip())]

This code is Twitter's authentication code for python so it can largely be ignored

**Source: https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/main/Tweet-Lookup/get_tweets_with_user_context.py**

In [12]:
# To set your enviornment variables in your terminal run the following line:
consumer_key = API_KEY
consumer_secret = API_KEY_SEC

request_token_url = "https://api.twitter.com/oauth/request_token"
oauth = OAuth1Session(consumer_key, client_secret=consumer_secret)

try:
    fetch_response = oauth.fetch_request_token(request_token_url)
except ValueError:
    print(
        "There may have been an issue with the consumer_key or consumer_secret you entered."
    )

resource_owner_key = fetch_response.get("oauth_token")
resource_owner_secret = fetch_response.get("oauth_token_secret")
print("Got OAuth token: %s" % resource_owner_key)

# Get authorization
base_authorization_url = "https://api.twitter.com/oauth/authorize"
authorization_url = oauth.authorization_url(base_authorization_url)
print("Please go here and authorize: %s" % authorization_url)
verifier = input("Paste the PIN here: ")

# Get the access token
access_token_url = "https://api.twitter.com/oauth/access_token"
oauth = OAuth1Session(
    consumer_key,
    client_secret=consumer_secret,
    resource_owner_key=resource_owner_key,
    resource_owner_secret=resource_owner_secret,
    verifier=verifier,
)
oauth_tokens = oauth.fetch_access_token(access_token_url)


access_token = oauth_tokens["oauth_token"]
access_token_secret = oauth_tokens["oauth_token_secret"]

# Make the request
oauth = OAuth1Session(
    consumer_key,
    client_secret=consumer_secret,
    resource_owner_key=access_token,
    resource_owner_secret=access_token_secret,
)

Got OAuth token: iizCcAAAAAABcJlTAAABgjxIvBs
Please go here and authorize: https://api.twitter.com/oauth/authorize?oauth_token=iizCcAAAAAABcJlTAAABgjxIvBs
Paste the PIN here: 9739639


In [13]:
API_UIDs = []

for users in search_arr:
    params = {"usernames": users, "user.fields": "username,id"}
    response = oauth.get(
        "https://api.twitter.com/2/users/by", params=params
    )
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(response.status_code, response.text)
        )
    json_response = response.json()
    dataframed_response = pd.DataFrame(json_response["data"])[["id","username"]]
    API_UIDs += [dataframed_response]

In [14]:
uid_df = pd.concat(API_UIDs)

In [15]:
Politicians = Politicians.merge(uid_df,left_on="Usernames",right_on="username").drop(columns="username")

In [16]:
Politicians

Unnamed: 0,Name,Link,State,Party,Usernames,id
0,"Adams, Alma",https://twitter.com/RepAdams,NC,D,RepAdams,2916086925
1,"Aderholt, Robert",https://twitter.com/Robert_Aderholt,AL,R,Robert_Aderholt,76452765
2,"Aguilar, Pete",https://twitter.com/RepPeteAguilar,CA,D,RepPeteAguilar,3018670151
3,"Allen, Rick",https://twitter.com/RepRickAllen,GA,R,RepRickAllen,2964287128
4,"Allred, Colin",https://twitter.com/RepColinAllred,TX,D,RepColinAllred,1078355119920562176
...,...,...,...,...,...,...
521,"Warren, Elizabeth",https://twitter.com/SenWarren,MA,D,SenWarren,970207298
522,"Whitehouse, Sheldon",https://twitter.com/SenWhitehouse,RI,D,SenWhitehouse,242555999
523,"Wicker, Roger F.",https://twitter.com/SenatorWicker,MS,R,SenatorWicker,264219447
524,"Wyden, Ron",https://twitter.com/RonWyden,OR,D,RonWyden,250188760


In [17]:
API_tweets = []
for user_id in Politicians["id"]:
    params = {"tweet.fields": "author_id,text", "max_results": 100}
    response = oauth.get(
        f"https://api.twitter.com/2/users/{user_id}/tweets", params=params
    )
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(response.status_code, response.text)
        )
    json_response = response.json()
    if json_response.get("data",False):
        dataframed_response = pd.DataFrame(json_response["data"])
        API_tweets += [dataframed_response]

In [18]:
poli_tweets = pd.concat(API_tweets).reset_index(drop=True)

In [19]:
poli_tweets = poli_tweets.drop_duplicates(subset="text")

In [20]:
poli_tweets = poli_tweets[["author_id","text"]]

In [21]:
right = Politicians[["id","Party"]]

In [239]:
party_tweets = poli_tweets.merge(right,left_on="author_id",right_on="id").drop(columns=["id","author_id"])

In [None]:
party_tweets.to_csv("./Data/ClassifierData.csv")

## Modeling

In [2]:
party_tweets = pd.read_csv("./Data/ClassifierData.csv")

In [3]:
party_tweets["text"] = (
    party_tweets["text"]
        .str.replace("^RT @.*:","",regex=True)
        .str.replace(" https://t.*$","",regex=True)
        .str.strip()
)

In [4]:
def remove_noise(tweet_tokens, stop_words = stop_words):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [5]:
def tokenize(raw_str):
    tokenizer = TweetTokenizer()
    tokens = remove_noise(tokenizer.tokenize(raw_str))
    return tokens

In [6]:
tfidf = TfidfVectorizer(sublinear_tf = True,)

In [6]:
X = party_tweets["text"]
y = party_tweets["Party"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

In [17]:
pipe = Pipeline([('tfidf', tfidf), ('svc', SVC(kernel='linear'))])
param_grid = {
    "tfidf__min_df": np.round(np.linspace(1,20,4)).astype(int),
    "tfidf__max_df": np.linspace(.6,1,3),
}

search = GridSearchCV(pipe, param_grid, n_jobs=2, cv=2)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.806):
{'tfidf__max_df': 0.6, 'tfidf__min_df': 1}


In [12]:
tfidf = TfidfVectorizer(sublinear_tf = True,max_df=0.6,min_df=1,tokenizer=tokenize)
pipe = Pipeline([('tfidf', tfidf), ('svc', SVC(kernel='linear'))])
pipe.fit(X,y)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.6, sublinear_tf=True,
                                 tokenizer=<function tokenize at 0x7fddc4c94280>)),
                ('svc', SVC(kernel='linear'))])

In [24]:
joblib.dump(pipe, './Models/fitted_svm.pkl')

['./Models/fitted_svm.pkl']

In [26]:
my_model = joblib.load('./Models/fitted_svm.pkl')

In [16]:
birdwatch_tweets = pd.read_csv("./Data/birdwatch_tweets.csv")

In [20]:
birdwatch_tweets["predicted_label"] = my_model.predict(birdwatch_tweets["text"])

In [23]:
birdwatch_tweets["predicted_label"].value_counts(normalize=True)

R    0.541872
D    0.458128
Name: predicted_label, dtype: float64

In [28]:
birdwatch_tweets.to_csv("./Data/birdwatch_tweets_with_labels.csv")