In [2]:
# Imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

In [1]:
from nltk import word_tokenize
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

# Methods for POS-tagging and lemmatization of the data

def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into the matching WordNet POS constant.

    Only the leading letter of the tag is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Returns:
        The corresponding ``wordnet`` constant, or the empty string ``''``
        when the tag starts with none of those letters.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # No WordNet counterpart for this tag
    return ''

# Shared WordNet lemmatizer instance; PosTagAndLmtz below calls it for every token.
lemmatizer = WordNetLemmatizer()

def PosTagAndLmtz(sentence):
    """Tokenize, POS-tag and lemmatize a sentence.

    Each token is tagged with ``nltk.pos_tag``; the tag is mapped to a WordNet
    part of speech with ``get_wordnet_pos`` and the token is lemmatized with
    the module-level ``lemmatizer``.

    Tokens whose tag has no WordNet counterpart (``get_wordnet_pos`` returns
    ``''``) are kept unchanged, as are tokens for which the lemmatizer raises
    an ordinary exception.

    Args:
        sentence: The text to process.

    Returns:
        The processed tokens, each followed by a single space (so a non-empty
        result ends with a trailing space; an input without tokens gives ``""``).
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))

    lemmas = []
    for word, treebank_tag in tagged_tokens:
        wn_pos = get_wordnet_pos(treebank_tag)
        if not wn_pos:
            # No WordNet part of speech for this tag: keep the token as written
            # instead of provoking (and swallowing) a lemmatizer error.
            lemmas.append(word)
            continue
        try:
            lemmas.append(lemmatizer.lemmatize(word, pos=wn_pos))
        except Exception:
            # Best-effort fallback. Deliberately not a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop a long-running apply().
            lemmas.append(word)

    # Join once at the end; the per-token trailing space is part of the
    # established output format.
    return "".join(word + " " for word in lemmas)



In [3]:
# Load the tweets from the comma-separated file (',' is read_csv's default separator)
df = pd.read_csv('ExtractedTweets.csv')

In [4]:
# Remove every row that has a null in any column
df = df.dropna(axis=0, how='any')

In [5]:
# The Handle column is not used as a feature, so discard it
df = df.drop(labels=['Handle'], axis='columns')

In [None]:
# Replace each tweet with its POS-tagged, lemmatized form
df['Tweet'] = df['Tweet'].map(PosTagAndLmtz)


In [6]:
# Replace democrat/republican with 0/1
n = {"Democrat":0, "Republican":1}
# Cast explicitly: relying on replace() to silently downcast the object column
# to integers is deprecated in recent pandas, and the classifier needs a
# numeric label array. The cast also fails loudly if an unexpected party label
# is left unmapped.
df = df.replace({"Party":n}).astype({"Party": "int64"})

In [143]:
# Bag of words model
vectorizer = CountVectorizer(max_features=40000, max_df=0.9, min_df=2)
X = vectorizer.fit_transform(df["Tweet"])

In [144]:
# Normalize the bag of words vectors with TFIDF
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X)

In [148]:
X_train, X_test, y_train, y_test = train_test_split(X, df["Party"].to_numpy(), test_size=0.2, random_state=42)

In [149]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

clf = LinearSVC()

# Fit on the sparse TF-IDF matrix directly: LinearSVC accepts scipy sparse
# input, so converting with .toarray() only costs memory (one dense float per
# document/vocabulary pair).
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [150]:
# Mean accuracy on the test split; the sparse matrix is passed as-is (no
# dense copy needed), consistent with the clf.predict(X_test) call below.
clf.score(X_test, y_test)

0.7742308582003239

In [151]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict once on the test split, then print each metric in turn:
# confusion matrix, per-class report, overall accuracy.
p = clf.predict(X_test)
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, p))

[[6421 2084]
 [1820 6967]]
              precision    recall  f1-score   support

           0       0.78      0.75      0.77      8505
           1       0.77      0.79      0.78      8787

   micro avg       0.77      0.77      0.77     17292
   macro avg       0.77      0.77      0.77     17292
weighted avg       0.77      0.77      0.77     17292

0.7742308582003239


In [175]:
# To make a prediction, put the text of a political tweet in the string below.
toPred = ""

# Run the text through the same steps as the training tweets:
# lemmatize -> bag of words counts -> TF-IDF weights.
toPred = PosTagAndLmtz(toPred)
XP = tfidfconverter.transform(vectorizer.transform([toPred]))

# Predicted label: 0 democrat, 1 republican
print(clf.predict(XP))

[0]
