## Import Packages

In [1]:
from sklearn.manifold import TSNE
from collections import Counter
from six.moves import cPickle
from gensim.models import Word2Vec
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import io
import re
import json
import pickle
import pandas as pd
import gensim
import glob

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk

In [2]:
# Separator-like punctuation; every match is replaced by a single space.
# Written as a raw string: '\[', '\]' and '\|' are not valid string escapes,
# so the non-raw literal triggers an invalid-escape warning on recent Python
# versions. The compiled pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character that is not a digit, lowercase letter, space, '#', '+' or '_'
# is deleted outright.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# NLTK English stopword list (requires the NLTK 'stopwords' corpus to be
# available locally).
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """
    Normalize a raw document before vectorization.

    Steps, in order:
      1. lowercase the text (must happen before BAD_SYMBOLS_RE, which only
         keeps lowercase letters);
      2. replace characters matched by REPLACE_BY_SPACE_RE with a space;
      3. delete characters matched by BAD_SYMBOLS_RE;
      4. split on whitespace, drop tokens found in STOPWORDS and re-join the
         remaining tokens with single spaces.

    text: a string

    return: the cleaned string (may be empty)
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # Delete stopwords; split()/join also collapses runs of whitespace.
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text


#### In the Convote data, Democrats are labeled 0 (matching the liberal label 0 in IBC) and Republicans are labeled 1 (matching the conservative label 1 in IBC).
The assumption behind this method is that all sentences from Democratic speakers are liberal and all sentences from Republican speakers reflect conservative ideology. Note that the loading code below assigns label 1 to every file whose party code is not `D`.

### Get Convote data

In [14]:
# Glob patterns for the Convote stage-one speech files.
convote_train_files_path = 'data_stage_one/training_set/*.txt'
convote_test_files_path = 'data_stage_one/test_set/*.txt'

# glob.glob returns matches in arbitrary (filesystem-dependent) order;
# sorting makes the order of the loaded rows the same on every machine/run.
convote_train_files = sorted(glob.glob(convote_train_files_path))
convote_test_files = sorted(glob.glob(convote_test_files_path))

# Maps a '<corpus>_<split>' key to the list of files belonging to that split.
filepath_dict = {'convote_train': convote_train_files,
                 'convote_test': convote_test_files}

In [15]:
# One row per speech file: [raw text, party code, split name, numeric label].
convote_data = []

for data_type, filenames in filepath_dict.items():
    # 'convote_train' -> 'train', 'convote_test' -> 'test'
    sample_group = data_type.split('_')[-1]
    for filename in filenames:
        # The context manager closes the handle even if read() raises.
        with open(filename, 'r') as f:
            f_text = f.read()
        # First character after the last underscore of the path is taken as
        # the party code (presumably 'D' / 'R' per the Convote file naming
        # scheme -- TODO confirm against the dataset README).
        party = filename.split('_')[-1][0]
        # NOTE(review): every party code other than 'D' is labeled 1, so any
        # non-Democrat, non-Republican code would be counted as conservative.
        review_label = 0 if party == 'D' else 1
        convote_data.append([f_text, party, sample_group, review_label])

In [16]:
# Turn the list of rows into a frame, give the four positional columns
# readable names, convert the index labels to strings, and normalize every
# document with clean_text.
convote_data = (
    pd.DataFrame(convote_data)
    .rename(index=str, columns={0: 'text', 1: 'party', 2: 'group', 3: 'label'})
    .assign(text=lambda frame: frame['text'].apply(clean_text))
)

In [20]:
# Keep only the columns shared with the IBC frame so the two can be stacked.
convote = convote_data.reindex(columns=['text', 'label'])

In [22]:
# Sanity check: (rows, columns) of the two-column Convote frame.
convote.shape

(7419, 2)

### Get IBC data

In [8]:
# NOTE: unpickling can execute arbitrary code -- only load ibcData.pkl from a
# trusted source. The pickle holds three collections of tree objects
# (liberal, conservative, neutral); presumably the tree class shipped with the
# IBC data must be importable for the load to succeed -- TODO confirm.
# The context manager closes the file handle, which the bare
# pickle.load(open(...)) form never did.
with open('ibcData.pkl', 'rb') as ibc_file:
    [lib, con, neutral] = pickle.load(ibc_file)

# Extract the words of each tree via its get_words() method.
liberal = [tree.get_words() for tree in lib]
conservative = [tree.get_words() for tree in con]
neu = [tree.get_words() for tree in neutral]

# Label convention: 0 = liberal, 1 = conservative (neutral items get no frame).
liberals = pd.DataFrame(liberal, columns=['text'])
liberals['label'] = 0
conservatives = pd.DataFrame(conservative, columns=['text'])
conservatives['label'] = 1

In [9]:
# Stack the liberal and conservative IBC frames into one labeled frame.
frames = [liberals, conservatives]
result = pd.concat(frames)
# Shuffle the rows. The shuffle is seeded so that re-running the notebook
# yields the same row order (an unseeded sample(frac=1) differs on every run);
# 42 is the seed already used for train_test_split in this notebook.
result = result.sample(frac=1, random_state=42).reset_index(drop=True)
# Apply the same text normalization as for the Convote documents.
result['text'] = result['text'].apply(clean_text)

In [10]:
# Sanity check: (rows, columns) of the combined liberal + conservative IBC frame.
result.shape

(3726, 2)

In [23]:
# Combine the Convote and IBC frames into a single corpus.
frames = [convote, result]
all_data = pd.concat(frames)
# Seeded shuffle: train_test_split below is seeded, but it only produces the
# same split when the row order it receives is itself reproducible. An
# unseeded sample(frac=1) here made every run evaluate on a different split.
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [25]:
# Sanity check: (rows, columns) of the merged Convote + IBC corpus.
all_data.shape

(11145, 2)

In [26]:
# Hold out 15% of the corpus for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    all_data['text'],
    all_data['label'],
    test_size=0.15,
    random_state=42,
)

### Implementing BOW+TFIDF features

#### Multinomial Naive Bayes

In [27]:
# Token counts -> TF-IDF reweighting -> multinomial Naive Bayes.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(nb_steps)
nb.fit(X_train, y_train)

# Score on the held-out split.
y_pred_nb = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)

print('accuracy %s' % nb_accuracy)

accuracy 0.6919856459330144


#### Linear SVM

In [28]:
# Linear SVM trained with SGD (hinge loss, L2 penalty); tol=None with
# max_iter=5 means exactly five passes over the training data.
svm_clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
# Token counts -> TF-IDF reweighting -> linear SVM.
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm_clf),
])
sgd.fit(X_train, y_train)

# Score on the held-out split.
y_pred_svm = sgd.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

print('accuracy %s' % svm_accuracy)

accuracy 0.6746411483253588


#### Logistic Regression

In [29]:
# Token counts -> TF-IDF reweighting -> logistic regression.
# C=1e5 is a very weak regularization setting (C is the inverse strength).
logreg_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
]
logreg = Pipeline(logreg_steps)
logreg.fit(X_train, y_train)

# Score on the held-out split.
y_pred_logreg = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)

print('accuracy %s' % logreg_accuracy)

accuracy 0.6614832535885168


#### Saving predicted results to csv

In [30]:
# Convert the true labels to a plain array so they line up positionally with
# the prediction arrays (no index alignment).
y_test = np.array(y_test)

# One column per BOW+TF-IDF model plus the ground truth, one row per test document.
bow_tfidf_ibc = pd.DataFrame({
    'MultiNB': y_pred_nb,
    'SVM': y_pred_svm,
    'Logreg': y_pred_logreg,
    'yTrue': y_test,
})
bow_tfidf_ibc.to_csv('BOW+TF-IDF_all_data.csv')

### Implementing only BOW features

In [31]:
# Tokens are maximal runs of ASCII letters and digits.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# Plain unigram counts, using the custom tokenizer and scikit-learn's built-in
# English stop-word list.
cv = CountVectorizer(
    tokenizer=token.tokenize,
    ngram_range=(1, 1),
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary on the training split only, then reuse it for the test split.
train = cv.fit_transform(X_train)
test = cv.transform(X_test)

#### Multinomial Naive Bayes

In [32]:
# Multinomial Naive Bayes on the raw count features.
clf = MultinomialNB()
clf.fit(train, y_train)

predicted_nb = clf.predict(test)
print("MultinomialNB Accuracy:", accuracy_score(y_test, predicted_nb))

MultinomialNB Accuracy: 0.6716507177033493


#### Linear SVM

In [33]:
# Linear SVM (hinge loss, L2 penalty) on the raw count features; tol=None
# with max_iter=5 means exactly five passes over the training data.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf.fit(train, y_train)

predicted_svm = clf.predict(test)
print("Linear SVM Accuracy:", accuracy_score(y_test, predicted_svm))

Linear SVM Accuracy: 0.6794258373205742


#### Logistic Regression

In [34]:
# Logistic regression on the raw count features (C=1e5: very weak regularization).
clf = LogisticRegression(n_jobs=1, C=1e5)
clf.fit(train, y_train)

predicted_logreg = clf.predict(test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, predicted_logreg))

Logistic Regression Accuracy: 0.638755980861244


#### Saving predicted results to csv

In [35]:
# Convert the true labels to a plain array so they line up positionally with
# the prediction arrays (no index alignment).
y_test = np.array(y_test)

# One column per BOW-only model plus the ground truth, one row per test document.
bow_all = pd.DataFrame({
    'MultiNB': predicted_nb,
    'SVM': predicted_svm,
    'Logreg': predicted_logreg,
    'yTrue': y_test,
})
bow_all.to_csv('BOW_all_data.csv')

### Implementing only TF-IDF features

In [36]:
# TF-IDF features with default settings. `train` / `test` are overwritten
# here, replacing the count matrices built in the BOW section.
# The accuracies recorded below are identical to those of the
# CountVectorizer + TfidfTransformer pipelines earlier in this notebook.
tf = TfidfVectorizer()
# Fit on the training split first, then project the test split.
train, test = tf.fit_transform(X_train), tf.transform(X_test)

#### Naive Bayes

In [37]:
# Multinomial Naive Bayes on the TF-IDF features.
clf = MultinomialNB()
clf.fit(train, y_train)

predicted_NB = clf.predict(test)
print("MultinomialNB Accuracy:", accuracy_score(y_test, predicted_NB))

MultinomialNB Accuracy: 0.6919856459330144


#### Linear SVM

In [38]:
# Linear SVM (hinge loss, L2 penalty) on the TF-IDF features; tol=None with
# max_iter=5 means exactly five passes over the training data.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf.fit(train, y_train)

predicted_SVM = clf.predict(test)
print("Linear SVM Accuracy:", accuracy_score(y_test, predicted_SVM))

Linear SVM Accuracy: 0.6746411483253588


#### Logistic Regression

In [39]:
# Logistic regression on the TF-IDF features (C=1e5: very weak regularization).
clf = LogisticRegression(n_jobs=1, C=1e5)
clf.fit(train, y_train)

predicted_LOG = clf.predict(test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, predicted_LOG))

Logistic Regression Accuracy: 0.6614832535885168


#### Saving predicted results as csv

In [40]:
# Convert the true labels to a plain array so they line up positionally with
# the prediction arrays (no index alignment).
y_test = np.array(y_test)

# One column per TF-IDF-only model plus the ground truth, one row per test document.
tfidf_all = pd.DataFrame({
    'MultiNB': predicted_NB,
    'SVM': predicted_SVM,
    'Logreg': predicted_LOG,
    'yTrue': y_test,
})
tfidf_all.to_csv('TF-IDF_all_data.csv')