In [1]:
from sklearn.manifold import TSNE
from collections import Counter
from six.moves import cPickle
from gensim.models import Word2Vec
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import io
import re
import json
import pickle
import pandas as pd
import gensim
import glob

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import nltk

### Pre-processing function

In [2]:
# Punctuation treated as a token separator; each match becomes a single space.
# Raw string on purpose: '\[', '\]' and '\|' are invalid escape sequences in a
# normal string literal and trigger a warning on current Python versions.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is not a digit, a lowercase letter, a space, '#', '+'
# or '_' is deleted outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK English stopword list, held as a set for O(1) membership tests.
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """
    Normalise a raw document for bag-of-words style features.

    Steps, in order: lowercase, replace separator punctuation with spaces,
    delete all remaining disallowed symbols, drop English stopwords.

    text: a string

    return: the cleaned string, tokens joined by single spaces
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # split() with no argument also collapses the runs of whitespace created above.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)


#### In the convote data, Democrats are labeled 0, matching the liberal label (0) from IBC, and Republicans are labeled 1, matching the conservative label (1) from IBC.
The assumption behind this method is that all sentences from Democratic speakers are liberal and all sentences from Republican speakers reflect conservative ideology.

### Get convote data

In [3]:
# Glob patterns for the convote speech files (relative to the notebook's working directory).
convote_train_files_path = 'data_stage_one/training_set/*.txt'
convote_test_files_path = 'data_stage_one/test_set/*.txt'
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes the
# row order of everything derived from these lists reproducible across machines.
convote_train_files = sorted(glob.glob(convote_train_files_path))
convote_test_files = sorted(glob.glob(convote_test_files_path))
# Maps a split name to its list of files; the part after '_' is later used as the group tag.
filepath_dict = {'convote_train': convote_train_files,
                 'convote_test': convote_test_files}

In [4]:
# Read every convote file into a list of rows:
#   [raw text, party letter, sample group, numeric party label]
convote_data = []

for data_type, filenames in filepath_dict.items():
    # 'convote_train' -> 'train', 'convote_test' -> 'test'
    sample_group = data_type.split('_')[-1]
    for filename in filenames:
        # Context manager guarantees the handle is closed even if read() raises.
        with open(filename, 'r') as f:
            f_text = f.read()
        # The party letter is the first character of the last '_'-separated chunk of the path.
        party = filename.split('_')[-1][0]
        # NOTE(review): any party letter other than 'D' is labelled 1, so speakers who are
        # neither 'D' nor 'R' (if the data contains any) are counted as label 1 — confirm intended.
        review_label = 0 if party == 'D' else 1
        convote_data.append([f_text, party, sample_group, review_label])

In [5]:
# Turn the collected rows into a labelled frame (string index, named columns),
# then normalise the speech text with clean_text.
convote_columns = {0: 'text', 1: 'party', 2: 'group', 3: 'party_label'}
convote_data = pd.DataFrame(convote_data).rename(index=str, columns=convote_columns)
convote_data['text'] = convote_data['text'].apply(clean_text)

In [6]:
# Features: cleaned speech text. Targets: 0/1 party label.
X_convote = convote_data['text']
y_convote = convote_data['party_label']

In [7]:
# Inspect how many convote documents were loaded (one row per file read above).
X_convote.shape

(7419,)

### Get IBC data

In [8]:
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling — only load
# 'ibcData.pkl' when it comes from a trusted source.
# The context manager closes the file handle, which the bare open() call never did.
with open('ibcData.pkl', 'rb') as ibc_file:
    [lib, con, neutral] = pickle.load(ibc_file)

# Each element exposes get_words(); presumably it returns the sentence text — TODO confirm.
liberal = [tree.get_words() for tree in lib]
conservative = [tree.get_words() for tree in con]
neu = [tree.get_words() for tree in neutral]

# One single-column frame per class; liberal -> 0, conservative -> 1.
liberals = pd.DataFrame(liberal, columns=['text'])
liberals['label'] = 0
conservatives = pd.DataFrame(conservative, columns=['text'])
conservatives['label'] = 1
# Neutral samples get no label here.
neutrals = pd.DataFrame(neu, columns=['text'])

In [9]:
# Combine the liberal (0) and conservative (1) samples into one frame.
frames = [liberals, conservatives]
result = pd.concat(frames)
# Shuffle all rows. The fixed seed makes the shuffle — and therefore the row order of
# the saved prediction CSVs and the order-sensitive SGD fit — reproducible between runs.
result = result.sample(frac=1, random_state=42).reset_index(drop=True)
result['text'] = result['text'].apply(clean_text)

In [10]:
# Features: cleaned IBC text. Targets: 0 (liberal) / 1 (conservative).
X_ibc = result['text']
y_ibc = result['label']

In [11]:
# Inspect how many IBC samples (liberal + conservative rows) are available.
X_ibc.shape

(3726,)

## Implementing BOW+TFIDF features

### Multinomial Naive Bayes

#### Training on convote and testing on IBC

In [12]:
# Token counts -> TF-IDF weights -> multinomial Naive Bayes, fitted on convote.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(nb_steps)
nb.fit(X_convote, y_convote)

# Cross-corpus evaluation: score the convote-trained model on IBC.
y_pred_nb1 = nb.predict(X_ibc)
acc_nb1 = accuracy_score(y_ibc, y_pred_nb1)

print('accuracy %s' % acc_nb1)

accuracy 0.5697799248523886


#### Training on IBC and testing on convote

In [16]:
# Fresh counts -> TF-IDF -> multinomial Naive Bayes pipeline, this time fitted on IBC.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(nb_steps)
nb.fit(X_ibc, y_ibc)

# Cross-corpus evaluation: score the IBC-trained model on convote.
y_pred_nb2 = nb.predict(X_convote)
acc_nb2 = accuracy_score(y_convote, y_pred_nb2)

print('accuracy %s' % acc_nb2)

accuracy 0.5081547378352878


### Linear SVM

#### Training on convote and testing on IBC

In [13]:
# Linear SVM (hinge loss, L2 penalty) trained with SGD on TF-IDF weighted counts.
svm_clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
sgd1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm_clf),
])
sgd1.fit(X_convote, y_convote)

# Cross-corpus evaluation: trained on convote, scored on IBC.
y_pred_svm1 = sgd1.predict(X_ibc)
acc_svm1 = accuracy_score(y_ibc, y_pred_svm1)

print('accuracy %s' % acc_svm1)

accuracy 0.5517981749865808


#### Training on IBC and testing on convote

In [17]:
# Re-fit the existing SVM pipeline on IBC (this overwrites the convote-trained state),
# then score it on convote.
sgd1.fit(X_ibc, y_ibc)
y_pred_svm2 = sgd1.predict(X_convote)
acc_svm2 = accuracy_score(y_convote, y_pred_svm2)

print('accuracy %s' % acc_svm2)

accuracy 0.5061329020083569


### Logistic Regression

#### Training on convote and testing on IBC

In [14]:
# Logistic regression on TF-IDF weighted counts; C=1e5 means very weak regularisation.
logreg_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
]
logreg = Pipeline(logreg_steps)
logreg.fit(X_convote, y_convote)

# Cross-corpus evaluation: trained on convote, scored on IBC.
y_pred_logreg1 = logreg.predict(X_ibc)
acc_logreg1 = accuracy_score(y_ibc, y_pred_logreg1)

print('accuracy %s' % acc_logreg1)

accuracy 0.5260332796564681


#### Training on IBC and testing on convote

In [18]:
# Re-fit the existing logistic-regression pipeline on IBC (replacing the convote fit),
# then score it on convote.
logreg.fit(X_ibc, y_ibc)
y_pred_logreg2 = logreg.predict(X_convote)
acc_logreg2 = accuracy_score(y_convote, y_pred_logreg2)

print('accuracy %s' % acc_logreg2)

accuracy 0.5123331985442782


#### Saving predicted results when training on Convote and testing on IBC

In [15]:
# Put the three models' IBC predictions next to the true labels and write them to disk.
y_test = np.array(y_ibc)
bow_ibc_convote = pd.DataFrame({
    'MultiNB': y_pred_nb1,
    'SVM': y_pred_svm1,
    'Logreg': y_pred_logreg1,
    'yTrue': y_test,
})
bow_ibc_convote.to_csv('BOW+TFIDF_TrainConvote_TestIBC.csv')

#### Saving predicted results when training on IBC and testing on Convote

In [19]:
# Put the three models' convote predictions next to the true labels and write them to disk.
y_test = np.array(y_convote)
bow_ibc_convote = pd.DataFrame({
    'MultiNB': y_pred_nb2,
    'SVM': y_pred_svm2,
    'Logreg': y_pred_logreg2,
    'yTrue': y_test,
})
bow_ibc_convote.to_csv('BOW+TFIDF_TrainIBC_TestConvote.csv')

## Implementing only BOW features

### Training on convote and testing on IBC

In [20]:
# Unigram bag-of-words: alphanumeric tokens only, sklearn's English stopwords removed.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=token.tokenize,
)
# The vocabulary is learned on convote only; IBC is projected onto that vocabulary.
text_counts_convote = cv.fit_transform(X_convote)
text_counts_ibc = cv.transform(X_ibc)

#### Multinomial Naive Bayes

In [21]:
# Naive Bayes on raw counts: train on convote, evaluate on IBC.
clf = MultinomialNB()
clf.fit(text_counts_convote, y_convote)
predicted_nb = clf.predict(text_counts_ibc)
nb_accuracy = accuracy_score(y_ibc, predicted_nb)
print("MultinomialNB Accuracy:", nb_accuracy)

MultinomialNB Accuracy: 0.5759527643585615


#### Linear SVM

In [22]:
# Linear SVM (hinge loss via SGD) on raw counts: train on convote, evaluate on IBC.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf.fit(text_counts_convote, y_convote)
predicted_svm = clf.predict(text_counts_ibc)
svm_accuracy = accuracy_score(y_ibc, predicted_svm)
print("Linear SVM Accuracy:", svm_accuracy)

Linear SVM Accuracy: 0.5399892646269457


#### Logistic Regression

In [23]:
# Logistic regression on raw counts: train on convote, evaluate on IBC.
clf = LogisticRegression(n_jobs=1, C=1e5)
clf.fit(text_counts_convote, y_convote)
predicted_logreg = clf.predict(text_counts_ibc)
logreg_accuracy = accuracy_score(y_ibc, predicted_logreg)
print("Logistic Regression Accuracy:", logreg_accuracy)

Logistic Regression Accuracy: 0.509393451422437


#### Saving predicted results to csv

In [24]:
# Store the BOW-only IBC predictions of all three models alongside the true labels.
y_test = np.array(y_ibc)
bow_ibc_convote = pd.DataFrame({
    'MultiNB': predicted_nb,
    'SVM': predicted_svm,
    'Logreg': predicted_logreg,
    'yTrue': y_test,
})
bow_ibc_convote.to_csv('BOW_TrainConvote_TestIBC.csv')

### Training on IBC and testing on convote

In [None]:
# Same unigram bag-of-words setup, but the vocabulary is now learned on IBC
# and convote is projected onto it.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=token.tokenize,
)
text_counts_ibc = cv.fit_transform(X_ibc)
text_counts_convote = cv.transform(X_convote)

#### Multinomial Naive Bayes

In [None]:
# Naive Bayes on raw counts: train on IBC, evaluate on convote.
clf = MultinomialNB()
clf.fit(text_counts_ibc, y_ibc)
predicted_nb = clf.predict(text_counts_convote)
nb_accuracy = accuracy_score(y_convote, predicted_nb)
print("MultinomialNB Accuracy:", nb_accuracy)

#### Linear SVM

In [None]:
# Linear SVM (hinge loss via SGD) on raw counts: train on IBC, evaluate on convote.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf.fit(text_counts_ibc, y_ibc)
predicted_svm = clf.predict(text_counts_convote)
svm_accuracy = accuracy_score(y_convote, predicted_svm)
print("Linear SVM Accuracy:", svm_accuracy)

#### Logistic Regression

In [None]:
# Logistic regression on raw counts: train on IBC, evaluate on convote.
clf = LogisticRegression(n_jobs=1, C=1e5)
clf.fit(text_counts_ibc, y_ibc)
predicted_logreg = clf.predict(text_counts_convote)
logreg_accuracy = accuracy_score(y_convote, predicted_logreg)
print("Logistic Regression Accuracy:", logreg_accuracy)

#### Saving predicted results in csv file

In [None]:
# Store the BOW-only convote predictions of all three models alongside the true labels.
y_test = np.array(y_convote)
bow_ibc_convote = pd.DataFrame({
    'MultiNB': predicted_nb,
    'SVM': predicted_svm,
    'Logreg': predicted_logreg,
    'yTrue': y_test,
})
bow_ibc_convote.to_csv('BOW_TrainIBC_TestConvote.csv')

## Implementing only TFIDF features

### Training on convote and testing on IBC

In [None]:
# TF-IDF features with default settings: vocabulary and IDF weights are learned on
# convote, and IBC is transformed with that fitted vectorizer.
tf = TfidfVectorizer()
text_tf_convote = tf.fit_transform(X_convote)
text_tf_ibc = tf.transform(X_ibc)

#### Multinomial Naive Bayes

In [None]:
# Naive Bayes on TF-IDF features: train on convote, evaluate on IBC.
clf = MultinomialNB()
clf.fit(text_tf_convote, y_convote)
predicted_NB = clf.predict(text_tf_ibc)
nb_accuracy = accuracy_score(y_ibc, predicted_NB)
print("MultinomialNB Accuracy:", nb_accuracy)

#### Linear SVM

In [None]:
# Linear SVM (hinge loss via SGD) on TF-IDF features: train on convote, evaluate on IBC.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf.fit(text_tf_convote, y_convote)
predicted_SVM = clf.predict(text_tf_ibc)
svm_accuracy = accuracy_score(y_ibc, predicted_SVM)
print("Linear SVM Accuracy:", svm_accuracy)

#### Logistic Regression

In [None]:
# Logistic regression on TF-IDF features: train on convote, evaluate on IBC.
clf = LogisticRegression(n_jobs=1, C=1e5)
clf.fit(text_tf_convote, y_convote)
predicted_LOG = clf.predict(text_tf_ibc)
logreg_accuracy = accuracy_score(y_ibc, predicted_LOG)
print("Logistic Regression Accuracy:", logreg_accuracy)

#### Saving predicted results in csv file

In [None]:
# Store the TF-IDF-only IBC predictions of all three models alongside the true labels.
y_test = np.array(y_ibc)
bow_ibc_convote = pd.DataFrame({
    'MultiNB': predicted_NB,
    'SVM': predicted_SVM,
    'Logreg': predicted_LOG,
    'yTrue': y_test,
})
bow_ibc_convote.to_csv('TFIDF_TrainConvote_TestIBC.csv')

### Training on IBC and testing on Convote

In [None]:
# TF-IDF features with default settings: vocabulary and IDF weights are learned on
# IBC, and convote is transformed with that fitted vectorizer.
tf = TfidfVectorizer()
text_tf_ibc = tf.fit_transform(X_ibc)
text_tf_convote = tf.transform(X_convote)

#### Multinomial Naive Bayes

In [None]:
# Naive Bayes on TF-IDF features: train on IBC, evaluate on convote.
clf = MultinomialNB()
clf.fit(text_tf_ibc, y_ibc)
predicted_NB = clf.predict(text_tf_convote)
nb_accuracy = accuracy_score(y_convote, predicted_NB)
print("MultinomialNB Accuracy:", nb_accuracy)

#### Linear SVM

In [None]:
# Linear SVM (hinge loss via SGD) on TF-IDF features: train on IBC, evaluate on convote.
clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
clf.fit(text_tf_ibc, y_ibc)
predicted_SVM = clf.predict(text_tf_convote)
svm_accuracy = accuracy_score(y_convote, predicted_SVM)
print("Linear SVM Accuracy:", svm_accuracy)

#### Logistic Regression

In [None]:
# Logistic regression on TF-IDF features: train on IBC, evaluate on convote.
clf = LogisticRegression(n_jobs=1, C=1e5)
clf.fit(text_tf_ibc, y_ibc)
predicted_LOG = clf.predict(text_tf_convote)
logreg_accuracy = accuracy_score(y_convote, predicted_LOG)
print("Logistic Regression Accuracy:", logreg_accuracy)

#### Saving predicted results to csv file 

In [None]:
# Store the TF-IDF-only convote predictions of all three models alongside the true labels.
y_test = np.array(y_convote)
bow_ibc_convote = pd.DataFrame({
    'MultiNB': predicted_NB,
    'SVM': predicted_SVM,
    'Logreg': predicted_LOG,
    'yTrue': y_test,
})
bow_ibc_convote.to_csv('TFIDF_TrainIBC_TestConvote.csv')