In [6]:
import csv
import re
import string
from nltk import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [52]:
def clean_data(body):
    punctuations = string.punctuation + '—' + '’' + '…' + '‘' + '–' + '”' + '“'
    regex = re.compile('[%s]' % re.escape(punctuations))
    clean_text=''
    for sentence in sent_tokenize(body):
        sentence=regex.sub('', sentence)
        sentence=re.sub(r"\n", " ", sentence)
        sentence = re.sub(r"i'm", "i am", sentence)
        sentence = re.sub(r"he's", "he is", sentence)
        sentence = re.sub(r"she's", "she is", sentence)
        sentence = re.sub(r"it's", "it is", sentence)
        sentence = re.sub(r"that's", "that is", sentence)
        sentence = re.sub(r"what's", "what is", sentence)
        sentence = re.sub(r"where's", "where is", sentence)
        sentence = re.sub(r"how's", "how is", sentence)
        sentence = re.sub(r"\'ll", " will", sentence)
        sentence = re.sub(r"\'ve", " have", sentence)
        sentence = re.sub(r"\'re", " are", sentence)
        sentence = re.sub(r"\'d", " would", sentence)
        sentence = re.sub(r"\'re", " are", sentence)
        sentence = re.sub(r"won't", "will not", sentence)
        sentence = re.sub(r"can't", "cannot", sentence)
        sentence = re.sub(r"n't", " not", sentence)
        sentence = re.sub(r"n'", "ng", sentence)
        sentence = re.sub(r"'bout", "about", sentence)
        sentence = re.sub(r"'til", "until", sentence)
        sentence = re.sub(' +',' ',sentence)
        clean_text=clean_text+' '+sentence
    return clean_text
    
    
def clean_bodies(articles):
    clean_articles={}
    for i in articles:
        clean_articles[int(i)]=clean_data(articles[i])
    #print(clean_articles[0])
    clean_articles_list=[]
    for i in clean_articles:
        clean_articles_list.append(clean_articles[i])
    return(clean_articles_list)



def read_bodies(file_name):
    with open(file_name,encoding='latin1') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        articles={}
        for i in spamreader:
            articles[i[0]]=i[1]
    del articles['Body ID']
    clean_articles={}
    for i in articles:
        clean_articles[int(i)]=clean_data(articles[i])
    #print(clean_articles[0])
    clean_articles_list=[]
    for i in clean_articles:
        clean_articles_list.append(clean_articles[i])
    return clean_articles,clean_articles_list

def generate_vocab(list_articles):
    single_string=''
    for article in list_articles:
        single_string+=article
    stop_words = set(stopwords.words('english'))
    vocab=set(single_string.split())
    vocab = [w for w in vocab if not w in stop_words]
    return vocab

def read_title_stances(filename):
    with open(filename,encoding='latin1') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        stances={}
        title={}
        for i in spamreader:
            stances[i[1]]=i[2]
            title[i[1]]=i[0]
        del title['Body ID']
        del stances['Body ID']
        title = {int(k):v for k,v in title.items()}
        stances = {int(k):v for k,v in stances.items()}
    return title,stances

def generate_matrix(article_list,vocabulary):
    vectorizer = TfidfVectorizer(vocabulary=vocabulary)
    t=vectorizer.fit_transform(article_list)
    vect_articles=t.toarray()
    #print(vect_articles)
    return vect_articles

def convert_labels(article,stances,selected_stance):
    y_ids=[]
    for i in article:
        y_ids.append(i)
    stance_rearranged=[]
    for i in y_ids:
        stance_rearranged.append(stances[i])
    y_stance_label=[]
    for s in stance_rearranged:
        if s==selected_stance:
            y_stance_label.append(1)
        else :
            y_stance_label.append(0)
    return y_stance_label

def calculate_accuracy(predictions,labels):
    count=0
    for i in range(len(predictions)):
        if predictions[i]==labels[i]:
            count+=1
    #print(sum(predictions),'\n',sum(labels))
    return count/len(labels)

def order_stance_by_body(article,stances):
    y_ids=[]
    for i in article:
        y_ids.append(i)
    stance_rearranged=[]
    for i in y_ids:
        stance_rearranged.append(stances[i])
    return stance_rearranged

In [4]:
from sklearn.linear_model import LogisticRegression

articles_train,article_list_train = read_bodies('train_bodies.csv')
title,stances = read_title_stances('train_stances.csv')

vocab = generate_vocab(article_list_train)

vect_articles = generate_matrix(article_list=article_list_train,vocabulary=vocab)

y_stance_label=convert_labels(article=articles_train,selected_stance='unrelated',stances=stances)


clf = LogisticRegression(fit_intercept=True, C = 1e15)
clf.fit(vect_articles, y_stance_label)

print(clf.intercept_, clf.coef_)


[4.11851711] [[  0.           6.71696765   3.86215917 ...   0.           0.
  -16.18381799]]


In [40]:
test_article,test_article_list=read_bodies('competition_test_bodies.csv')
test_matrix=generate_matrix(article_list=test_article_list,vocabulary=vocab)
test_stances=read_title_stances(filename='competition_test_stances.csv')[1]
binary_test_stances=convert_labels(article=test_article,selected_stance='unrelated',stances=test_stances)
pred = clf.predict(X=test_matrix)
#print((test_stances),'\n\n',(pred))
print('Accuracy from sk-learn: {0}'.format(calculate_accuracy(labels = binary_test_stances , predictions = pred)))

Accuracy from sk-learn: 0.49889380530973454


In [27]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def logistic_regression(training_features, label, num_steps, learning_rate, add_intercept = False):
    if add_intercept:
        intercept = np.ones((training_features.shape[0], 1))
        training_features = np.hstack((intercept, training_features))
    
    weights = np.zeros(training_features.shape[1])
    
    for step in range(num_steps):
        scores = np.dot(training_features, weights)
        predictions = sigmoid(scores)

        # Update weights with gradient
        output_error = label - predictions
        gradient = np.dot(training_features.T, output_error)
        weights += learning_rate * gradient
        
    return weights
def calculate_results(x,weight,intercept):
    #data_with_intercept = np.hstack((np.ones((x.shape[0], 1)), x))                                
    final_scores = np.dot(x, np.transpose(weight))+intercept
    return final_scores

In [13]:

articles_train,article_list_train = read_bodies('train_bodies.csv')
title,stances = read_title_stances('train_stances.csv')

vocab = generate_vocab(article_list_train)

vect_articles = generate_matrix(article_list=article_list_train,vocabulary=vocab)

def calulate_weights(selected_stance,articles_train,stances,vect_articles):

    y_stance_label=convert_labels(article=articles_train,selected_stance = selected_stance, stances=stances)

    #weights = logistic_regression(training_features=vect_articles,label=y_stance_label,
    #                     num_steps = 4000, learning_rate = 5e-6, add_intercept=True)
    clf = LogisticRegression(fit_intercept=True, C = 1e15)
    clf.fit(vect_articles, y_stance_label)
    return(clf.intercept_, clf.coef_)

    #return weights

In [32]:
intercept_unrelated,w_unrelated=calulate_weights(selected_stance='unrelated',articles_train=articles_train,stances=stances,vect_articles=vect_articles)
intercept_agree,w_agree=calulate_weights(selected_stance='agree',articles_train=articles_train,stances=stances,vect_articles=vect_articles)
intercept_disagree,w_disagree=calulate_weights(selected_stance='disagree',articles_train=articles_train,stances=stances,vect_articles=vect_articles)
intercept_discuss,w_discuss=calulate_weights(selected_stance='discuss',articles_train=articles_train,stances=stances,vect_articles=vect_articles)


In [35]:
score_unrelated=calculate_results(weight=w_unrelated,x=test_matrix,intercept=intercept_unrelated)
score_agree=calculate_results(weight=w_agree,x=test_matrix,intercept=intercept_agree)
score_disagree=calculate_results(weight=w_disagree,x=test_matrix,intercept=intercept_disagree)
score_discuss=calculate_results(weight=w_discuss,x=test_matrix,intercept=intercept_discuss)
print(type(score_unrelated))

<class 'numpy.ndarray'>


In [37]:
result=[]
for i in range(score_agree.size):
    m=max(score_unrelated[i],score_agree[i],score_disagree[i],score_discuss[i])
    if m==score_unrelated[i]:
        result.append('unrelated')
    elif m==score_agree[i]:
        result.append('agree')
    elif m==score_discuss[i]:
        result.append('discuss')
    else:
        result.append('disagree')
  

In [53]:
actual_stance=order_stance_by_body(article=test_article,stances=test_stances)
calculate_accuracy(labels=actual_stance,predictions=result)

0.39823008849557523