In [1]:
import csv
import re
import string
from nltk import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.stem import WordNetLemmatizer

In [32]:
#function for cleaning a article

def clean_data(body):
    punctuations = string.punctuation + '—' + '’' + '…' + '‘' + '–' + '”' + '“'
    regex = re.compile('[%s]' % re.escape(punctuations))
    clean_text=''
    for sentence in sent_tokenize(body):
        sentence=regex.sub('', sentence)
        sentence=re.sub(r"\n", " ", sentence)
        sentence = re.sub(r"i'm", "i am", sentence)
        sentence = re.sub(r"he's", "he is", sentence)
        sentence = re.sub(r"she's", "she is", sentence)
        sentence = re.sub(r"it's", "it is", sentence)
        sentence = re.sub(r"that's", "that is", sentence)
        sentence = re.sub(r"what's", "what is", sentence)
        sentence = re.sub(r"where's", "where is", sentence)
        sentence = re.sub(r"how's", "how is", sentence)
        sentence = re.sub(r"\'ll", " will", sentence)
        sentence = re.sub(r"\'ve", " have", sentence)
        sentence = re.sub(r"\'re", " are", sentence)
        sentence = re.sub(r"\'d", " would", sentence)
        sentence = re.sub(r"\'re", " are", sentence)
        sentence = re.sub(r"won't", "will not", sentence)
        sentence = re.sub(r"can't", "cannot", sentence)
        sentence = re.sub(r"n't", " not", sentence)
        sentence = re.sub(r"n'", "ng", sentence)
        sentence = re.sub(r"'bout", "about", sentence)
        sentence = re.sub(r"'til", "until", sentence)
        sentence = re.sub(' +',' ',sentence)
        clean_text=clean_text+' '+sentence
    return clean_text
    
#clean all articles
def clean_bodies(articles):
    clean_articles={}
    for i in articles:
        clean_articles[int(i)]=clean_data(articles[i])
    #print(clean_articles[0])
    clean_articles_list=[]
    for i in clean_articles:
        clean_articles_list.append(clean_articles[i])
    return(clean_articles_list)


#read the articles

def read_bodies(file_name):
    with open(file_name,encoding='latin1') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        articles={}
        for i in spamreader:
            articles[i[0]]=i[1]
    del articles['Body ID']
    clean_articles={}
    for i in articles:
        clean_articles[int(i)]=clean_data(articles[i])
    #print(clean_articles[0])
    clean_articles_list=[]
    for i in clean_articles:
        clean_articles_list.append(clean_articles[i])
    return clean_articles,clean_articles_list

#create vocabulary

def generate_vocab(list_articles):
    single_string=''
    for article in list_articles:
        single_string=single_string + ' ' + article
    stop_words = set(stopwords.words('english'))
    vocab=set(single_string.split())
    
    vocab = [w for w in vocab if not w in stop_words]
   
    wordnet_lemmatizer = WordNetLemmatizer()
    new_vocab=[]
    for w in vocab:
        new_vocab.append(wordnet_lemmatizer.lemmatize(w))
    new_vocab=set(new_vocab)
    return vocab

#read title and stances into dictionaries
def read_title_stances(filename):
    with open(filename,encoding='latin1') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        stances={}
        title={}
        for i in spamreader:
            stances[i[1]]=i[2]
            title[i[1]]=i[0]
        del title['Body ID']
        del stances['Body ID']
        title = {int(k):v for k,v in title.items()}
        stances = {int(k):v for k,v in stances.items()}
    return title,stances

#create tf-idf matrix of features
def generate_matrix(article_list,vocabulary):
    vectorizer = TfidfVectorizer(vocabulary=vocabulary)
    t=vectorizer.fit_transform(article_list)
    vect_articles=t.toarray()
    #print(vect_articles)
    return vect_articles

#convert binary lables- 1 for selected stance 0 for rest
def convert_labels(article,stances,selected_stance):
    y_ids=[]
    for i in article:
        y_ids.append(i)
    stance_rearranged=[]
    for i in y_ids:
        stance_rearranged.append(stances[i])
    y_stance_label=[]
    for s in stance_rearranged:
        if s==selected_stance:
            y_stance_label.append(1)
        else :
            y_stance_label.append(0)
    return y_stance_label

#calculate accuracy of prediction
def calculate_accuracy(predictions,labels):
    count=0
    for i in range(len(predictions)):
        if predictions[i]==labels[i]:
            count+=1
    #print(sum(predictions),'\n',sum(labels))
    return count/len(labels)

#reorder stances by article bodies
def order_stance_by_body(article,stances):
    y_ids=[]
    for i in article:
        y_ids.append(i)
    stance_rearranged=[]
    for i in y_ids:
        stance_rearranged.append(stances[i])
    return stance_rearranged

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def logistic_regression(training_features, label, num_steps, learning_rate,c, add_intercept = False):
    if add_intercept:
        intercept = np.ones((training_features.shape[0], 1))
        training_features = np.hstack((intercept, training_features))
    
    weights = np.zeros(training_features.shape[1])
    #c=100
    for step in range(num_steps):
        scores = np.dot(training_features, weights)
        predictions = sigmoid(scores)

        # Update weights with gradient
        output_error = label - predictions
        gradient = np.dot(training_features.T, output_error)+ (2*c*weights)
        weights += learning_rate * gradient
        
    return weights[0],weights[1:]

#calculate final scores for prediction
def calculate_results(x,weight,intercept):
    #data_with_intercept = np.hstack((np.ones((x.shape[0], 1)), x))                                
    final_scores = np.dot(x, np.transpose(weight))+intercept
    predictions=[]
    for score in final_scores: 
        predictions.append(sigmoid(score))
    #final_scores.sigmoid()
    return predictions

#calculate weights and intercept
def calulate_weights(selected_stance,articles_train,stances,vect_articles):

    y_stance_label=convert_labels(article=articles_train,selected_stance = selected_stance, stances=stances)

    intercept , weights = logistic_regression(training_features=vect_articles,label=y_stance_label,
                         num_steps = 4000, learning_rate = 5e-6, add_intercept=True , c=100)
    
    return intercept , weights

#order the titles by article bodies
def order_title_by_body(article,title):
    y_ids=[]
    for i in article:
        y_ids.append(i)
    title_rearranged=[]
    for i in y_ids:
        title_rearranged.append(title[i])
    return title_rearranged

#append the title to the article body
def append_title(article_list,title):
    for i in range(len(article_list)):
        article_list[i] = article_list[i]+' '+ title[i]
    return article_list
        

In [30]:
articles_train , article_list_train = read_bodies('train_bodies.csv')

title_train , stances_train = read_title_stances('train_stances.csv')

vocab_train = generate_vocab(article_list_train)

title_train_rearranged = order_title_by_body(article = articles_train , title = title_train)

article_list_train = append_title(article_list = article_list_train , title = title_train_rearranged )

train_matrix = generate_matrix(article_list=article_list_train,vocabulary=vocab_train)


In [31]:
articles_test , article_list_test = read_bodies('competition_test_bodies.csv')

title_test , stances_test =read_title_stances(filename='competition_test_stances.csv')

title_test_rearranged = order_title_by_body(article = articles_test , title = title_test )

article_list_test = append_title(article_list = article_list_test , title = title_test_rearranged )

test_matrix=generate_matrix(article_list = article_list_test , vocabulary=vocab_train)



In [7]:
#result using sklearn 
from sklearn.linear_model import LogisticRegression

test_s=order_stance_by_body(article= articles_test , stances = stances_test)

y_s=order_stance_by_body(article=articles_train,stances=stances_train)

clf = LogisticRegression(fit_intercept=True, C = 100,multi_class = 'ovr')

clf.fit(train_matrix, y_s)

pred = clf.predict(X=test_matrix)

print(clf.intercept_, clf.coef_)

print('Accuracy from sk-learn: {0}'.format(calculate_accuracy(labels = test_s , predictions = pred)))

[ 0.13752231 -5.31850756 -0.59484479] [[ 0.          0.74983346  0.         ...  0.         -0.46739639
  -0.79210826]
 [ 0.         -0.00474117  0.         ...  0.         -0.02063156
   0.14040751]
 [ 0.         -0.7749598   0.         ...  0.          0.51205094
   0.61615012]]
Accuracy from sk-learn: 0.6603982300884956


In [33]:

intercept_agree , w_agree = calulate_weights(selected_stance = 'agree' , articles_train = articles_train , 
                                         stances = stances_train , vect_articles = train_matrix )

intercept_disagree , w_disagree = calulate_weights(selected_stance = 'disagree', articles_train=articles_train , 
                                                   stances = stances_train , vect_articles = train_matrix )

intercept_discuss , w_discuss = calulate_weights(selected_stance = 'discuss', articles_train = articles_train ,
                                                  stances = stances_train , vect_articles = train_matrix )


In [34]:

score_agree = calculate_results( weight=w_agree , x=test_matrix , intercept = intercept_agree )

score_disagree = calculate_results( weight=w_disagree , x=test_matrix , intercept=intercept_disagree )

score_discuss=calculate_results(weight = w_discuss , x = test_matrix , intercept = intercept_discuss )
#print((score_unrelated))


In [35]:
result=[]

for i in range(len(score_agree)):

    m=max(score_agree[i],score_disagree[i],score_discuss[i])
    
    if m==score_agree[i]:
        result.append('agree')
    
    elif m==score_discuss[i]:
        result.append('discuss')
    
    else:
        result.append('disagree')
  

In [36]:
#result using logistic regression function
actual_stance = order_stance_by_body( article = articles_test , stances = stances_test)

acc=calculate_accuracy( labels = actual_stance , predictions = result ) 

print('Final Accuracy with 1 vs all Logistic Regression = ',acc)

Final Accuracy with 1 vs all Logistic Regression =  0.5995575221238938


In [6]:
print(len(vocab_train))

33795


In [22]:
from sklearn import metrics, cross_validation
mult = 1
C = 1e-15
list_acc={}
while C * mult <= 1e10:
    clf = LogisticRegression(fit_intercept=True, C = C * mult, multi_class = 'ovr')
    predicted = cross_validation.cross_val_predict(clf, train_matrix, y_s, cv=10)
    list_acc[C*mult]= metrics.accuracy_score(y_s, predicted)
    print("For C: ", C * mult)
    print(metrics.accuracy_score(y_s, predicted))
    print(metrics.classification_report(y_s, predicted)) 
    mult = mult * 10

For C:  1e-15
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1.0000000000000002e-14
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-13
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-12
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1.0000000000000001e-11
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-10
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-09
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-08
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1.0000000000000001e-07
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1.0000000000000002e-06
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-05
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  0.0001
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  0.001
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  0.01
0.5923945335710041
             precision    recall  f1-score   support

      agree       0.00      0.00      0.00       590
   disagree       0.00      0.00      0.00        96
    discuss       0.59      1.00      0.74       997

avg / total       0.35      0.59      0.44      1683



  'precision', 'predicted', average, warn_for)


For C:  0.1
0.6054664289958408
             precision    recall  f1-score   support

      agree       0.92      0.04      0.07       590
   disagree       0.00      0.00      0.00        96
    discuss       0.60      1.00      0.75       997

avg / total       0.68      0.61      0.47      1683



  'precision', 'predicted', average, warn_for)


For C:  1.0
0.750445632798574
             precision    recall  f1-score   support

      agree       0.71      0.63      0.67       590
   disagree       0.00      0.00      0.00        96
    discuss       0.77      0.89      0.83       997

avg / total       0.70      0.75      0.72      1683

For C:  10.0
0.7605466428995841
             precision    recall  f1-score   support

      agree       0.71      0.68      0.69       590
   disagree       0.50      0.12      0.20        96
    discuss       0.79      0.87      0.83       997

avg / total       0.75      0.76      0.75      1683

For C:  100.00000000000001
0.750445632798574
             precision    recall  f1-score   support

      agree       0.69      0.66      0.67       590
   disagree       0.52      0.18      0.26        96
    discuss       0.79      0.86      0.82       997

avg / total       0.74      0.75      0.74      1683

For C:  1000.0000000000001
0.7450980392156863
             precision    recall  f1-score 

In [20]:
list_acc
list_c=[]
for key in list_acc.keys():
    list_c.append(key)
for key in list_acc:
    print(key,'\t',list_acc[key])

1e-10 	 0.5923945335710041
1e-09 	 0.5923945335710041
1e-08 	 0.5923945335710041
1.0000000000000001e-07 	 0.5923945335710041
1e-06 	 0.5923945335710041
1e-05 	 0.5923945335710041
0.0001 	 0.5923945335710041
0.001 	 0.5923945335710041
0.01 	 0.5923945335710041
0.1 	 0.6054664289958408
1.0 	 0.750445632798574
10.0 	 0.7605466428995841
100.0 	 0.750445632798574
1000.0 	 0.7450980392156863
10000.0 	 0.7415329768270945
100000.0 	 0.7391562685680333
1000000.0 	 0.7367795603089721
10000000.0 	 0.7361853832442068
100000000.0 	 0.7361853832442068
1000000000.0 	 0.7361853832442068
10000000000.0 	 0.7361853832442068
