In [1]:
import csv
import re
import string
from nltk import sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.stem import WordNetLemmatizer

In [2]:
#function for cleaning a article

def clean_data(body):
    punctuations = string.punctuation + '—' + '’' + '…' + '‘' + '–' + '”' + '“'
    regex = re.compile('[%s]' % re.escape(punctuations))
    clean_text=''
    for sentence in sent_tokenize(body):
        sentence=regex.sub('', sentence)
        sentence=re.sub(r"\n", " ", sentence)
        sentence = re.sub(r"i'm", "i am", sentence)
        sentence = re.sub(r"he's", "he is", sentence)
        sentence = re.sub(r"she's", "she is", sentence)
        sentence = re.sub(r"it's", "it is", sentence)
        sentence = re.sub(r"that's", "that is", sentence)
        sentence = re.sub(r"what's", "what is", sentence)
        sentence = re.sub(r"where's", "where is", sentence)
        sentence = re.sub(r"how's", "how is", sentence)
        sentence = re.sub(r"\'ll", " will", sentence)
        sentence = re.sub(r"\'ve", " have", sentence)
        sentence = re.sub(r"\'re", " are", sentence)
        sentence = re.sub(r"\'d", " would", sentence)
        sentence = re.sub(r"\'re", " are", sentence)
        sentence = re.sub(r"won't", "will not", sentence)
        sentence = re.sub(r"can't", "cannot", sentence)
        sentence = re.sub(r"n't", " not", sentence)
        sentence = re.sub(r"n'", "ng", sentence)
        sentence = re.sub(r"'bout", "about", sentence)
        sentence = re.sub(r"'til", "until", sentence)
        sentence = re.sub(' +',' ',sentence)
        clean_text=clean_text+' '+sentence
    return clean_text
    
#clean all articles
def clean_bodies(articles):
    clean_articles={}
    for i in articles:
        clean_articles[int(i)]=clean_data(articles[i])
    #print(clean_articles[0])
    clean_articles_list=[]
    for i in clean_articles:
        clean_articles_list.append(clean_articles[i])
    return(clean_articles_list)


#read the articles

def read_bodies(file_name):
    with open(file_name,encoding='latin1') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        articles={}
        for i in spamreader:
            articles[i[0]]=i[1]
    del articles['Body ID']
    clean_articles={}
    for i in articles:
        clean_articles[int(i)]=clean_data(articles[i])
    #print(clean_articles[0])
    clean_articles_list=[]
    for i in clean_articles:
        clean_articles_list.append(clean_articles[i])
    return clean_articles,clean_articles_list

#create vocabulary

def generate_vocab(list_articles):
    single_string=''
    for article in list_articles:
        single_string=single_string + ' ' + article
    stop_words = set(stopwords.words('english'))
    vocab=set(single_string.split())
    
    vocab = [w for w in vocab if not w in stop_words]
   
    wordnet_lemmatizer = WordNetLemmatizer()
    new_vocab=[]
    for w in vocab:
        new_vocab.append(wordnet_lemmatizer.lemmatize(w))
    new_vocab=set(new_vocab)
    return vocab

#read title and stances into dictionaries
def read_title_stances(filename):
    with open(filename,encoding='latin1') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        stances={}
        title={}
        for i in spamreader:
            stances[i[1]]=i[2]
            title[i[1]]=i[0]
        del title['Body ID']
        del stances['Body ID']
        title = {int(k):v for k,v in title.items()}
        stances = {int(k):v for k,v in stances.items()}
    return title,stances

#create tf-idf matrix of features
def generate_matrix(article_list,vocabulary):
    vectorizer = TfidfVectorizer(vocabulary=vocabulary)
    t=vectorizer.fit_transform(article_list)
    vect_articles=t.toarray()
    #print(vect_articles)
    return vect_articles

#convert binary lables- 1 for selected stance 0 for rest
def convert_labels(article,stances,selected_stance):
    y_ids=[]
    for i in article:
        y_ids.append(i)
    stance_rearranged=[]
    for i in y_ids:
        stance_rearranged.append(stances[i])
    y_stance_label=[]
    for s in stance_rearranged:
        if s==selected_stance:
            y_stance_label.append(1)
        else :
            y_stance_label.append(0)
    return y_stance_label

#calculate accuracy of prediction
def calculate_accuracy(predictions,labels):
    count=0
    for i in range(len(predictions)):
        if predictions[i]==labels[i]:
            count+=1
    #print(sum(predictions),'\n',sum(labels))
    return count/len(labels)

#reorder stances by article bodies
def order_stance_by_body(article,stances):
    y_ids=[]
    for i in article:
        y_ids.append(i)
    stance_rearranged=[]
    for i in y_ids:
        stance_rearranged.append(stances[i])
    return stance_rearranged

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def logistic_regression(training_features, label, num_steps, learning_rate,c ,add_intercept = False):
    if add_intercept:
        intercept = np.ones((training_features.shape[0], 1))
        training_features = np.hstack((intercept, training_features))
    
    weights = np.zeros(training_features.shape[1])
    
    for step in range(num_steps):
        scores = np.dot(training_features, weights)
        predictions = sigmoid(scores)

        # Update weights with gradient
        output_error = label - predictions
        gradient = np.dot(training_features.T, output_error)+ (2*c*weights)
        weights += learning_rate * gradient
        
    return weights[0],weights[1:]

#calculate final scores for prediction
def calculate_results(x,weight,intercept):
    #data_with_intercept = np.hstack((np.ones((x.shape[0], 1)), x))                                
    final_scores = np.dot(x, np.transpose(weight))+intercept
    predictions=[]
    for score in final_scores: 
        predictions.append(sigmoid(score))
    #final_scores.sigmoid()
    return predictions

#calculate weights and intercept
def calulate_weights(selected_stance,articles_train,stances,vect_articles):

    y_stance_label=convert_labels(article=articles_train,selected_stance = selected_stance, stances=stances)

    intercept , weights = logistic_regression(training_features=vect_articles,label=y_stance_label,
                         num_steps = 4000, learning_rate = 5e-6, add_intercept=True , c= 0.1 )
    
    return intercept , weights

#order the titles by article bodies
def order_title_by_body(article,title):
    y_ids=[]
    for i in article:
        y_ids.append(i)
    title_rearranged=[]
    for i in y_ids:
        title_rearranged.append(title[i])
    return title_rearranged

#append the title to the article body
def append_title(article_list,title):
    for i in range(len(article_list)):
        article_list[i] = article_list[i]+' '+ title[i]
    return article_list
        

In [3]:
articles_train , article_list_train = read_bodies('train_bodies.csv')

title_train , stances_train = read_title_stances('train_stances_unrelated.csv')

vocab_train = generate_vocab(article_list_train)

title_train_rearranged = order_title_by_body(article = articles_train , title = title_train)

article_list_train = append_title(article_list = article_list_train , title = title_train_rearranged )

train_matrix = generate_matrix(article_list=article_list_train,vocabulary=vocab_train)


In [4]:
articles_test , article_list_test = read_bodies('competition_test_bodies.csv')

title_test , stances_test =read_title_stances(filename='competition_test_stances_unrelated.csv')

title_test_rearranged = order_title_by_body(article = articles_test , title = title_test )

article_list_test = append_title(article_list = article_list_test , title = title_test_rearranged )

test_matrix=generate_matrix(article_list = article_list_test , vocabulary=vocab_train)



In [33]:
#result using sklearn
from sklearn.linear_model import LogisticRegression

test_s=order_stance_by_body(article= articles_test , stances = stances_test)

y_s=order_stance_by_body(article=articles_train,stances=stances_train)

clf = LogisticRegression(fit_intercept=True, C = 0.10,multi_class = 'ovr')

clf.fit(train_matrix, y_s)

pred = clf.predict(X=test_matrix)

print(clf.intercept_, clf.coef_)

print('Accuracy from sk-learn: {0}'.format(calculate_accuracy(labels = test_s , predictions = pred)))

[-0.09753355] [[ 0.00758028 -0.00763406  0.00248961 ... -0.02043477  0.
   0.        ]]
Accuracy from sk-learn: 0.5276548672566371


In [5]:
intercept_unrelated , w_unrelated = calulate_weights(selected_stance = 'unrelated' , articles_train = articles_train ,
                                                     stances=stances_train , vect_articles= train_matrix)

intercept_related , w_related = calulate_weights(selected_stance = 'related' , articles_train = articles_train ,
                                                     stances=stances_train , vect_articles= train_matrix)


In [6]:
score_unrelated = calculate_results ( weight = w_unrelated , x = test_matrix , intercept = intercept_unrelated )

score_related = calculate_results ( weight = w_related , x = test_matrix , intercept = intercept_related )



In [7]:
result=[]

for i in range(len(score_related)):

    m=max(score_unrelated[i],score_related[i])
    
    if m==score_unrelated[i]:
        result.append('unrelated')
    
    else:
        result.append('related')
  

In [8]:
#result using logistic regression function
actual_stance = order_stance_by_body( article = articles_test , stances = stances_test)

acc=calculate_accuracy( labels = actual_stance , predictions = result ) 

print('Final Accuracy with 1 vs all Logistic Regression = ',acc)

Final Accuracy with 1 vs all Logistic Regression =  0.5309734513274337


In [21]:
print(len(vocab_train))

33795


In [23]:
from sklearn import metrics, cross_validation
mult = 1
C = 1e-10
list_acc={}
while C * mult <= 1e10:
    clf = LogisticRegression(fit_intercept=True, C = C * mult, multi_class = 'ovr')
    predicted = cross_validation.cross_val_predict(clf, train_matrix, y_s, cv=10)
    list_acc[C * mult]= metrics.accuracy_score(y_s, predicted)
    print("For C: ", C * mult)
    print(metrics.accuracy_score(y_s, predicted))
    print(metrics.classification_report(y_s, predicted)) 
    mult = mult * 10

For C:  1e-10
0.5181224004753416
             precision    recall  f1-score   support

    related       0.52      1.00      0.68       872
  unrelated       0.00      0.00      0.00       811

avg / total       0.27      0.52      0.35      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-09
0.5181224004753416
             precision    recall  f1-score   support

    related       0.52      1.00      0.68       872
  unrelated       0.00      0.00      0.00       811

avg / total       0.27      0.52      0.35      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-08
0.5181224004753416
             precision    recall  f1-score   support

    related       0.52      1.00      0.68       872
  unrelated       0.00      0.00      0.00       811

avg / total       0.27      0.52      0.35      1683



  'precision', 'predicted', average, warn_for)


For C:  1.0000000000000001e-07
0.5181224004753416
             precision    recall  f1-score   support

    related       0.52      1.00      0.68       872
  unrelated       0.00      0.00      0.00       811

avg / total       0.27      0.52      0.35      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-06
0.5181224004753416
             precision    recall  f1-score   support

    related       0.52      1.00      0.68       872
  unrelated       0.00      0.00      0.00       811

avg / total       0.27      0.52      0.35      1683



  'precision', 'predicted', average, warn_for)


For C:  1e-05
0.5181224004753416
             precision    recall  f1-score   support

    related       0.52      1.00      0.68       872
  unrelated       0.00      0.00      0.00       811

avg / total       0.27      0.52      0.35      1683



  'precision', 'predicted', average, warn_for)


For C:  0.0001
0.5181224004753416
             precision    recall  f1-score   support

    related       0.52      1.00      0.68       872
  unrelated       0.00      0.00      0.00       811

avg / total       0.27      0.52      0.35      1683



  'precision', 'predicted', average, warn_for)


For C:  0.001
0.5181224004753416
             precision    recall  f1-score   support

    related       0.52      1.00      0.68       872
  unrelated       0.00      0.00      0.00       811

avg / total       0.27      0.52      0.35      1683



  'precision', 'predicted', average, warn_for)


For C:  0.01
0.5181224004753416
             precision    recall  f1-score   support

    related       0.52      1.00      0.68       872
  unrelated       0.00      0.00      0.00       811

avg / total       0.27      0.52      0.35      1683



  'precision', 'predicted', average, warn_for)


For C:  0.1
0.5187165775401069
             precision    recall  f1-score   support

    related       0.52      0.84      0.64       872
  unrelated       0.50      0.17      0.25       811

avg / total       0.51      0.52      0.46      1683

For C:  1.0
0.5513963161021984
             precision    recall  f1-score   support

    related       0.56      0.64      0.60       872
  unrelated       0.54      0.45      0.49       811

avg / total       0.55      0.55      0.55      1683

For C:  10.0
0.5460487225193108
             precision    recall  f1-score   support

    related       0.56      0.61      0.58       872
  unrelated       0.53      0.48      0.51       811

avg / total       0.54      0.55      0.54      1683

For C:  100.0
0.5353535353535354
             precision    recall  f1-score   support

    related       0.55      0.59      0.57       872
  unrelated       0.52      0.48      0.50       811

avg / total       0.53      0.54      0.53      1683

For C:  1000.

In [25]:
list_acc
for key in list_acc:
    print(key,'\t',list_acc[key])

1e-10 	 0.5181224004753416
1e-09 	 0.5181224004753416
1e-08 	 0.5181224004753416
1.0000000000000001e-07 	 0.5181224004753416
1e-06 	 0.5181224004753416
1e-05 	 0.5181224004753416
0.0001 	 0.5181224004753416
0.001 	 0.5181224004753416
0.01 	 0.5181224004753416
0.1 	 0.5187165775401069
1.0 	 0.5513963161021984
10.0 	 0.5460487225193108
100.0 	 0.5353535353535354
1000.0 	 0.5353535353535354
10000.0 	 0.5311942959001783
100000.0 	 0.5252525252525253
1000000.0 	 0.5210932857991681
10000000.0 	 0.5175282234105764
100000000.0 	 0.5199049316696376
1000000000.0 	 0.5193107546048723
10000000000.0 	 0.5193107546048723
