In [13]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from string import punctuation
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [14]:
train_df = pd.read_csv('train/train.csv')
train_info_df = pd.read_csv('train/information_train.csv', sep="\t")


In [15]:
print train_info_df.shape
print train_info_df.isnull().sum()

(3522, 7)
abstract            0
article_title       0
author_str        433
pmid                0
pub_date            0
set                 0
full_Text        3381
dtype: int64


In [16]:
def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]

    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    # Return a list of words
    return (text)
    

In [104]:
train_info_df['text'] = train_info_df['abstract'] + train_info_df['article_title']
train_info_df['process_text']=train_info_df['text'].apply(lambda row: text_to_wordlist(row))


In [219]:
def my_f1_score(y_true, y_pred):
    y_true = [str(element.strip().strip('\'')) for element in y_true.strip('[').strip(']').split(',')]
    y_true = set(y_true)
    if y_pred == 0:
        return 0
    else:
        y_pred = set(y_pred)
        precision = len(y_true.intersection(y_pred)) / len(y_pred) # TP/TP + FP
        recall = len(y_true.intersection(y_pred)) / len(y_true) # TP/TP + FN
        if (precision == 0 or recall == 0):
            return 0
        f1_score = 2 * precision * recall / (precision + recall)
        return f1_score

In [229]:
Tfidf=TfidfVectorizer()
train_tfidf=Tfidf.fit_transform(train_info_df.process_text)
sim_matr=cosine_similarity(train_tfidf)
ref_lengths_vs_scores = {}
train_final_df = train_info_df.merge(train_df, on='pmid', how='inner')
for column_length in range(1,100):
    top_matches = sim_matr.argsort()[:,-1*(column_length+1):-1]
    new_list=[]
    for index in range(0,train_info_df.shape[0]):
        pred_list=[]
        pred_pmid_setDF=train_final_df.loc[top_matches[index],['pmid','set']]
        pred_pmid_set_list = [tuple(x) for x in pred_pmid_setDF.values]
        #checks set of top matches and pmid in train file (should belong to same set)
        pmidset=train_final_df.loc[index,['pmid','set']]
        pmid_set_list= list(zip(pmidset,pmidset.index))
        # check here is sets match for predpmid and origionalpmid
        for pred_set in pred_pmid_set_list:
            if pred_set[1] == pmid_set_list[1][0]:
                pred_list.append(pred_set[0])
        
        
        if len(pred_list) == 0: # empty predictions
            new_list.append(0)
        else:
            pred_list = [str(pred) for pred in pred_list]
            new_list.append(pred_list)
    
    train_final_df['pred_list']=new_list
    train_final_df['f1_score']=train_final_df.apply(lambda row : my_f1_score(row.ref_list,row.pred_list),axis=1)
    ref_lengths_vs_scores[column_length] = np.average(train_final_df.f1_score)
    print (column_length,np.average(train_final_df.f1_score))
optimal_length = max([(value, key) for key, value in ref_lengths_vs_scores.items()])[1]
print optimal_length

(1, 0.07694491766042022)
(2, 0.0326519023282226)
(3, 0.019591141396933562)
(4, 0.011641113003975015)
(5, 0.011357183418512209)
(6, 0.010221465076660987)
(7, 0.008801817149346962)
(8, 0.008517887563884156)
(9, 0.008517887563884156)
(10, 0.007950028392958546)
(11, 0.0073821692220329355)
(12, 0.0068143100511073255)
(13, 0.00653038046564452)
(14, 0.006246450880181715)
(15, 0.006246450880181715)
(16, 0.006246450880181715)
(17, 0.00596252129471891)
(18, 0.005110732538330494)
(19, 0.004542873367404884)
(20, 0.003975014196479273)
(21, 0.004258943781942078)
(22, 0.0036910846110164677)
(23, 0.0036910846110164677)
(24, 0.0036910846110164677)
(25, 0.0036910846110164677)
(26, 0.0036910846110164677)
(27, 0.0028392958546280523)
(28, 0.0028392958546280523)
(29, 0.0028392958546280523)
(30, 0.0028392958546280523)
(31, 0.0028392958546280523)
(32, 0.0028392958546280523)
(33, 0.0028392958546280523)
(34, 0.0028392958546280523)
(35, 0.002555366269165247)
(36, 0.002555366269165247)
(37, 0.002271436683702442)
