#### head -1 data_combined_by_year/2016-train.csv > output.csv
#### tail -n +2  data_combined_by_year/*.csv >> output.csv

In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import warnings
from gensim.models import Word2Vec
from sklearn.metrics import matthews_corrcoef

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from scipy import spatial




In [3]:
# Load the training and development splits from the working directory.
# index_col=False tells pandas not to treat the first CSV column as the row index.
train = pd.read_csv('train.csv',index_col=False)
dev = pd.read_csv('dev.csv',index_col=False)

(11222, 5) (11222, 1)


### cosine similarity between word2vec vectors

In [48]:
# Separate the target column ('similarity_score') from the remaining columns.
# The dev split plays the role of the test set in this notebook.
train_labels = train['similarity_score']
train_data = train.drop('similarity_score',axis=1)
test_labels = dev['similarity_score']
test_data = dev.drop('similarity_score',axis=1)

In [49]:
# Token lists with stop words removed (remove_stopwords=True, tagged=False).
# text_to_sentence returns one token list per detected sentence; the nested
# comprehension flattens them into a single flat list per row.
# Because tagged=False, tokenize returns before the lemmatize flag is consulted,
# so the trailing True has no effect here.
train_data['cleaned_S1'] = train_data['sentence1'].apply(lambda row: [val for sublist in text_to_sentence(row,tokenizer,True,False,True) for val in sublist])
train_data['cleaned_S2'] = train_data['sentence2'].apply(lambda row: [val for sublist in text_to_sentence(row,tokenizer,True,False,True) for val in sublist])

In [50]:
# Same flattening as the cleaned_S* columns, but with tagged=True and
# lemmatize=True: only tokens whose POS tag is in tokenize's POS_TAGS list are
# kept, and each kept token is passed through the WordNet lemmatizer.
train_data['cleaned_tagged_S1'] = train_data['sentence1'].apply(lambda row: [val for sublist in text_to_sentence(row,tokenizer,True,True,True) for val in sublist])
train_data['cleaned_tagged_S2'] = train_data['sentence2'].apply(lambda row: [val for sublist in text_to_sentence(row,tokenizer,True,True,True) for val in sublist])

In [89]:
# Reuse a previously exported word2vec feature instead of recomputing it.
# NOTE(review): the column assignment aligns on the row index, so this assumes
# the CSV rows are in the same order as train_data - confirm.
word2vec_feat = pd.read_csv('word2vec-feature-train.csv',index_col=False)
train_data['word2vec_cosine']=word2vec_feat['word2vec_feature']

In [37]:
def normalize_text(text):
    """Lower-case ``text`` and isolate punctuation marks as separate tokens.

    The literal HTML line break ``<br />`` is replaced by a single space, and
    each of the characters ``. " , ( ) ! ? ; :`` is padded with one space on
    either side so that a later whitespace split yields it on its own.

    Returns the normalised string.
    """
    # One translation table replaces the per-character replace() passes; the
    # padding only introduces spaces, so a single pass gives the same result.
    padded = {ord(mark): ' ' + mark + ' ' for mark in '.",()!?;:'}
    lowered = text.lower().replace('<br />', ' ')
    return lowered.translate(padded)

def tokenize(inputText, remove_stopwords=True,tagged=False, lemmatize=False):
    """Split ``inputText`` into lower-case alphanumeric tokens.

    Every character outside ``[a-zA-Z0-9]`` is treated as a separator.

    Parameters
    ----------
    inputText : str
        Raw text of (typically) one sentence.
    remove_stopwords : bool
        Drop tokens found in NLTK's English stop-word list.
    tagged : bool
        POS-tag the remaining tokens and keep only those whose tag is in
        ``POS_TAGS`` (noun tags and inflected-verb tags).
    lemmatize : bool
        Only consulted when ``tagged`` is true: pass each kept token through
        ``WordNetLemmatizer.lemmatize``.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.

    Previously ``remove_stopwords=False`` returned straight away, so ``tagged``
    and ``lemmatize`` were silently ignored for that combination. The flags
    are now independent of each other.
    """
    # NOTE(review): the base-form verb tag "VB" is not in this list while the
    # inflected verb tags are - confirm whether that is intentional.
    POS_TAGS = ("NN", "NNS", "NNP", "NNPS", "VBD", "VBG", "VBN", "VBP", "VBZ")
    words = re.sub("[^a-zA-Z0-9]", " ", inputText).lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if not tagged:
        return words

    kept = [word for word, tag in nltk.pos_tag(words) if tag in POS_TAGS]
    if not lemmatize:
        return kept

    # lemmatize() is called without a POS argument, exactly as before.
    lem = WordNetLemmatizer()
    return [lem.lemmatize(word) for word in kept]
        
def text_to_sentence(inputText,tokenizer,remove_stopwords=True,tagged=False,lemmatize=False):
    """Split ``inputText`` into sentences and tokenize each one.

    ``inputText`` is coerced with ``str()`` and stripped, then cut into raw
    sentences by ``tokenizer.tokenize``. Each non-empty raw sentence is passed
    to ``tokenize`` with the given flags.

    Returns a list of token lists, one per sentence. Sentences that yield no
    tokens are left out.
    """
    token_lists = (
        tokenize(chunk, remove_stopwords, tagged, lemmatize)
        for chunk in tokenizer.tokenize(str(inputText).strip())
        if len(chunk) > 0
    )
    return [tokens for tokens in token_lists if len(tokens) > 0]

#Build a sentence vector by averaging the vectors of all of its in-vocabulary words
def buildWordVector(text, size, model):
    """Return the mean word vector of ``text`` as a ``(1, size)`` array.

    ``text`` is an iterable of tokens and ``model`` is anything that supports
    ``model[token]`` and raises ``KeyError`` for unknown tokens. Unknown tokens
    are skipped; if no token is known, the result is all zeros.
    """
    total = np.zeros((1, size))
    hits = 0.
    for token in text:
        try:
            row = model[token].reshape(1, size)
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the average.
            continue
        total += row
        hits += 1.
    return total / hits if hits != 0 else total

def POS_tag(text):
    """Return the POS tags of the whitespace-separated tokens of ``text``.

    The tokens are tagged with ``nltk.pos_tag``; only the tags are returned,
    in token order.
    """
    return [tag for _word, tag in nltk.pos_tag(text.split())]

In [None]:
# Smoke test of the preprocessing chain: pad the punctuation, then print the
# per-sentence token lists with stop-word removal, POS filtering and
# lemmatization all switched on.
sample = "Hello, My name is Tom. I am 20 years old with a love for books and food!!!"
sample= normalize_text(sample)
print(text_to_sentence(sample,tokenizer,True,True,True))

In [11]:
from sklearn.pipeline import make_pipeline
def PCA_model(samples):
    """Fit a TF-IDF -> TruncatedSVD -> Normalizer pipeline on ``samples``.

    ``samples`` is an iterable of text documents. The SVD step keeps 5
    components and uses a fixed ``random_state`` of 42.

    Returns the fitted pipeline; call ``.transform`` on it to embed documents.
    """
    #Alternative to word2Vec for data vectorization
    stages = (
        TfidfVectorizer(),
        TruncatedSVD(n_components=5, random_state=42),
        Normalizer(copy=False),
    )
    return make_pipeline(*stages).fit(samples)

In [12]:
#returns one similarity score per sentence pair, computed from averaged word2vec vectors
#requirements: GoogleNews-vectors-negative300.bin in same folder if using that model

dimsize=300
def createWord2vecFeature(sentence1_arr,sentence2_arr,dimsize=300,default_model="other"):
    """Score each sentence pair by the cosine similarity of its mean word vectors.

    Parameters
    ----------
    sentence1_arr, sentence2_arr : sequence of token lists
        Cleaned sentences; position ``k`` of one is paired with position ``k``
        of the other.
    dimsize : int
        Dimensionality of the word vectors.
    default_model : str
        ``"google"`` loads the pre-trained GoogleNews vectors from the working
        directory; any other value trains a Word2Vec model on the given
        sentences.

    Returns
    -------
    list
        Whatever ``calculateSimilarity`` returns with its default arguments.
    """
    # NOTE(review): load_word2vec_format on Word2Vec and the `size=` keyword are
    # the gensim API this notebook was written against - confirm the installed
    # gensim version still provides them.
    if default_model == "google":
        model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    else:
        # list(...) forces real concatenation. `+` on two pandas Series adds
        # element-wise, which would merge each sentence pair into one training
        # sentence instead of building a corpus of all sentences.
        corpus = list(sentence1_arr) + list(sentence2_arr)
        model = Word2Vec(corpus, size=dimsize, window=5, min_count=5, workers=4)
    train_S1=np.concatenate([buildWordVector(w,dimsize,model) for w in sentence1_arr])
    train_S2=np.concatenate([buildWordVector(w,dimsize,model) for w in sentence2_arr])
    return calculateSimilarity(train_S1,train_S2)

In [83]:
# returns similarity array for each entry in arr1 and arr2
# spatial.distance.cosine computes the distance, and not the similarity. So, you must subtract the value from 1 to get the similarity.
from scipy import spatial
def calculateSimilarity(arr1,arr2,scale_flag=True):
    """Return the cosine similarity of each pair of vectors from ``arr1`` and ``arr2``.

    The two inputs are walked in lockstep with ``zip``, so the result has as
    many entries as the shorter one. With ``scale_flag`` true the raw
    similarities are passed through ``scaleSimilarity`` before being returned;
    otherwise the raw list is returned.
    """
    cosine_distance = spatial.distance.cosine
    # cosine() is a distance; 1 - distance turns it into a similarity.
    similarities = [1 - cosine_distance(a, b) for a, b in zip(arr1, arr2)]
    if not scale_flag:
        return similarities
    return scaleSimilarity(similarities)

In [14]:
import math
def scaleSimilarity(arr, scale_lower_bound=0, scale_upper_bound=5,write_to_file=False,file_name="temp.csv"):
    """Linearly rescale ``arr`` so its minimum maps to the lower bound and its maximum to the upper bound.

    Parameters
    ----------
    arr : iterable of float
        Raw similarity values; NaN entries are allowed.
    scale_lower_bound, scale_upper_bound : number
        Target range of the rescaled values.
    write_to_file : bool
        When true, also write the scaled values to ``file_name``, one per line.
    file_name : str
        Destination used when ``write_to_file`` is true.

    Returns
    -------
    list
        The scaled values, in input order. NaN inputs become 0, and so does
        every value when all non-NaN inputs are equal (no range to scale over).
        The list is now returned in the ``write_to_file`` case as well.
    """
    values = list(arr)
    # Leave NaN out of the range so that one undefined similarity cannot
    # poison max()/min() for every other value.
    valid = [v for v in values if not math.isnan(v)]
    if valid:
        # Computed once instead of once per element.
        max_sim = max(valid)
        spread = max_sim - min(valid)
    else:
        max_sim = spread = 0
    width = scale_upper_bound - scale_lower_bound

    scaled = []
    for s in values:
        if spread == 0 or math.isnan(s):
            scaled.append(0)
        else:
            scaled.append(width / spread * (s - max_sim) + scale_upper_bound)

    if write_to_file:
        with open(file_name, 'w') as fout:
            for item in scaled:
                # print() takes the destination stream as `file=`.
                print(item, file=fout)
    return scaled

In [84]:
def TFIDFModel(sentence1_arr,sentence2_arr,scale_flag=True):
    """Score each sentence pair by cosine similarity in the ``PCA_model`` space.

    Parameters
    ----------
    sentence1_arr, sentence2_arr : sequence of token lists
        Cleaned sentences; position ``k`` of one is paired with position ``k``
        of the other.
    scale_flag : bool
        Forwarded to ``calculateSimilarity``.

    Returns
    -------
    list
        Whatever ``calculateSimilarity`` returns for the two embedded sets.
    """
    docs_s1 = [" ".join(tokens) for tokens in sentence1_arr]
    docs_s2 = [" ".join(tokens) for tokens in sentence2_arr]

    # Fit on every sentence from both sides. The corpus is built from the
    # joined strings with list concatenation; `+` on two pandas Series adds
    # element-wise and would fit the model on merged sentence pairs instead.
    model = PCA_model(docs_s1 + docs_s2)
    vec1 = model.transform(docs_s1)
    vec2 = model.transform(docs_s2)
    return calculateSimilarity(vec1, vec2,scale_flag)

In [85]:
# The word2vec variant is left disabled; the 'word2vec_cosine' column is read
# from a precomputed CSV elsewhere in this notebook.
#s1 = createWord2vecFeature(train_data['cleaned_S1'],train_data['cleaned_S2'])
# Unscaled (scale_flag=False) cosine similarity in the TF-IDF/SVD space.
train_data['TFIDF_cosine']=TFIDFModel(train_data['cleaned_S1'],train_data['cleaned_S2'], scale_flag=False)

In [30]:
def compute_jaccard_index(set_1, set_2):
    """Return the Jaccard index ``|A & B| / |A | B|`` of two sets.

    Raises ``ZeroDivisionError`` when both sets are empty.
    """
    shared = len(set_1.intersection(set_2))
    # |A | B| = |A| + |B| - |A & B|
    combined = len(set_1) + len(set_2) - shared
    return shared / float(combined)

def jaccardFeature(arr1,arr2,feature_type):
    """Write the Jaccard index of each pair into ``train_data[feature_type]``.

    ``arr1[i]`` and ``arr2[i]`` are converted to sets and compared. Pairs for
    which both sets are empty have no defined index; they are stored as 0 and
    counted, and the count is printed at the end.

    The result is written into the module-level ``train_data`` frame at row
    label ``i``; nothing is returned.
    """
    missing_cnt = 0
    for i in range(len(arr1)):
        try:
            score = compute_jaccard_index(set(arr1[i]),set(arr2[i]))
        except ZeroDivisionError:
            missing_cnt += 1
            score = 0
        # .loc replaces the removed .ix indexer; both address row label i.
        train_data.loc[i,feature_type] = score
    print('Missing values count for type:',feature_type," is ",missing_cnt)

In [58]:
def LCS_length(list_1, list_2):
    """Return the length of the longest common subsequence of two sequences.

    Elements are compared with ``==``. Either sequence may be empty, in which
    case the result is 0.
    """
    # Only the previous row of the DP table is needed to compute the next one.
    previous = [0] * (len(list_2) + 1)
    for item_a in list_1:
        current = [0]
        for col, item_b in enumerate(list_2, 1):
            if item_a == item_b:
                current.append(previous[col - 1] + 1)
            else:
                current.append(max(current[col - 1], previous[col]))
        previous = current
    return previous[-1]

def LCS_feature(arr1, arr2):
    """Write the normalised LCS length of each pair into ``train_data['longest_common_subseq']``.

    The score is the LCS length of ``arr1[i]`` and ``arr2[i]`` divided by the
    mean of their two lengths. When both are empty the score is stored as 0,
    matching how the Jaccard and Dice features treat empty pairs (it used to
    come out as NaN).

    The result is written into the module-level ``train_data`` frame at row
    label ``i``; nothing is returned.
    """
    for i in range(len(arr1)):
        temp_avg = np.mean([len(arr1[i]),len(arr2[i])])
        if temp_avg == 0:
            score = 0
        else:
            score = LCS_length(arr1[i],arr2[i])/temp_avg
        # .loc replaces the removed .ix indexer; both address row label i.
        train_data.loc[i,'longest_common_subseq'] = score

In [78]:
def compute_dice_coefficient(set_1, set_2):
    """Return the Dice coefficient ``2 * |A & B| / (|A| + |B|)`` of two sets.

    Raises ``ZeroDivisionError`` when both sets are empty.
    """
    shared = len(set_1.intersection(set_2))
    total_size = float(len(set_1) + len(set_2))
    return 2 * shared / total_size

def diceFeature(arr1,arr2):
    """Write the Dice coefficient of each pair into ``train_data['dice_coefficient']``.

    ``arr1[i]`` and ``arr2[i]`` are converted to sets and compared. Pairs for
    which both sets are empty have no defined coefficient; they are stored as
    0 and counted, and the count is printed at the end.

    The result is written into the module-level ``train_data`` frame at row
    label ``i``; nothing is returned.
    """
    missing_cnt = 0
    for i in range(len(arr1)):
        try:
            score = compute_dice_coefficient(set(arr1[i]),set(arr2[i]))
        except ZeroDivisionError:
            missing_cnt += 1
            score = 0
        # .loc replaces the removed .ix indexer; both address row label i.
        train_data.loc[i,'dice_coefficient'] = score
    print('Missing values count for dice_coefficient:'," is ",missing_cnt)

In [59]:
# Each call below writes one feature column into the global train_data frame.
# LCS, word-level Jaccard and Dice use the stop-word-filtered token lists.
LCS_feature(train_data['cleaned_S1'],train_data['cleaned_S2'])
jaccardFeature(train_data['cleaned_S1'],train_data['cleaned_S2'],'word_jaccard')
# POS-level Jaccard compares the sets of POS tags of the raw, uncleaned sentences.
jaccardFeature(train_data['sentence1'].apply(POS_tag), train_data['sentence2'].apply(POS_tag),'POS_jaccard')
diceFeature(train_data['cleaned_S1'],train_data['cleaned_S2'])

In [91]:
# newDF is a second name for the same DataFrame object (no copy is made), so
# adding 'original_score' through it also adds the column to train_data.
newDF = train_data
newDF['original_score'] = train_labels
# Persist the frame - engineered features plus the original score - to disk.
train_data.to_csv('train-modified.csv');