#### head -1 data_combined_by_year/2016-train.csv > output.csv
#### tail -n +2  data_combined_by_year/*.csv >> output.csv

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import warnings
from gensim.models import Word2Vec
from sklearn.metrics import matthews_corrcoef

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from scipy import spatial


In [2]:
train = pd.read_csv('train.csv',index_col=False)
dev = pd.read_csv('dev.csv',index_col=False)

### cosine similarity between word2vec vectors

In [5]:
train_labels = train['similarity_score']
train_data = train.drop('similarity_score',axis=1)
test_labels = dev['similarity_score']
test_data = dev.drop('similarity_score',axis=1)

In [7]:
train_data['cleaned_S1'] = train_data['sentence1'].apply(lambda row: [val for sublist in text_to_sentence(row,tokenizer,True,False,True) for val in sublist])
train_data['cleaned_S2'] = train_data['sentence2'].apply(lambda row: [val for sublist in text_to_sentence(row,tokenizer,True,False,True) for val in sublist])

In [15]:
def normalize_text(text):
    norm_text = text.lower()
    norm_text = norm_text.replace('<br />', ' ')
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text

def tokenize(inputText, remove_stopwords=True,tagged=False, lemmatize=False):
    review_text = re.sub("[^a-zA-Z0-9]"," ", inputText)
    text = review_text.lower().split()
    words = []
    
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in text if not w in stops]
    else:
        return text
    if not tagged: 
        return(words)
    else:
        lem = WordNetLemmatizer()
        lemmatized_words = []
        tagged_words=[]
        filtered_tagged_words = []
        tagged_text = nltk.pos_tag(words)
        for word, tag in tagged_text:
            tagged_words.append({"word": word, "pos": tag})
        filtered_tagged_words = [word for word in tagged_words if word["pos"] in ["NN", "NNS","NNP","NNPS","VBD","VBG","VBN","VBP","VBZ"]]
        if lemmatize:
            for word in filtered_tagged_words:
                lemmatized_words.append(lem.lemmatize(word["word"]))
            return(lemmatized_words)
        else:
            temp=[]
            for entry in filtered_tagged_words:
                temp.append(entry["word"])
            return temp
        
def text_to_sentence(inputText,tokenizer,remove_stopwords=True,tagged=False,lemmatize=False):
    raw_sentences = tokenizer.tokenize(str(inputText).strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            temp2 = tokenize(raw_sentence,remove_stopwords,tagged,lemmatize)
            if len(temp2) > 0:
                sentences.append(temp2)
    return sentences

#Build word vector for training set by using the average value of all word vectors in the sentence, then scale
def buildWordVector(text, size, model):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            temp = model[word].reshape(1, size)
            vec += temp
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [None]:
sample = "Hello, My name is Tom. I am 20 years old with a love for books and food!!!"
sample= normalize_text(sample)
print(text_to_sentence(sample,tokenizer,True,True,True))

In [8]:
from sklearn.pipeline import make_pipeline
def PCA_model(samples):
    """
    Alternative to word2Vec for data vectorization
    """
    vectorizer = TfidfVectorizer()
    svd = TruncatedSVD(n_components=5, random_state=42)
    pca = make_pipeline(vectorizer, svd, Normalizer(copy=False))
    model = pca.fit(samples)
    return model

In [13]:
#returns 2 arrays with vectors for sentences
#requirements: GoogleNews-vectors-negative300.bin in same folder if using that model

dimsize=300
def createWord2vecFeature(sentence1_arr,sentence2_arr,dimsize=300,default_model="other"):
    #sentence_arr is the column containing cleaned sentences (sentence1[] & sentence2[])
    if default_model == "google":
        model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    else:
        corpus = sentence1_arr + sentence2_arr
        model = Word2Vec(corpus, size=dimsize, window=5, min_count=5, workers=4)
    train_S1=np.concatenate([buildWordVector(w,dimsize,model) for w in sentence1_arr])
    train_S2=np.concatenate([buildWordVector(w,dimsize,model) for w in sentence2_arr])
    return calculateSimilarity(train_S1,train_S2)

In [10]:
#returns similarity array for each entry in arr1 and arr2
from scipy import spatial
def calculateSimilarity(arr1,arr2):
    sims = []
    for vec1,vec2 in zip(arr1,arr2):
        sim = 1 - spatial.distance.cosine(vec1,vec2)
        sims.append(sim)
    return scaleSimilarity(sims)

In [17]:
import math
def scaleSimilarity(arr, scale_lower_bound=0, scale_upper_bound=5,write_to_file=False,file_name="temp.csv"):
    temp_arr=[]
    temp_val=0
    for s in arr:
        max_sim = max(arr)
        min_sim = min(arr)
        temp_val = (scale_upper_bound-scale_lower_bound)/(max_sim-min_sim)*(s-max_sim)+scale_upper_bound
        #check why nan
        if math.isnan(temp_val) : temp_val=0
        temp_arr.append(temp_val)
    if write_to_file:
        with open(file_name,'w') as fout: 
            for item in temp_arr:
                print(item,fout=file_name)
    else:
        return temp_arr

In [22]:
def TFIDFModel(sentence1_arr,sentence2_arr):
    temp_s1 =[]
    temp_s2 =[]
    items=[]
    
    corpus = sentence1_arr + sentence2_arr
    
    for item in corpus:
        items.append(" ".join(item))

    for item in sentence1_arr:
        temp_s1.append(" ".join(item))

    for item in sentence2_arr:
        temp_s2.append(" ".join(item))

    model = PCA_model(items)
    vec1 = model.transform(temp_s1)
    vec2 = model.transform(temp_s2)
    return calculateSimilarity(vec1, vec2)

In [18]:
s1 = createWord2vecFeature(train_data['cleaned_S1'],train_data['cleaned_S2'])
s2 = TFIDFModel(train_data['cleaned_S1'],train_data['cleaned_S2'])

In [21]:
for i in range(5):
    print(train_labels[i],s1[i],s2[i])

4.6 4.99858650928 4.99528756196
2.0 4.88620399506 4.79515297126
2.2 4.95829897141 4.72730047087
1.6 4.90017293757 4.66259643043
4.2 4.99640191964 4.95254318146
