#### head -1 data_combined_by_year/2016-train.csv > output.csv
#### tail -n +2  data_combined_by_year/*.csv >> output.csv

In [53]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import warnings
from gensim.models import Word2Vec
from sklearn.metrics import matthews_corrcoef

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier
# NLTK Punkt sentence splitter for English, loaded once for the whole notebook;
# it is the `tokenizer` argument handed to text_to_sentence() in later cells.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [54]:
# Load the training and development splits. index_col=False stops pandas
# from promoting the first CSV column to the DataFrame index.
train, dev = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ('train.csv', 'dev.csv')
)

### Cosine similarity between the corresponding sentence vectors (heading mentions per-word TF-IDF vectors; the cells below build averaged Word2Vec vectors instead)

In [55]:
# Separate the target column ('similarity_score') from the remaining columns.
# The dev split plays the role of the test set in this notebook.
train_labels, train_data = train['similarity_score'], train.drop('similarity_score', axis=1)
test_labels, test_data = dev['similarity_score'], dev.drop('similarity_score', axis=1)

In [56]:
def normalize_text(text):
    """Lowercase *text*, blank out ``<br />`` tags and isolate punctuation.

    Each of the characters . " , ( ) ! ? ; : is surrounded by a space on
    both sides so that it stands apart from the neighbouring word.

    Args:
        text: the raw string to normalise.

    Returns:
        The normalised string.
    """
    # One translation table does the job of nine chained .replace() calls.
    padding = str.maketrans({mark: ' ' + mark + ' ' for mark in '.",()!?;:'})
    lowered = text.lower().replace('<br />', ' ')
    return lowered.translate(padding)

# Penn Treebank tags kept when tagged=True (singular, plural and proper nouns).
_NOUN_TAGS = ("NN", "NNS", "NNP")

# Cache for the stop-word set so the NLTK corpus is read only once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def tokenize(inputText, remove_stopwords=True,tagged=False, lemmatize=False):
    """Split a string into lowercase alphanumeric tokens, optionally filtered.

    Every character outside ``[a-zA-Z0-9]`` is treated as a separator, so
    punctuation and non-ASCII letters never appear in the result.

    Args:
        inputText: the string to tokenise.
        remove_stopwords: drop tokens found in NLTK's English stop-word list.
        tagged: POS-tag the remaining tokens with ``nltk.pos_tag`` and keep
            only those tagged NN, NNS or NNP.
        lemmatize: only honoured when ``tagged`` is true; the kept nouns are
            lemmatised with ``WordNetLemmatizer``.

    Returns:
        A list of token strings (possibly empty).
    """
    words = re.sub("[^a-zA-Z0-9]", " ", inputText).lower().split()

    # Previously remove_stopwords=False returned here, silently ignoring
    # `tagged` and `lemmatize`; the options are now independent of each other.
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    if not tagged:
        return words

    nouns = [word for word, tag in nltk.pos_tag(words) if tag in _NOUN_TAGS]
    if not lemmatize:
        return nouns

    lem = WordNetLemmatizer()
    return [lem.lemmatize(word) for word in nouns]
        
def text_to_sentence(inputText,tokenizer,remove_stopwords=True,tagged=False,lemmatize=False):
    """Split a text into sentences and tokenise each sentence.

    Args:
        inputText: the text to process; non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as empty text.
        tokenizer: object exposing ``tokenize(text)`` that returns a sequence
            of sentence strings.
        remove_stopwords, tagged, lemmatize: forwarded unchanged to
            ``tokenize`` for every sentence.

    Returns:
        A list of token lists, one per sentence; sentences that produce no
        tokens are omitted.
    """
    # A missing value would otherwise be stringified ("None" / "nan") and end
    # up in the corpus as if it were a real word.
    if inputText is None or (isinstance(inputText, float) and inputText != inputText):
        return []

    sentences = []
    for raw_sentence in tokenizer.tokenize(str(inputText).strip()):
        if not raw_sentence:
            continue
        tokens = tokenize(raw_sentence, remove_stopwords, tagged, lemmatize)
        if tokens:
            sentences.append(tokens)
    return sentences

def avg_word_vectors(wordlist,size):
    """Average the embedding vectors of the words in *wordlist*.

    Words that are missing from the module-level ``model`` vocabulary are
    skipped.

    Args:
        wordlist: iterable of word strings.
        size: dimensionality of the embedding vectors.

    Returns:
        A ``(1, size)`` array holding the mean vector of the known words, or
        all zeros when none of the words is in the vocabulary.
    """
    # Newer gensim releases expose lookups on `model.wv` and no longer allow
    # `word in model` / `model[word]`; fall back to the model itself when the
    # attribute is absent (old gensim, or a plain mapping of word -> vector).
    vectors = getattr(model, 'wv', model)

    sumvec = np.zeros(shape=(1, size))
    wordcnt = 0
    for w in wordlist:
        if w in vectors:
            sumvec += vectors[w]
            wordcnt += 1

    if wordcnt == 0:
        return sumvec
    return sumvec / wordcnt

In [57]:
# Quick smoke test of the preprocessing chain on a hand-written example:
# normalise, split into sentences, then keep lemmatised nouns only.
sample = normalize_text("Hello, My name is Tom. I am 20 years old with a love for books and food!!!")
print(text_to_sentence(sample, tokenizer, remove_stopwords=True, tagged=True, lemmatize=True))

[['hello', 'name', 'tom'], ['year', 'book', 'food']]


In [58]:
# Tokenise both sentence columns. text_to_sentence() returns one token list
# per sentence, so the nested lists are flattened into a single token list
# per row.
for source_col, cleaned_col in (('sentence1', 'cleaned_S1'), ('sentence2', 'cleaned_S2')):
    train_data[cleaned_col] = train_data[source_col].apply(
        lambda text: [token
                      for sentence in text_to_sentence(text, tokenizer)
                      for token in sentence]
    )

In [59]:
# Training corpus for Word2Vec: one entry per sentence (all first sentences
# followed by all second sentences). Adding the two Series with `+` would
# instead concatenate the token lists row by row, gluing each sentence pair
# into one pseudo-sentence so context windows span the sentence boundary.
corpus = train_data['cleaned_S1'].tolist() + train_data['cleaned_S2'].tolist()

In [60]:
# Alternative kept for reference: load pre-trained GoogleNews vectors instead
# of training embeddings on this corpus.
#model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# Embedding dimensionality; also passed to avg_word_vectors() in later cells.
dimsize=200
# Train Word2Vec on the cleaned training sentences. `model` is read as a
# global by avg_word_vectors().
# NOTE(review): `size=` is the pre-4.0 gensim keyword — confirm the installed
# gensim version before upgrading.
model = Word2Vec(corpus, size=dimsize, window=5, min_count=5, workers=4)

In [61]:
# Stack the (1, dimsize) averaged embeddings of every row into one
# (n_rows, dimsize) matrix per sentence column.
train_S1 = np.vstack([avg_word_vectors(tokens, dimsize) for tokens in train_data['cleaned_S1']])
train_S2 = np.vstack([avg_word_vectors(tokens, dimsize) for tokens in train_data['cleaned_S2']])

In [62]:
from scipy import spatial

# Cosine similarity (1 - cosine distance) between the two sentence vectors
# of each row, in row order.
sims = [
    1 - spatial.distance.cosine(first_vec, second_vec)
    for first_vec, second_vec in zip(train_S1, train_S2)
]

In [72]:
import math

# Rescale the cosine similarities linearly so the smallest maps to 0 and the
# largest to 5, then write one value per line.
#
# NaN similarities are expected whenever a sentence has no in-vocabulary
# words: avg_word_vectors() then returns an all-zero vector, for which the
# cosine is undefined. They are excluded from the min/max (the builtin
# max()/min() give order-dependent results when NaN is present) and written
# as 0.
valid_sims = [s for s in sims if not math.isnan(s)]
sim_max = max(valid_sims) if valid_sims else 0
sim_min = min(valid_sims) if valid_sims else 0
sim_range = sim_max - sim_min  # computed once instead of once per row

scaled_sims = []
for s in sims:
    if math.isnan(s) or sim_range == 0:
        # Undefined similarity, or all similarities identical: nothing to scale.
        scaled_sims.append(0)
    else:
        scaled_sims.append((5 - 0) / sim_range * (s - sim_max) + 5)

with open('word2vec-output.txt', 'w') as fout:
    for value in scaled_sims:
        print(value, file=fout)

In [None]:
# Eyeball the first five rows: scaled similarity, gold label and the row itself.
# NOTE(review): relies on `scaled_sims` already existing in the session —
# confirm where it is defined before running this cell.
for i in range(5):
    print(scaled_sims[i],train_labels[i], train_data.iloc[i,:])

In [None]:
def PCA_model(samples, n_components=5):
    """
    Alternative to word2Vec for data vectorization.

    Builds and fits a pipeline of TF-IDF vectorisation (English stop words
    removed), truncated SVD and row normalisation.

    Args:
        samples: iterable of raw text documents to fit on.
        n_components: number of SVD components to keep (default 5).

    Returns:
        The fitted scikit-learn pipeline; call ``transform`` on it to
        vectorise documents.
    """
    # Imported here because the notebook's import cell does not bring these
    # names into scope; without them the function fails with NameError.
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import Normalizer

    vectorizer = TfidfVectorizer(stop_words='english')
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    pipeline = make_pipeline(vectorizer, svd, Normalizer(copy=False))
    # Named `pipeline` rather than `model` to avoid shadowing the global
    # Word2Vec model used elsewhere in the notebook.
    return pipeline.fit(samples)