#### head -1 data_combined_by_year/2016-train.csv > output.csv
#### tail -q -n +2 data_combined_by_year/*.csv >> output.csv

In [126]:
import pandas as pd
import numpy as np
import nltk
import re
import string
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import warnings
from gensim.models import Word2Vec
from sklearn.metrics import matthews_corrcoef

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import SGDClassifier
# Punkt sentence tokenizer for English, loaded from the NLTK data directory;
# it is the `tokenizer` argument passed to text_to_sentence() further down.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from scipy import spatial


In [120]:
# Load the training and development splits from the working directory.
# index_col=False tells pandas not to use the first column as the row index.
train = pd.read_csv('train.csv',index_col=False)
dev = pd.read_csv('dev.csv',index_col=False)

### cosine similarity between word2vec vectors

In [121]:
# Separate the target column ('similarity_score') from the remaining columns.
# The dev split plays the role of the test set in this notebook.
train_labels = train['similarity_score']
train_data = train.drop('similarity_score',axis=1)
test_labels = dev['similarity_score']
test_data = dev.drop('similarity_score',axis=1)
# Dimensionality of the word vectors; used both when training Word2Vec and
# when averaging word vectors in buildWordVector().
dimsize=300

In [122]:
def normalize_text(text):
    """Lower-case *text* and set punctuation apart from the words around it.

    The HTML line break ``<br />`` is turned into a single space, and every
    occurrence of ``. " , ( ) ! ? ; :`` is wrapped in one space on each side.
    Consecutive spaces produced this way are left as they are.
    """
    # One translation table does the same job as a chain of str.replace calls.
    padding = {ord(mark): ' ' + mark + ' ' for mark in '.",()!?;:'}
    lowered = text.lower().replace('<br />', ' ')
    return lowered.translate(padding)

def tokenize(inputText, remove_stopwords=True,tagged=False, lemmatize=False):
    """Split *inputText* into lower-cased alphanumeric tokens.

    Args:
        inputText: Text to tokenize (a single sentence in this notebook).
        remove_stopwords: When true, drop NLTK's English stop words.
        tagged: When true, POS-tag the tokens with ``nltk.pos_tag`` and keep
            only those tagged ``NN``, ``NNS`` or ``NNP``.
        lemmatize: When true and *tagged* is also true, return the WordNet
            lemma of each kept token. Has no effect when *tagged* is false.

    Returns:
        A list of token strings, possibly empty.

    Note:
        Previously ``remove_stopwords=False`` returned immediately, so
        *tagged* and *lemmatize* were silently ignored for that case. The
        three options are now independent of one another.
    """
    # Every character that is not an ASCII letter or digit acts as a separator.
    words = re.sub(r"[^a-zA-Z0-9]", " ", inputText).lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    if not tagged:
        return words

    nouns = [
        word for word, tag in nltk.pos_tag(words)
        if tag in ("NN", "NNS", "NNP")
    ]
    if not lemmatize:
        return nouns

    lem = WordNetLemmatizer()
    return [lem.lemmatize(word) for word in nouns]
        
def text_to_sentence(inputText,tokenizer,remove_stopwords=True,tagged=False,lemmatize=False):
    """Break *inputText* into sentences and tokenize each one.

    *inputText* is converted with ``str()`` and stripped, then split into
    sentences by ``tokenizer.tokenize``. Each non-empty sentence is handed to
    ``tokenize`` together with the three option flags. Sentences that yield
    no tokens are dropped.

    Returns:
        A list with one token list per surviving sentence.
    """
    pieces = tokenizer.tokenize(str(inputText).strip())
    token_lists = (
        tokenize(piece, remove_stopwords, tagged, lemmatize)
        for piece in pieces
        if len(piece) > 0
    )
    return [tokens for tokens in token_lists if len(tokens) > 0]

# Build one vector per text by averaging the vectors of all its words that the
# model knows about.
def buildWordVector(text, size, model):
    """Return the mean word vector of *text* as a ``(1, size)`` array.

    Args:
        text: Iterable of word tokens.
        size: Dimensionality of the word vectors stored in *model*.
        model: Mapping-like object where ``model[word]`` yields an array of
            *size* values and raises ``KeyError`` for unknown words.

    Returns:
        A ``(1, size)`` float array. Words missing from *model* are skipped;
        if no word is found the result is all zeros.
    """
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            word_vec = model[word].reshape(1, size)
        except KeyError:
            # Out-of-vocabulary word: leave it out of the average.
            continue
        # The original added an undefined name (`temp`) here instead of the
        # vector that had just been looked up.
        vec += word_vec
        count += 1.
    if count != 0:
        vec /= count
    return vec

In [57]:
# Smoke test of the preprocessing chain on a hand-written example:
# normalize, split into sentences, then tokenize with stop-word removal,
# noun-only POS filtering and lemmatization all switched on.
sample = "Hello, My name is Tom. I am 20 years old with a love for books and food!!!"
sample= normalize_text(sample)
print(text_to_sentence(sample,tokenizer,True,True,True))

[['hello', 'name', 'tom'], ['year', 'book', 'food']]


In [124]:
# text_to_sentence() returns one token list per sentence; the comprehension
# flattens those into a single token list per row. Default options are used
# (stop words removed, no POS filtering, no lemmatization), and the text is
# not passed through normalize_text() first.
train_data['cleaned_S1'] = train_data['sentence1'].apply(lambda row: [val for sublist in text_to_sentence(row,tokenizer) for val in sublist])
train_data['cleaned_S2'] = train_data['sentence2'].apply(lambda row: [val for sublist in text_to_sentence(row,tokenizer) for val in sublist])

In [125]:
# Both columns hold Python lists, so `+` should concatenate row by row: each
# corpus entry is the tokens of sentence1 followed by those of sentence2.
# NOTE(review): relies on element-wise `+` for object-dtype Series - confirm.
corpus = train_data['cleaned_S1']+train_data['cleaned_S2']

In [60]:
# Alternative kept for reference: load the pre-trained GoogleNews vectors
# instead of training on this corpus.
#model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# Train Word2Vec on the training sentences themselves. min_count=5 drops words
# seen fewer than five times, which buildWordVector() then skips as unknown.
# NOTE(review): `size=` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4) - confirm the installed version before re-running.
model = Word2Vec(corpus, size=dimsize, window=5, min_count=5, workers=4)

In [61]:
# buildWordVector() returns a (1, dimsize) array per sentence, so concatenating
# them stacks one row per training example for each sentence column.
train_S1=np.concatenate([buildWordVector(w,dimsize,model) for w in train_data['cleaned_S1']])
train_S2=np.concatenate([buildWordVector(w,dimsize,model) for w in train_data['cleaned_S2']])

In [62]:
# Cosine similarity (1 - cosine distance) between the two sentence vectors of
# every training pair, in row order.
sims = [
    1 - spatial.distance.cosine(left, right)
    for left, right in zip(train_S1, train_S2)
]

In [73]:
import math

# Rescale the similarities linearly so the smallest maps to 0 and the largest
# to 5, then write them as a one-column CSV with a header row.
#
# NaN similarities are written as 0. Presumably they come from sentence pairs
# where one side has no in-vocabulary word (buildWordVector() then returns an
# all-zero vector) - TODO confirm.
#
# The extremes are computed once, over the non-NaN values only: the builtin
# max()/min() give order-dependent results when NaNs are present, and
# recomputing them for every row made the loop quadratic.
finite_sims = [s for s in sims if not math.isnan(s)]
max_sim = max(finite_sims) if finite_sims else float('nan')
min_sim = min(finite_sims) if finite_sims else float('nan')
span = max_sim - min_sim

with open('word2vec-feature-train.csv','w') as fout:
    print("word2vec_feature", file=fout)
    for s in sims:
        if math.isnan(s) or math.isnan(span) or span == 0:
            # Undefined similarity, or no spread to scale by.
            temp3 = 0
        else:
            temp3 = (5-0)/span*(s-max_sim)+5
        print(temp3, file=fout)

In [132]:
# Print the first five gold similarity scores as a quick sanity check.
# Indexing is by label, so this relies on the default 0..n-1 row index.
for i in range(5):
    print(train_labels[i])

4.6
2.0
2.2
1.6
4.2


In [106]:
from sklearn.pipeline import make_pipeline
def PCA_model(samples, n_components=5, random_state=42):
    """Fit a TF-IDF + truncated-SVD pipeline as an alternative to Word2Vec.

    Despite the name, this is not a centred PCA: the documents are turned
    into TF-IDF vectors, reduced with ``TruncatedSVD`` and then normalized
    with ``Normalizer`` (L2 norm by default).

    Args:
        samples: Iterable of raw document strings to fit on.
        n_components: Number of SVD components to keep. Defaults to 5, the
            value that used to be hard-coded.
        random_state: Seed for the SVD solver. Defaults to 42, as before.

    Returns:
        The fitted scikit-learn ``Pipeline``; call ``.transform(texts)`` on
        it to vectorize documents.
    """
    pipeline = make_pipeline(
        TfidfVectorizer(),
        TruncatedSVD(n_components=n_components, random_state=random_state),
        Normalizer(copy=False),
    )
    return pipeline.fit(samples)

In [127]:
# Re-join each token list into a space-separated string, the input format
# expected by the TF-IDF vectorizer inside PCA_model().
items = [" ".join(tokens) for tokens in corpus]

In [130]:
# Fit the TF-IDF / SVD pipeline on the joined training sentences.
model2 = PCA_model(items)

In [131]:
# Display the fitted pipeline and its hyper-parameters.
model2

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...s=5, n_iter=5,
       random_state=42, tol=0.0)), ('normalizer', Normalizer(copy=False, norm='l2'))])