## Load packages and import data

In [3]:
# Standard library
import csv

# Numerical stack
import numpy
import scipy
import scipy.sparse

# NLTK word normalisers
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer

# One shared instance of each normaliser; the preprocessing cells below pick
# one of them to transform the question text.
porter = PorterStemmer()
lancaster = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
# Read dev.csv; the with-block closes the handle even if reading raises.
with open('dev.csv', encoding= "utf8") as file:
    # Every line except the first one (presumably the CSV header row).
    data = file.readlines()[1:]
    # readlines() leaves the stream at end-of-file, so rewind before read();
    # without the seek, df would always be the empty string.
    file.seek(0)
    df = file.read()

## Preprocess data

In [4]:
def string_cleaning(text):
    """Normalise ``text`` for tokenisation.

    The text is lower-cased, then every character that is not an ASCII
    letter (``a``-``z``) or an ASCII digit (``0``-``9``) is replaced by a
    single space. Kept characters stay in their original order.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    def keep_or_blank(ch):
        # ASCII lower-case letters and digits pass through; anything else
        # (punctuation, whitespace, accented letters, ...) becomes a space.
        is_letter = 'a' <= ch <= 'z'
        is_digit = '0' <= ch <= '9'
        return ch if (is_letter or is_digit) else " "

    return "".join(keep_or_blank(ch) for ch in text.lower())

# Split every data line into its three fixed-position fields:
#   characters 0-4   -> ID
#   last 5 characters -> paraID
#   everything between (minus one separator character on each side) -> question
ID = []
paraID = []
question = []
for line in data:
    # Drop the line terminator explicitly so the final line is parsed the
    # same way whether or not the file ends with a newline.
    row = line.rstrip("\n")
    if not row.strip():
        continue  # ignore blank lines instead of failing on int('')
    ID.append(int(row[0:5]))
    # paraID is stored as int so it can be compared with the predictions later on.
    paraID.append(int(row[-5:]))
    question.append(row[6:-6])

# Cleaned text of every question, in file order; this is the input for the TF-IDF step.
cleanText = [string_cleaning(raw_question) for raw_question in question]


In [None]:
## Run only one of the next 4 cells (without lemmatization - lemmatization - Porter - Lancaster)

In [None]:
## Baseline: no lemmatization/stemming, stopword removal only -> 0.9159
# Removing these stopwords did not improve the accuracy.
stopwords = ['a', 'and', 'the', 'is', 'in', 'on', 'at','what', 'why', 'how']

# Re-join each question from its whitespace-separated tokens, leaving out the stopwords.
cleanText2 = [
    " ".join(token for token in text.split() if token not in stopwords)
    for text in cleanText
]

# First five questions before and after filtering.
print(cleanText[:5])
print()
print(cleanText2[:5])

In [None]:
## Lemmatization (WordNet) -> 0.9382

# Lemmatize every whitespace-separated token and re-join with single spaces.
cleanText2 = [
    " ".join(wordnet_lemmatizer.lemmatize(token) for token in text.split())
    for text in cleanText
]

# First five questions before and after lemmatization.
print(cleanText[:5])
print()
print(cleanText2[:5])

In [5]:
## PorterStemmer -> 0.9533

# Stem every whitespace-separated token and re-join with single spaces.
cleanText2 = [
    " ".join(porter.stem(token) for token in text.split())
    for text in cleanText
]

# First five questions before and after stemming.
print(cleanText[:5])
print()
print(cleanText2[:5])

['what is the value of pi ', 'how can i attract hyderabad boys ', 'why do people get mad ', 'does fasting improve brain function ', 'what is your opinion on the venus project ']

['what is the valu of pi', 'how can i attract hyderabad boy', 'whi do peopl get mad', 'doe fast improv brain function', 'what is your opinion on the venu project']


In [4]:
## LancasterStemmer -> 0.9502

# Stem every whitespace-separated token and re-join with single spaces.
cleanText2 = [
    " ".join(lancaster.stem(token) for token in text.split())
    for text in cleanText
]

# First five questions before and after stemming.
print(cleanText[:5])
print()
print(cleanText2[:5])

['what is the value of pi ', 'how can i attract hyderabad boys ', 'why do people get mad ', 'does fasting improve brain function ', 'what is your opinion on the venus project ']

['what is the valu of pi', 'how can i attract hyderabad boy', 'why do peopl get mad', 'doe fast improv brain funct', 'what is yo opin on the ven project']


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer created with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [7]:
# Fit the vectorizer on the preprocessed questions and build their TF-IDF matrix
# (cleanText2 comes from whichever preprocessing cell was run above).
matrix = vectorizer.fit_transform(cleanText2)

In [8]:
print (matrix.shape)
# matrix is the TF-IDF matrix returned by fit_transform: one row per entry of
# cleanText2 and one column per term found by the vectorizer.
# From it we can compute the cosine similarity between every pair of questions.
# Background reading on cosine similarity for vector space models:
#http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/

(10000, 6311)


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of the TF-IDF matrix.
out = cosine_similarity(matrix, matrix)
# Shown before the diagonal is cleared: each question compared with itself sits on the diagonal.
print("Identity diagonal")
print(out)
# Zero the diagonal in place so a question is never picked as its own best match.
numpy.fill_diagonal(out,0)
print()
print("Diagonal filled with zeros")
print(out)

Identity diagonal
[[1.         0.         0.         ... 0.06000092 0.         0.12126646]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.05740334 0.        ]
 ...
 [0.06000092 0.         0.         ... 1.         0.         0.07319576]
 [0.         0.         0.05740334 ... 0.         1.         0.        ]
 [0.12126646 0.         0.         ... 0.07319576 0.         1.        ]]

Diagonal filled with zeros
[[0.         0.         0.         ... 0.06000092 0.         0.12126646]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.05740334 0.        ]
 ...
 [0.06000092 0.         0.         ... 0.         0.         0.07319576]
 [0.         0.         0.05740334 ... 0.         0.         0.        ]
 [0.12126646 0.         0.         ... 0.07319576 0.         0.        ]]


In [11]:
# Quick inspection of the similarity matrix.
print(out.shape)
print(out.dtype)
print(type(out))
print(type(out[1]))  # indexing a row of out yields another ndarray
# Highest similarity found in each row (computed once; the earlier bare
# out.max(axis=1) statement discarded its result and was removed).
print(out.max(axis=1))
# Column index of that maximum for each row, i.e. the most similar other row.
pred = out.argmax(axis=1)

(10000, 10000)
float64
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[0.67626664 0.95066292 0.81544221 ... 0.81893016 0.88936119 0.89708172]


In [12]:
# Offset that turns a row index of the similarity matrix into a question ID.
# NOTE(review): assumes the row at index k holds the question whose ID is
# 10000 + k — confirm against dev.csv.
ID_OFFSET = 10000

pred_para_id = pred + ID_OFFSET
para_id_np = numpy.array(paraID)

# The accuracy is computed with an explicit loop in accuracy(); the vectorised
# equivalent would be (pred_para_id == para_id_np).mean().
# Plain Python list so the entries can be compared one by one.
pred_id = list(pred_para_id)

def accuracy(pred, para):
    """Return the fraction of positions where ``pred`` and ``para`` agree.

    Parameters
    ----------
    pred : sequence
        Predicted values; its length determines how many positions are compared.
    para : sequence
        Expected values, indexed in parallel with ``pred``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(pred)``. Returns ``0.0``
        when ``pred`` is empty instead of raising ``ZeroDivisionError``.

    Raises
    ------
    IndexError
        Propagated from indexing when ``para`` is shorter than ``pred``.
    """
    total = len(pred)
    if total == 0:
        # No predictions to score; avoid dividing by zero.
        return 0.0
    matches = sum(1 for i in range(total) if pred[i] == para[i])
    return matches / total


In [13]:
# Compare the first five predicted IDs with the first five expected IDs.
print(pred_id[0:5])
print(paraID[0:5])

# Fraction of questions whose predicted ID equals the expected one (displayed as the cell's output).
accuracy(pred_id, paraID)

[17156, 16551, 12493, 15057, 16764]
[16250, 16551, 12493, 15057, 16764]


0.9533