# Data TP

## Import libraries and dataset

In [1]:
import os

import numpy
import pandas

In [2]:
# Load the triples from a space-delimited file; the column names are supplied
# here through ``names``. The path is assembled with os.path.join so the
# notebook also runs on systems whose path separator is not a backslash.
dataset = pandas.read_csv(
    os.path.join('.', 'google', 'data', 'train.txt'),
    delimiter=" ",
    names=["heads", "tails", "relation"],
)

In [3]:
# Summary statistics per column (count / unique / top / freq).
dataset.describe()

Unnamed: 0,heads,tails,relation
count,573,573,573
unique,474,506,14
top,slow,california,capital-world
freq,5,12,116


In [4]:
# (number of triples, number of columns)
dataset.shape

(573, 3)

In [5]:
# Preview the first rows: one (heads, tails, relation) triple per line.
dataset.head()

Unnamed: 0,heads,tails,relation
0,madrid,spain,capital-common-countries
1,ottawa,canada,capital-common-countries
2,cairo,egypt,capital-common-countries
3,paris,france,capital-common-countries
4,bern,switzerland,capital-common-countries


## Load Vectors

In [6]:
def load_vectors(file_path):
    """Read a space-separated embedding file into a ``{token: vector}`` dict.

    Every non-blank line is expected to look like ``token v1 v2 ... vn``: the
    text before the first space is used as the key and the remaining fields
    are parsed as ``numpy.float64``.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    dict
        Maps each token to a 1-D ``numpy.ndarray`` of ``float64``. When a
        token occurs on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If a non-blank line contains no space separator, or if one of the
        fields cannot be converted to a float.
    """
    resultDict = dict()
    with open(file_path) as f:
        # Stream the file line by line. The previous ``read().split("\n")[:-1]``
        # unconditionally discarded the last element, which silently dropped
        # the final vector whenever the file did not end with a newline.
        for raw_line in f:
            raw_line = raw_line.rstrip()
            if not raw_line:
                continue  # tolerate blank lines instead of crashing on them
            token, separator, values = raw_line.partition(" ")
            if not separator:
                raise ValueError("no vector values on line: %r" % raw_line)
            # split() without argument also copes with repeated spaces.
            resultDict[token] = numpy.array(values.split(), dtype=numpy.float64)

    return resultDict

def formatPattern(algo, columns_value, file_name):
    """Build the relative path of one subspace vector file.

    The layout is ``./google/<algo>/subspaces/<columns_value>/<file_name>.txt``.
    The path is assembled with ``os.path.join`` so it uses the separator of
    the running platform; on Windows the result is identical to the former
    hard-coded backslash pattern.

    Parameters
    ----------
    algo : object
        Name of the embedding directory (e.g. ``'gloveCC'``).
    columns_value : object
        Sub-directory name; the notebook passes a relation label.
    file_name : object
        File stem; ``.txt`` is appended.

    Returns
    -------
    str
        The relative file path.
    """
    # '{}'.format(...) keeps accepting non-string arguments like the original
    # str.format based implementation did.
    parts = [
        '.',
        'google',
        '{}'.format(algo),
        'subspaces',
        '{}'.format(columns_value),
        '{}.txt'.format(file_name),
    ]
    return os.path.join(*parts)

In [7]:
# vectors[role][relation] holds the {word: vector} dict loaded from the
# matching subspace file, for role in ("heads", "tails").
vectors = dict()

for file_name in ("heads", "tails"):
    vectors[file_name] = dict()
    for columns_value in set(dataset['relation'].values):
        file_dir = formatPattern('gloveCC', columns_value, file_name)
        vectors[file_name][columns_value] = load_vectors(file_dir)


## Replace values in dataframe

In [8]:
def replace_values(dataframe, vectorsDict):
    """Replace head/tail words by their vectors and drop rows lacking one.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``'heads'``, ``'tails'`` and ``'relation'``.
    vectorsDict : dict
        Nested mapping ``vectorsDict[role][relation][word] -> vector`` with
        ``role`` being ``'heads'`` or ``'tails'``.

    Returns
    -------
    dict
        ``dataframe.to_dict()`` (``{column: {row_label: value}}``) in which the
        ``'heads'`` and ``'tails'`` values are replaced by their vectors. Rows
        for which the head or the tail has no vector are removed from all
        three columns, so the remaining row labels may have gaps.

    Notes
    -----
    A message is printed for every missing vector. ``dataframe`` itself is
    not modified.
    """
    resultdf = dataframe.to_dict()
    # A set, not a list: a row whose head AND tail are both missing used to be
    # queued twice, and the second deletion then raised KeyError.
    to_remove = set()

    # Iterate over the actual row labels so a non-default index also works.
    for indice in list(resultdf['relation']):
        relation = resultdf['relation'][indice]

        for column, label in (('heads', 'head'), ('tails', 'tail')):
            word = resultdf[column][indice]
            try:
                resultdf[column][indice] = vectorsDict[column][relation][word]
            except KeyError:
                # Only a failed lookup means "no vector"; any other error is a
                # real problem and is no longer swallowed.
                print("no vector for (relation:%s, %s:%s) " % (relation, label, word))
                to_remove.add(indice)

    for indice in to_remove:
        for column in ('heads', 'tails', 'relation'):
            del resultdf[column][indice]

    return resultdf


In [9]:
# Swap every head/tail word for its vector; rows without a vector are dropped
# and reported in the printed log.
dataset2 = replace_values(dataset, vectors)

no vector for (relation:capital-world, head:funafuti) 
no vector for (relation:capital-world, head:ashgabat) 
no vector for (relation:capital-world, head:belmopan) 
no vector for (relation:capital-world, head:thimphu) 
no vector for (relation:capital-world, head:niamey) 
no vector for (relation:capital-world, head:nouakchott) 
no vector for (relation:capital-world, head:nuuk) 
no vector for (relation:capital-world, head:podgorica) 
no vector for (relation:capital-world, head:vaduz) 


In [10]:
# replace_values returns a plain dict keyed by column name, not a DataFrame.
dataset2.keys()

['tails', 'heads', 'relation']

In [11]:
# Rebuild a DataFrame from the dict-of-dicts produced by replace_values.
# * list(...) is required because dict.values() is a view object on Python 3.
# * The explicit ``columns`` order (heads, relation, tails) matches the table
#   displayed below and does not depend on how a given pandas version orders
#   dict keys.
dataset2pd = pandas.DataFrame(
    {
        'tails': list(dataset2['tails'].values()),
        'heads': list(dataset2['heads'].values()),
        'relation': list(dataset2['relation'].values()),
    },
    columns=['heads', 'relation', 'tails'],
)

In [12]:
# Row count once the pairs without a vector have been removed.
dataset2pd.shape

(564, 3)

In [13]:
# The heads/tails cells now hold numpy vectors instead of words.
dataset2pd.head()

Unnamed: 0,heads,relation,tails
0,"[0.26001, -0.53344, 0.22119, -0.16826, 0.58323...",capital-common-countries,"[-0.11981, 0.011386, 0.14965, -0.22285, 0.7112..."
1,"[0.14219, 0.01274, -0.5254, -0.60931, 1.263, 0...",capital-common-countries,"[-0.47825, 0.15908, -0.27509, -0.6478, 0.76022..."
2,"[0.56629, -0.20355, -0.093943, -0.45198, 0.301...",capital-common-countries,"[-0.051869, -0.26547, 0.034687, -0.45404, 0.26..."
3,"[0.35213, -0.074228, -0.23725, -0.32726, 0.539...",capital-common-countries,"[-0.16306, 0.45292, -0.14638, -0.64332, 0.7901..."
4,"[0.57031, 0.077812, 0.062999, -0.23876, 0.1227...",capital-common-countries,"[0.11316, 0.16675, -0.28034, -0.18965, 0.61872..."


## Compute the sum of the differences heads_i - tails_i

In [14]:
def minus_of_two_vector(v1, v2):
    """Return ``v1 - v2``, or ``0.0`` when the subtraction fails.

    Parameters
    ----------
    v1, v2 : object
        Operands, normally ``numpy.ndarray`` vectors.

    Returns
    -------
    object
        ``v1 - v2`` when the operation succeeds; otherwise ``0.0`` after
        printing one diagnostic line for each operand that is not a
        ``numpy.ndarray``.
    """
    try:
        return v1 - v2
    except Exception:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        for operand in (v1, v2):
            if not isinstance(operand, numpy.ndarray):
                print("substraction is not applicable with {}, type:{}".format(operand, type(operand)))
        return 0.0


def sum_of_vectors(vs1, vs2):
    """Sum the pairwise differences ``vs1[i] - vs2[i]`` into one vector.

    Parameters
    ----------
    vs1, vs2 : iterable
        Iterables of vectors, paired with ``zip`` (the longer one is
        truncated).

    Returns
    -------
    numpy.ndarray or float
        The element-wise sum of all differences; ``0.0`` for empty input.

    Notes
    -----
    The former ``numpy.sum([...])`` call had no ``axis`` argument, so it added
    up every component of every difference and returned a single scalar.
    Accumulating with ``+`` keeps the vector shape, and the ``0.0``
    placeholders returned by ``minus_of_two_vector`` broadcast harmlessly.
    """
    total = 0.0
    for v1, v2 in zip(vs1, vs2):
        total = total + minus_of_two_vector(v1, v2)
    return total

In [15]:
# Aggregate the head - tail differences over every row of the dataset.
rvector = sum_of_vectors(dataset2pd["heads"],dataset2pd["tails"])

## Split Dataset into Xset and Yset

In [16]:
# Select by column name instead of by position: the positional form
# ``iloc[:, [0, 2]]`` only picked (heads, tails) when the DataFrame columns
# happened to be ordered heads, relation, tails.
xset = dataset2pd[['heads', 'tails']]
yset = dataset2pd[['relation']]

In [17]:
# Features: the head and tail vectors of each row.
xset.head()

Unnamed: 0,heads,tails
0,"[0.26001, -0.53344, 0.22119, -0.16826, 0.58323...","[-0.11981, 0.011386, 0.14965, -0.22285, 0.7112..."
1,"[0.14219, 0.01274, -0.5254, -0.60931, 1.263, 0...","[-0.47825, 0.15908, -0.27509, -0.6478, 0.76022..."
2,"[0.56629, -0.20355, -0.093943, -0.45198, 0.301...","[-0.051869, -0.26547, 0.034687, -0.45404, 0.26..."
3,"[0.35213, -0.074228, -0.23725, -0.32726, 0.539...","[-0.16306, 0.45292, -0.14638, -0.64332, 0.7901..."
4,"[0.57031, 0.077812, 0.062999, -0.23876, 0.1227...","[0.11316, 0.16675, -0.28034, -0.18965, 0.61872..."


In [18]:
# Labels: the relation of each row.
yset.head()

Unnamed: 0,relation
0,capital-common-countries
1,capital-common-countries
2,capital-common-countries
3,capital-common-countries
4,capital-common-countries


## Use a sigmoid function to answer the membership of one RDF tuple

In [19]:
def is_sigmoid(rfd_array, vectors_dict, rvector):
    """Combine the vectors of one ``[head, tail, relation]`` triple with ``rvector``.

    NOTE(review): despite its name this function applies no sigmoid. It
    returns the element-wise product ``(head_vector + rvector) * tail_vector``;
    confirm whether a dot product followed by a logistic function was intended.

    Parameters
    ----------
    rfd_array : sequence
        ``[head, tail, relation]``; index 0 and 1 are looked up as words,
        index 2 selects the relation sub-dictionary.
    vectors_dict : dict
        Nested mapping ``vectors_dict[role][relation][word] -> vector`` with
        ``role`` in ``('heads', 'tails')``.
    rvector : numpy.ndarray or float
        Value added to the head vector before the multiplication.

    Returns
    -------
    numpy.ndarray or int
        ``(head_vector + rvector) * tail_vector``, or ``0`` (after printing a
        message) when the head or the tail has no vector for that relation.
    """
    head, tail, relation = rfd_array[0], rfd_array[1], rfd_array[2]

    # Only a failed dictionary lookup means "no vector"; the former bare
    # ``except`` also hid unrelated errors.
    try:
        rv = vectors_dict["heads"][relation][head]
    except KeyError:
        print("there is no vector equivalance for head %s: %s" % (head, relation))
        return 0

    try:
        fv = vectors_dict["tails"][relation][tail]
    except KeyError:
        print("there is no vector equivalance for tail %s: %s" % (tail, relation))
        return 0

    return (rv + rvector) * fv

In [20]:
# Try the scoring function on the first (heads, tails, relation) row of the
# raw dataset.
rdf = dataset.iloc[0].tolist()
sigmoid_var = is_sigmoid(rdf, vectors, rvector)

### Some checks

In [21]:
# Sanity checks: features and labels must have the same number of rows, and
# every head/tail cell must have the same length (300).
print(xset.shape)
print(yset.shape)
print("there is exactly the same number of rows ? %r" % (xset.shape[0] == yset.shape[0]))

# Collect every distinct vector length found in the two feature columns.
xsetLen = set()
for row_label, row in xset.iterrows():
    xsetLen.update([len(row['heads']), len(row['tails'])])

print("All rows contains 2 entrys (heads,tails) witch are vectors of 300 dim ?: %r" % (xsetLen == {300}))


(564, 2)
(564, 1)
there is exactly the same number of rows ? True
All rows contains 2 entrys (heads,tails) witch are vectors of 300 dim ?: True


## 10 Fold Cross Validation, SVM & PRFS

In [32]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from sklearn.svm import SVC

def _as_feature_matrix(x):
    """Flatten each row of ``x`` into one numeric vector (2-D array overall).

    Cells may be scalars or arrays; the cells of a row are concatenated in
    column order. This is needed because the feature DataFrame built in this
    notebook stores a whole vector in each cell.
    """
    rows = x.values if isinstance(x, pandas.DataFrame) else x
    return numpy.array([
        numpy.concatenate([numpy.ravel(cell) for cell in row]) for row in rows
    ])


def get_f1_from_svm(xtrain, ytrain, xtest, ytest, C):
    """Fit an ``SVC`` on the training split and return the test F1 scores.

    Parameters
    ----------
    xtrain, xtest : pandas.DataFrame or 2-D array-like
        Features; rows are flattened with ``_as_feature_matrix``.
    ytrain, ytest : array-like
        Labels; single-column DataFrames are flattened to 1-D.
    C : float
        Regularisation parameter forwarded to ``SVC``.

    Returns
    -------
    numpy.ndarray
        The F-score entry of ``precision_recall_fscore_support`` called
        without ``average``, i.e. one value per label.
    """
    classifier = SVC(C=C).fit(_as_feature_matrix(xtrain), numpy.ravel(ytrain))
    predictions = classifier.predict(_as_feature_matrix(xtest))
    # The metric returns the tuple (precision, recall, fbeta_score, support);
    # the former ``[:, 0]`` indexing is invalid on a tuple and raised TypeError.
    _, _, f1_scores, _ = precision_recall_fscore_support(numpy.ravel(ytest), predictions)
    return f1_scores

def foreach_C(xtrain,ytrain,xtunning,ytunning,C=[0.1,1,10,100]):
    """Evaluate ``get_f1_from_svm`` once for every candidate value in ``C``.

    Returns a list with one result per entry of ``C``, in the same order.
    The default list is only read, never modified.
    """
    return [
        get_f1_from_svm(xtrain, ytrain, xtunning, ytunning, penalty)
        for penalty in C
    ]

def foreachKfold(xset, yset, kfold=10, shuffle=False, random_state=None):
    """Run ``foreach_C`` on every train/test split of a K-fold partition.

    Parameters
    ----------
    xset, yset : pandas.DataFrame
        Features and labels, sliced positionally with ``.iloc``.
    kfold : int, default 10
        Number of folds.
    shuffle : bool, default False
        Forwarded to ``KFold``. The default keeps the original contiguous
        folds. NOTE(review): the rows previewed in this notebook look grouped
        by relation, in which case contiguous folds may test on labels that
        are rare or absent in training; consider ``shuffle=True``.
    random_state : int or None, default None
        Forwarded to ``KFold``; only meaningful together with ``shuffle=True``.

    Returns
    -------
    list
        One entry per fold, each being the list returned by ``foreach_C``
        (one result per tested ``C`` value).
    """
    kf = KFold(n_splits=kfold, shuffle=shuffle, random_state=random_state)
    return [
        foreach_C(xset.iloc[trainI], yset.iloc[trainI], xset.iloc[testI], yset.iloc[testI])
        for trainI, testI in kf.split(xset)
    ]



In [34]:
## foreachKfold(xset,yset)

In [44]:
def ghash(vector):
    """Return an integer hash of the contents of a numpy array.

    The hash is computed from a bytes copy of the array data. The previous
    implementation hashed ``vector.data`` directly, which raises ValueError on
    Python 3 for non-byte formats such as float64, and it also switched off
    the ``writeable`` flag of the caller's array as a side effect.

    Parameters
    ----------
    vector : numpy.ndarray
        Array to hash; it is left untouched.

    Returns
    -------
    int
        ``hash`` of ``vector.tobytes()``. On Python 3 the hash of ``bytes`` is
        salted per process unless ``PYTHONHASHSEED`` is fixed, so values are
        only comparable within one run.
    """
    return hash(vector.tobytes())

# Same rows as dataset2, with every vector replaced by an integer hash of its
# contents. list(...) is used because dict.values() is a view on Python 3, and
# the explicit ``columns`` keeps the displayed order (heads, relation, tails).
dataset_test = pandas.DataFrame(
    {
        'tails': [ghash(vector) for vector in dataset2['tails'].values()],
        'heads': [ghash(vector) for vector in dataset2['heads'].values()],
        'relation': list(dataset2['relation'].values()),
    },
    columns=['heads', 'relation', 'tails'],
)

# Show a preview only instead of dumping the whole frame into the output.
dataset_test.head(10)

Unnamed: 0,heads,relation,tails
0,314029544,capital-common-countries,-761850682
1,924315510,capital-common-countries,-664130057
2,-1916388128,capital-common-countries,-505161522
3,882212183,capital-common-countries,-1733314066
4,-953757236,capital-common-countries,1580261113
5,686774894,capital-common-countries,-802899467
6,-406138258,capital-common-countries,67130661
7,-2087809711,capital-common-countries,316610479
8,-885438768,capital-common-countries,437731123
9,-1834503339,capital-common-countries,-1230027245
