# Pruebas Neo4j

In [1]:
from neo4j import GraphDatabase as GD
import pandas as pd
import numpy as np

In [2]:
uri = 'bolt://localhost:7687'
user = 'neo4j'
psw = 'password'

### Primera prueba

In [14]:
d = GD.driver(uri, auth=(user, psw))

In [15]:
def get_pod(tx, cat):
    pods = []
    result = tx.run("MATCH (c:Category)<--(p) "
                   "WHERE c.name = $cat "
                   "RETURN p.id AS pod", cat=cat)
    for r in result:
        pods.append(r['pod'])
    return pods

In [16]:
with d.session() as sess:
    pod = sess.read_transaction(get_pod, "news")
    for p in pod:
        print(p)

e89a2cc24f1fa1138a56a532ce68ab4a
f5fce0325ac6a4bf5e191d6608b95797
f42e0606b820b99354ab58d423598493
e8be02e7d7281ba3edb77da6a9fe95cd
ebb2a803a79ea0b5abdefb92d1c816f5
fc32aac8fe942ed89feae0e164185912
c4eafea85540de95544b73777e19c696
b9a5920d121ff4e239e6ca3b210e65e7
a343c93505bffeaf20564bd2977b4743
f1a7cabd6127108563a0e38ae044b88a
fa7162cfb3963c4d51b67094a51740d6
c684ea6c24f8a6a16c18edd37f2aea41
fa3cac03e05c1e9ec792e95e4850d7ce


### Segunda Prueba
Generar una clase

In [61]:
class App:
    def __init__(self, uri, auth):
        self.driver = GD.driver(uri, auth = auth)
        
    def close(self):
        self.driver.close()
        
    def get_cat_pod(self, user_id):
        with self.driver.session() as sess:
            result = sess.write_transaction(
                self._get_cat_pod, user_id)
            return result
        
    @staticmethod
    def _get_cat_pod(tx, user_id):
        query = (
            "MATCH (u:User)-[r]->(Podcast)-->(Category)<--(p:Podcast) "
            "WHERE u.id = $user_id "
            "RETURN r, p "
        )
        result = tx.run(query, user_id=user_id)
        return pd.DataFrame([{'cum_rating':record['r']['rating'], 
                              'podcast_id':record['p']['id']}
                    for record in result])

In [62]:
app = App(uri, (user, psw))

In [63]:
df = app.get_cat_pod("F5B4FD0698E670E")
df.head()

Unnamed: 0,cum_rating,podcast_id
0,5,bead83f2330788f9e629cce951e1df99
1,5,cc451ec20dfe8f985c246cc71749917d
2,5,b16332f51b9746dc5a97c7ce37644fc6
3,5,c1adb6ca5ca39575420fda03c099b037
4,5,cc451ec20dfe8f985c246cc71749917d


In [64]:
df.groupby('podcast_id')['cum_rating'].sum().reset_index()

Unnamed: 0,podcast_id,cum_rating
0,a3a535f66c7e8004e7dc54c2b2829a9e,5
1,b16332f51b9746dc5a97c7ce37644fc6,55
2,bead83f2330788f9e629cce951e1df99,55
3,c1adb6ca5ca39575420fda03c099b037,55
4,c2adb71e6b51e1397b49e216e07f3cef,55
5,cc451ec20dfe8f985c246cc71749917d,555
6,ee41bdc529ac60b6cd3b9a50413b0dee,5
7,f335df99185d1401ada492230b42514a,5
8,fa3cac03e05c1e9ec792e95e4850d7ce,5


## Generar atributos para entrenar modelo

In [6]:
class GenAtr:
    def __init__(self, uri, auth):
        self.driver = GD.driver(uri, auth = auth)
        
    def close(self):
        self.driver.close()
        
    
    def gen_train_data(self, df):
        c_names = ['cat_based', 'cat_cnt', 'user_based', 'user_cnt',
                   'adamic_adar', 'resource_allocation', 'link_cnt']
        df[c_names] = df.apply(self.gen_train_data_row, axis=1, result_type='expand')
        df['cat_avg'] = df['cat_based'] / df['cat_cnt']
        df['user_avg'] = df['user_based'] / df['user_cnt']
        df['adar_avg'] = df['adamic_adar'] / df['link_cnt']
        df['ra_avg'] = df['resource_allocation'] / df['link_cnt']
        df.fillna(0)
        return df 
    
    def gen_train_data_row(self, row):
        u_id, p_id, rtg = row['user_id'], row['podcast_id'], row['rating']
        self.delete_rtg(u_id, p_id)
        
        result = self.get_cat_based(u_id, p_id)
        result += self.get_user_based(u_id, p_id)
        result += self.adamic_adar(u_id, p_id)
        result += self.resource_allocation(u_id, p_id)
        
        self.create_rtg(u_id, p_id, rtg)
        
        return result
        
    def delete_rtg(self, user_id, podcast_id):
        with self.driver.session() as sess:
            sess.write_transaction(
                self._delete_rtg, user_id, podcast_id)
    
    @staticmethod
    def _delete_rtg(tx, user_id, podcast_id):
        query = (
            "MATCH (u:User)-[r]->(p:Podcast) "
            "WHERE u.id = $user_id AND p.id = $podcast_id "
            "DELETE r"
        )
        tx.run(query, user_id=user_id, podcast_id=podcast_id)

        
    def create_rtg(self, user_id, podcast_id, rating):
        with self.driver.session() as sess:
            sess.write_transaction(
                self._create_rtg, user_id, podcast_id, rating)
    
    @staticmethod
    def _create_rtg(tx, user_id, podcast_id, rating):
        query = (
            "MATCH (u:User) MATCH (p:Podcast) "
            "WHERE u.id = $user_id AND p.id = $podcast_id "
            "MERGE (u)-[r:Rating{rating:toInteger($rating)}]->(p) "
        )
        tx.run(query, user_id=user_id, podcast_id=podcast_id, rating=rating)
        
    def get_cat_based(self, user_id, podcast_id):
        with self.driver.session() as sess:
            result = sess.write_transaction(
                self._get_cat_based, user_id, podcast_id)
            return result
        
    @staticmethod
    def _get_cat_based(tx, user_id, podcast_id):
        query = (
            "MATCH (u:User)-[r]->(Podcast)-->(Category)<--(p:Podcast) "
            "WHERE u.id = $user_id AND p.id = $podcast_id "
            "RETURN r"
        )
        result = tx.run(query, user_id=user_id, podcast_id=podcast_id)
        total = 0
        cnt = 0
        for rec in result:
            total += rec['r']['rating']
            cnt += 1
        return [total, cnt]
    
    def get_user_based(self, user_id, podcast_id):
        with self.driver.session() as sess:
            result = sess.write_transaction(
                self._get_user_based, user_id, podcast_id)
            return result
        
    @staticmethod
    def _get_user_based(tx, user_id, podcast_id):
        query = (
            "MATCH (u:User)-[r1]->(Podcast)<-[r2]->(User)-[r3]->(p:Podcast) "
            "WHERE u.id = $user_id AND p.id = $podcast_id "
            "RETURN r1.rating + r2.rating + r3.rating "
            "AS total"
        )
        result = tx.run(query, user_id=user_id, podcast_id=podcast_id)
        total = 0
        cnt = 0
        for rec in result:
            total += rec['total']
            cnt += 1
        return [total, cnt]
    
    def adamic_adar(self, user_id, podcast_id):
        with self.driver.session() as sess:
            result = sess.write_transaction(
                self._adamic_adar, user_id, podcast_id)
            return result
        
    @staticmethod
    def _adamic_adar(tx, user_id, podcast_id):
        query = (
            "MATCH (u:User)-[r]->(p1:Podcast) MATCH (p:Podcast)"
            "WHERE u.id = $user_id AND p.id = $podcast_id "
            "RETURN r.rating * gds.alpha.linkprediction.adamicAdar(p1, p) "
            "AS score "
        )
        result = tx.run(query, user_id=user_id, podcast_id=podcast_id)
        total = 0
        for rec in result:
            total += rec['score']
        return [total]
    
    def resource_allocation(self, user_id, podcast_id):
        with self.driver.session() as sess:
            result = sess.write_transaction(
                self._resource_allocation, user_id, podcast_id)
            return result
        
    @staticmethod
    def _resource_allocation(tx, user_id, podcast_id):
        query = (
            "MATCH (u:User)-[r]->(p1:Podcast) MATCH (p:Podcast)"
            "WHERE u.id = $user_id AND p.id = $podcast_id "
            "RETURN r.rating * gds.alpha.linkprediction.resourceAllocation(p1, p) "
            "AS score "
        )
        result = tx.run(query, user_id=user_id, podcast_id=podcast_id)
        total = 0
        cnt = 0
        for rec in result:
            total += rec['score']
            cnt += 1
        return [total, cnt]

In [7]:
gen = GenAtr(uri, (user, psw))

Generación de atributos

In [8]:
rtg = pd.read_csv('data/ratings_sample.csv')
rtg.head()

Unnamed: 0,podcast_id,rating,user_id,liked
0,d34629baca14d87860276b12a6cb7a3b,1,949410AE9D72DA4,0
1,d34629baca14d87860276b12a6cb7a3b,5,F9939C4F2B7BCA4,1
2,a3a535f66c7e8004e7dc54c2b2829a9e,5,96A3F0358E7808D,1
3,a6cc4101cb004140936103e2ffbf8451,5,96A3F0358E7808D,1
4,a6cc4101cb004140936103e2ffbf8451,5,F5B4FD0698E670E,1


In [13]:
data = gen.gen_train_data(rtg.copy())
data.describe()

Unnamed: 0,rating,liked,cat_based,cat_cnt,user_based,user_cnt,adamic_adar,resource_allocation,link_cnt,cat_avg,user_avg,adar_avg,ra_avg
count,620.0,620.0,620.0,620.0,620.0,620.0,620.0,620.0,620.0,410.0,353.0,469.0,469.0
mean,3.345161,0.590323,10.4,2.401613,21.616129,1.864516,7.751768,1.793092,5.35,4.1482,10.029525,1.600547,0.370912
std,1.923469,0.492171,13.202952,2.7216,39.220899,2.829746,10.483605,2.624007,4.810975,1.392738,3.128532,1.288044,0.324119
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.75,7.5,0.670716,0.116883
50%,5.0,1.0,5.0,1.0,7.0,1.0,4.256634,0.944416,5.0,5.0,10.666667,1.465282,0.291209
75%,5.0,1.0,16.0,6.0,18.0,2.0,9.422815,2.051282,11.0,5.0,11.0,2.522747,0.560133
max,5.0,1.0,85.0,17.0,270.0,18.0,78.028664,20.546037,18.0,5.0,15.0,7.822715,1.616242


In [38]:
from sklearn.model_selection import train_test_split
c_names = ['cat_based', 'cat_cnt', 'user_based', 'user_cnt',
                   'adamic_adar', 'resource_allocation', 'link_cnt']
x = data[c_names]
y = data['liked']
x_train, x2, y_train, y2 = train_test_split(x, y, test_size=0.2)
x_val, x_test, y_val, y_test = train_test_split(x2, y2, test_size=0.5)
len(y_train), len(y_val), len(y_test)

(496, 62, 62)

In [39]:
y_train.mean(), y_val.mean(), y_test.mean()

(0.592741935483871, 0.5806451612903226, 0.5806451612903226)

In [40]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression()

In [41]:
y_pred = lr.predict(x_val)
y_pred

array([1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1])

In [42]:
y_val.values

array([1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1])

In [43]:
df = pd.DataFrame({'y_val':y_val.values, 'y_pred':y_pred})
df['right'] = df['y_val'] == df['y_pred']
df.sum() / len(df)

y_val     0.580645
y_pred    0.548387
right     0.677419
dtype: float64

In [44]:
pred = lr.predict(x_test)
df = pd.DataFrame({'y_test':y_test.values, 'y_pred':pred})
df['right'] = df['y_test'] == df['y_pred']
df.sum() / len(df)

y_test    0.580645
y_pred    0.580645
right     0.741935
dtype: float64