In [16]:
from mongo import Redditdb
import dataset
from pprint import pprint
import inspect
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

def createDataframe():
    """Build the user-by-subreddit membership matrix from the database.

    Every record returned by ``Redditdb().allUsers()`` contributes one row,
    labelled with its ``'username'``. Each entry of the record's
    ``'subreddits'`` becomes a column holding 1 for that user; all
    remaining cells are filled with 0.

    Returns:
        pandas.DataFrame indexed by username with one column per subreddit.
    """
    membership = {
        record['username']: dict.fromkeys(record['subreddits'], 1)
        for record in Redditdb().allUsers()
    }
    # orient='index' turns the outer keys (usernames) into row labels.
    return pd.DataFrame.from_dict(membership, orient='index').fillna(0)

def findNeighbors(df, username, n_neighbors=10):
    """Find the users closest to ``username`` by cosine distance.

    Args:
        df: DataFrame with one row per user (index = usernames) and numeric
            feature columns.
        username: Index label of the user to query; must be present in ``df``.
        n_neighbors: Number of points requested from the neighbour search.
            The query user counts as one of them, so at most
            ``n_neighbors - 1`` other users are returned. Capped at
            ``len(df)`` so small frames do not make the search fail.

    Returns:
        Tuple ``(names, weights)``: a pandas Index with the neighbouring
        usernames (nearest first, never containing ``username``) and a numpy
        array with the matching weights ``1 / distance``.

    Raises:
        KeyError: If ``username`` is not in ``df.index``.
    """
    k = min(n_neighbors, len(df))
    neigh = NearestNeighbors(n_neighbors=k, metric='cosine')
    neigh.fit(df)
    dist, ind = neigh.kneighbors(df.loc[username].values.reshape(1, -1))

    names = df.index[ind[0]]
    dist = dist[0]

    # Remove the query user by label, not by position: when another user is
    # at distance 0 as well, the query user is not guaranteed to come first.
    keep = names != username
    if keep.all():
        # The query user was pushed out of the result by ties; drop the
        # farthest hit instead so the number of neighbours stays the same.
        keep[-1] = False
    names, dist = names[keep], dist[keep]

    # A distance of exactly 0 would give an infinite weight; clamp it so the
    # weights stay finite while still ranking that neighbour highest.
    return names, 1 / dist.clip(min=1e-12)

def transform_df(df, n_components=10):
    """Reduce the feature columns of ``df`` with PCA, keeping its row labels.

    Args:
        df: Numeric DataFrame with one row per sample.
        n_components: Number of principal components to keep. An integer
            value is capped at ``min(df.shape)``, the largest value the full
            SVD solver accepts, so small frames do not raise.

    Returns:
        pandas.DataFrame with the same index as ``df`` and one column per
        retained component (columns are numbered from 0).
    """
    if isinstance(n_components, int):
        n_components = min(n_components, *df.shape)
    pca = PCA(n_components=n_components, svd_solver='full')
    reduced = pca.fit_transform(df)
    # fit_transform returns a bare array; reattach the row labels.
    return pd.DataFrame(reduced, index=df.index)

def getRecommendedSubreddit(df, names, similarities, username):
    """Rank subreddits for ``username`` from the memberships of similar users.

    Each neighbour's row of ``df`` is multiplied by that neighbour's weight
    and the weighted rows are summed per subreddit. Subreddits with a
    positive score that ``username`` does not already have (value > 0 in
    their own row) are returned, best score first.

    Args:
        df: User-by-subreddit DataFrame (index = usernames).
        names: Iterable of neighbour usernames; each must be in ``df.index``.
        similarities: Iterable of weights, paired positionally with ``names``.
            As with ``zip``, surplus entries on either side are ignored.
        username: The user to recommend for; must be in ``df.index``.

    Returns:
        list of subreddit names (column labels of ``df``) ordered by
        descending score; ties keep the column order of ``df``. Empty when
        nothing qualifies.

    Raises:
        KeyError: If ``username`` or any neighbour name is missing from ``df``.
    """
    pairs = list(zip(names, similarities))
    if not pairs:
        return []
    neighbor_names = [name for name, _ in pairs]
    weights = [float(weight) for _, weight in pairs]

    # One vectorized weighted sum instead of growing a dtype-less empty
    # Series row by row (deprecated, and object dtype on recent pandas).
    scores = df.loc[neighbor_names].mul(weights, axis=0).sum()
    scores = scores[scores > 0.0]
    # Stable sort so equal scores come out in a deterministic order.
    scores = scores.sort_values(ascending=False, kind="mergesort")

    own = df.loc[username]
    subscribed = set(own[own > 0.0].index)
    return [sub for sub in scores.index if sub not in subscribed]






In [17]:
#username = input()  # disabled alternative: read the username interactively
# Use the 'username' field of the record returned by Redditdb().getUser().
username = Redditdb().getUser()['username']
# NOTE(review): presumably fetches/stores this user's comment activity so the
# user shows up in the database read below — confirm in dataset.getComments.
dataset.getComments(username)

# One row per user, one column per subreddit: 1 marks membership, 0 otherwise.
df = createDataframe()

In [18]:
# Preview the first rows of the user-by-subreddit membership matrix.
df.head()

Unnamed: 0,PewdiepieSubmissions,Minecraft,entitledparents,memes,GrandTheftAutoV,gtaonline,cursedcomments,Wolfenstein,ToiletPaperUSA,dankmemes,...,guitarpedals,synthesizers,dreampop,shoegaze,diypedals,GuitarAmps,Bedbugs,modular,glasses,synthesizercirclejerk
-Yes-Sir-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1QUEEN12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1iopen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2003Slobra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2510EA,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# PCA-reduced copy of the membership matrix (same usernames as row labels);
# the neighbour search below runs on this compact representation.
df2 = transform_df(df)
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
-Yes-Sir-,-0.590303,-0.075641,-0.096686,0.023442,-0.154037,-0.027941,-0.031285,0.223193,0.295544,0.377013
1QUEEN12,1.330965,-1.121789,0.308707,0.11062,-0.0677,0.177984,0.125906,-0.35345,0.797143,-0.308894
1iopen,1.877512,-1.136717,0.192306,-0.006869,-0.353502,-0.303375,-0.004259,0.45729,0.150513,0.152242
2003Slobra,-0.903909,-0.165171,0.032738,-0.078973,0.058808,-0.022944,-0.131464,-0.059234,-0.03331,-0.134361
2510EA,-0.296304,1.203761,-0.387954,0.132807,0.049805,1.105133,-0.250573,-0.737542,-0.854322,0.084089


In [None]:
# Neighbours are searched in the PCA-reduced space (df2), but the scores are
# computed on the original membership matrix (df) so they map to subreddits.
names, sim = findNeighbors(df2, username)
rec = getRecommendedSubreddit(df, names, sim, username)
# The list is empty when the neighbours add no subreddit the user lacks;
# indexing rec[0] unguarded would raise IndexError in that case.
if rec:
    print("Recommended subreddit for user {} is {}".format(username, rec[0]))
else:
    print("No recommendation available for user {}".format(username))

Recommended subreddit for user OpticRocky is Marvel
