In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate


In [2]:
reddit_user_data = pd.read_csv('reddit_user_data_count.csv')
subreddit_data = pd.read_csv('subreddit_info.csv')

In [3]:
reddit_user_data.head()

Unnamed: 0,user,subreddit,count
0,------Username------,AskReddit,20
1,------Username------,Barca,9
2,------Username------,FIFA,4
3,------Username------,MMA,5
4,------Username------,RioGrandeValley,3


In [4]:
subreddit_data.head()

Unnamed: 0,subreddit,num_subscribers,over18,public_description
0,ChoosingBeggars,2134849.0,False,"This subreddit is for posting screenshots, pic..."
1,Python,809272.0,False,News about the programming language Python. If...
2,interestingasfuck,8092462.0,False,For anything that is InterestingAsFuck
3,PublicFreakout,3257059.0,False,"A subreddit dedicated to people freaking out, ..."
4,ShitMomGroupsSay,258681.0,False,Share the drama. \n\nEssential oils cure all? ...


In [8]:
reader = Reader()

data = Dataset.load_from_df(reddit_user_data[['user','subreddit','count']],reader)

svd = SVD()

Once the data and model for product recommendation are ready, the model can be evaluated using cross-validation as follows:

In [9]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    48.9749 47.6031 49.0220 47.7506 49.0976 48.4896 0.6664  
MAE (testset)     10.9160 10.7682 10.9426 10.7263 10.8842 10.8475 0.0849  
Fit time          174.26  204.24  215.34  209.60  231.73  207.03  18.80   
Test time         11.02   19.15   12.93   15.92   10.77   13.96   3.18    


{'test_rmse': array([48.97491516, 47.60311093, 49.02199194, 47.75061225, 49.09759397]),
 'test_mae': array([10.91602827, 10.76816833, 10.94256744, 10.72628376, 10.8842233 ]),
 'fit_time': (174.25849509239197,
  204.24384951591492,
  215.33659195899963,
  209.59552097320557,
  231.7269628047943),
 'test_time': (11.020808458328247,
  19.146918296813965,
  12.930643081665039,
  15.917253971099854,
  10.773241758346558)}

Once the model has been evaluated to our satisfaction, then we can re-train the model using the entire training dataset:

In [10]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x159d87c9fd0>

In [21]:
titles = subreddit_data.copy()
titles['Estimate_Score'] = titles['subreddit'].apply(lambda x: svd.predict('Python', x).est)

In [19]:
titles = titles.sort_values(by=['Estimate_Score'], ascending=False)
titles.tail(15)

Unnamed: 0,subreddit,num_subscribers,over18,public_description,Estimate_Score
57840,DiamondCasinoHeist,45.0,False,A community for people who want to play the Di...,5
57843,CupToast,463.0,False,a subreddit dedicated to adorable twitter arti...,5
57854,MakoReizei,1140.0,False,please dont beeb beeb because im sleeb,5
57829,SanAntonioFC,501.0,False,San Antonio FC is a team founded in 2016 that ...,5
57844,KayFanClub,2001.0,False,Welcome to KayFanClub (Our Acronym is KFC.) He...,5
57845,Turtle_Team_GuP,1474.0,False,"Dedicated to \nAnzu Kadotani, Momo and Yuzu - ...",5
57846,Erika_empire,2258.0,False,Hello! Welcome to Erika's Empire. This is a Re...,5
57847,Underrated_GuP,3006.0,False,A place for all of the underrated/unmentioned ...,5
57848,h2odelirious,589.0,False,"Jonathan, also known as H2O Delirious is a You...",5
57849,NishiWarCrimes,633.0,False,This is for fans of the GUP character Nishi an...,5


In [5]:
test = reddit_user_data.merge(subreddit_data)
test

Unnamed: 0,user,subreddit,count,num_subscribers,over18,public_description
0,------Username------,AskReddit,20,32374022.0,False,r/AskReddit is the place to ask and answer tho...
1,----Michel----,AskReddit,6,32374022.0,False,r/AskReddit is the place to ask and answer tho...
2,----petrichor----,AskReddit,31,32374022.0,False,r/AskReddit is the place to ask and answer tho...
3,--Anarchaeopteryx--,AskReddit,1,32374022.0,False,r/AskReddit is the place to ask and answer tho...
4,--Orchid--,AskReddit,84,32374022.0,False,r/AskReddit is the place to ask and answer tho...
...,...,...,...,...,...,...
1646701,zynu,developer,1,10632.0,False,"The home for developers. Post your projects, g..."
1646702,zzreywasol,SubaruOldSchool,2,221.0,False,A place for older Subarus in all their glory. ...
1646703,zzzayah,DylanPierce,3,5.0,False,Dylan Pierce happy sad Brennan Smith scary man
1646704,zzzayah,Zynn,1,1955.0,False,Hello! This is a community all about Zynn! Ple...


In [6]:
#check for null values
total = test.isnull().sum().sort_values(ascending=False)
percent = (test.isnull().sum()/test.isnull().count()).sort_values(ascending = False)
missing_data = pd.concat([total, percent],axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
public_description,0,0.0
over18,0,0.0
num_subscribers,0,0.0
count,0,0.0
subreddit,0,0.0
user,0,0.0


In [11]:
test['public_description'][0]

'r/AskReddit is the place to ask and answer thought-provoking questions.'

Create word vector representation for the public_description. \
The goal if to reduce the importance of words that frequently occur in plot overviews and, therefore, 
their significance in computing the final similarity score
** I may have to deleite the /r from the sentence


In [13]:
# remove all english stop words
tfidf = TfidfVectorizer(stop_words='english')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(test['public_description'])

In [17]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

RuntimeError: nnz of the result is too large

In [None]:
cosine_sim.shape

In [9]:
# # create new collumn (Over18) reddit_user_data 
# for subreddit in reddit_user_data['subreddit']:
#     if subreddit in subreddit_data['subreddit']:
#         reddit_user_data['over18'] = subreddit_data['subreddit']
#     else: reddit_user_data['over18'] = 'NaoTem'


In [None]:
reddit_user_data.head() 