# Collaborative Filtering

## Load data

In [8]:
import data_processing


data_processing.load_resources_from_raw_delta_logs(min_num_interactions=6)
data_processing.load_users(min_num_interactions=10)
users,resources,ratings,num_interactions = data_processing.compute_ratings()

293 users
2995 rated resources from initially 3727 loaded resources
6052 interactions
feedback matrix is 0.6896591019161629% populated


## Fit model

In [11]:

def compute_matrix_factorization():
    
    '''
        Aproximate data_processing_session/ratings file with pyspark framework, using ALS algo
       
        returns (ratings pyspark dataframe,model object,rmse aproximation score)
    '''

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col 
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.sql import Row
    import os
    import sys

    
    # uncomment these in case you have problems with environment variables
    # os.environ['PYSPARK_PYTHON'] = sys.executable
    # os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

    #initialize spark
    import findspark
    findspark.init()

    spark = SparkSession.builder.appName("Collaborative Filtering Flow").getOrCreate()

    #preprocess ratings file into pyspark dataframe
    path = os.path.join('data processing session','extracted_ratings')
    lines = spark.read.text(path).rdd

    parts = lines.map(lambda row: row.value.split(" "))
    ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), postId=int(p[1]),
                                        rating=float(p[2]) ))


    ratings = spark.createDataFrame(ratingsRDD)

    (training, test) = ratings, ratings

    # # Build the recommendation model using ALS on the training data
    # # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="postId", ratingCol="rating",
            coldStartStrategy="drop")
    model = als.fit(training)

    # # Evaluate the model by computing the RMSE on the test (in this case also train) data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))
    return (ratings,model,rmse)

ratingsPysparkDf,model,rmse = compute_matrix_factorization()

Root-mean-square error = 0.018678266517322833


## Extract resources relevant to the latent factors

In [12]:
def extract_baseline_posts(model,cached=True):

    '''
        Extracts and returns baseline posts for solving cold start problem 
         
        based on top posts for every latent factor of the model
        
        or return hardcoded posts
        
    '''

    import data_processing

    #these hardcoded posts are extracted by taking the top 10 posts from every latent score of the model
    #and picking the ones that have a functional link/a novel topic
    titles =[
        "Infidelity should not happen when divorce is possible",
         "The default world lingua franca should be Spanish",
         "Buying clothes or goods from factories in the developing world is moral, eve...",
         "The legal owner of a firearm should be responsible for the weapon and anythi...",
         "Airport security screenings do very little to stop deliberate terrorism such as the attacks of 9/11. They are a show put on to make passengers think something is being done.",
         "Luxury watches are useless in the practical sense",
         "The concept of an omniscient (*) and capable creator is not compatible with ...",
         "If whatever makes your character different (sexual identity/disability etc) ...",
         "Cutlery should be placed at the end of a buffet line",
         "If an animal has a big enough population, hunting of it should be allowed"
]
    
    if cached:
        baseline_posts = []
        for title in titles:
            #print(title)
            id = data_processing.res_id[title]
            #print(id)
            baseline_posts.append(data_processing.get_resource_info(id))
        return baseline_posts
    else:    
        '''After advisor approval, these posts shall be hardcoded '''

        from pyspark.sql.functions import expr

        # Extract item factors
        item_factors = model.itemFactors

        # Find the top 5 highest scores for each latent factor
        num_factors = model.rank  # Number of latent factors
        top_scores_per_factor = []

        for i in range(num_factors):
            factor_col = expr(f"features[{i}]")
            top_rows = (item_factors
                        .select("id", factor_col.alias("factor"))
                        .orderBy("factor", ascending=False)
                        .limit(10)
                        .collect())
            top_scores_per_factor.append((i, [(row.id, row.factor) for row in top_rows]))

        #for the final version, the resources will be predefined, to ensure that the reddit posts links are not deprecated
        baseline_posts = []
        for factor, top_scores in top_scores_per_factor:
            for rank, (post_id, factor_score) in enumerate(top_scores, start=1):
                if rank ==1:
                    baseline_posts.append({'id':post_id,'score':factor_score,'content':data_processing.id_res[post_id]})
                print(f'title:{data_processing.id_res[post_id]["title"]},url:{data_processing.id_res[post_id]["url"]},factor {factor},score:{factor_score}')
            print("-"*100)
        return baseline_posts

baseline_posts = extract_baseline_posts(model,cached=True)

In [13]:
#example of a post object content
print(baseline_posts[0])

{'title': 'Infidelity should not happen when divorce is possible', 'pos_feedback': ['Nitrousoxide72', 'RedditExplorer89'], 'neg_feedback': ['ripcelinedionhusband', 'Melodic_Echidna', 'joopface', 'dublea', 'ripcelinedionhusband', 'WelfareBear', 'JimboMan1234', '', 'muyamable', 'Nephisimian', 'USoverthem'], 'url': 'https://www.reddit.com//r/changemyview/comments/imh6iw/cmv_infidelity_should_not_happen_when_divorce_is/'}


In [14]:
print(len(baseline_posts),baseline_posts)

10 [{'title': 'Infidelity should not happen when divorce is possible', 'pos_feedback': ['Nitrousoxide72', 'RedditExplorer89'], 'neg_feedback': ['ripcelinedionhusband', 'Melodic_Echidna', 'joopface', 'dublea', 'ripcelinedionhusband', 'WelfareBear', 'JimboMan1234', '', 'muyamable', 'Nephisimian', 'USoverthem'], 'url': 'https://www.reddit.com//r/changemyview/comments/imh6iw/cmv_infidelity_should_not_happen_when_divorce_is/'}, {'title': 'The default world lingua franca should be Spanish', 'pos_feedback': ['NicholasLeo', 'BrotherItsInTheDrum'], 'neg_feedback': ['Igor_Furman', 'muyamable', '', 'MontiBurns', '', 'parentheticalobject'], 'url': 'https://www.reddit.com//r/changemyview/comments/hf49v5/cmv_the_default_world_lingua_franca_should_be/'}, {'title': 'Buying clothes or goods from factories in the developing world is moral, eve...', 'pos_feedback': ['mr-logician', 'thedobya'], 'neg_feedback': ['AnythingApplied', 'AnythingApplied', 'MercurianAspirations', 'StellaAthena'], 'url': 'https://

## Collect data for cold start problem

In [16]:
def init_cold_start(baseline_posts,cached=True):

    '''
    Feed user baseline posts and collect feedback in order to compute recommendations
    '''

    import data_processing

    name = input("Please write your name:")
    description = "Dummy description of the task"

    baseline_results =[]

    if cached:
        for res in baseline_posts:
            print(f'Title:{res["title"]}')
            print(f'url: {res["url"]}')
            baseline_score = float(input('How much did this post make you see things trough a new perspective?:'))
            baseline_results.append({'id':data_processing.res_id[res['title']],'user_score':baseline_score})
        return (baseline_results,name)
    else:
        for baseline_post in baseline_posts:
            res = baseline_post["content"]
            print(f'Title:{res["title"]}')
            print(f'url: {res["url"]}')
            baseline_score = float(input('How much did this post make you see things trough a new perspective?:'))
            baseline_results.append({'id':baseline_post['id'],'factor_score':baseline_post['score'],'user_score':baseline_score})
        return (baseline_results,name)


baseline_results,name = init_cold_start(baseline_posts)

Title:Infidelity should not happen when divorce is possible
url: https://www.reddit.com//r/changemyview/comments/imh6iw/cmv_infidelity_should_not_happen_when_divorce_is/
Title:The default world lingua franca should be Spanish
url: https://www.reddit.com//r/changemyview/comments/hf49v5/cmv_the_default_world_lingua_franca_should_be/
Title:Buying clothes or goods from factories in the developing world is moral, eve...
url: https://www.reddit.com//r/changemyview/comments/dwebfc/cmv_buying_clothes_or_goods_from_factories_in_the/
Title:The legal owner of a firearm should be responsible for the weapon and anythi...
url: https://www.reddit.com//r/changemyview/comments/i4f917/cmv_the_legal_owner_of_a_firearm_should_be/
Title:Airport security screenings do very little to stop deliberate terrorism such as the attacks of 9/11. They are a show put on to make passengers think something is being done.
url: https://www.reddit.com/r/changemyview/comments/8a9r3h/cmv_airport_security_screenings_do_very_l

## Fit the model with the new user's data

In [17]:
import os

def add_new_user_ratings(baseline_results):

    '''
        Append user feedback to ratings file
        
    '''
    ratings_path_file = os.path.join('data processing session','extracted_ratings')

    ratings_file = open(ratings_path_file,'r')


    for line in ratings_file:
        last_line = line
    last_line = last_line.split(" ")
    new_user_id = int(last_line[0]) + 1
    print(new_user_id)
    ratings_file.close()
    ratings_file = open(ratings_path_file,'a')

    for result in baseline_results:
        #ratings_file.write()
        res_id = result['id']
        score = result['user_score']
        ratings_file.write(f'{new_user_id} {res_id} {score}\n')
        #print(f'{new_user_id} {res_id} {score}\n')
    ratings_file.close()
    return new_user_id
user_id = add_new_user_ratings(baseline_results)

294


In [19]:
ratingsPysparkDf,model,rmse = compute_matrix_factorization()

Root-mean-square error = 0.018717117442661333


## Make recommendations for the new user based on matrix factorization

### Extract ids of already rated posts by user

In [21]:
def extract_posts_already_rated(ratings,user_id = 294):
    from pyspark.sql.functions import col 
    user_ratings = ratings.filter(col("userId") == 294).select("postId").distinct()
    posts_rated_by_user = set()

    for post_id in user_ratings.toPandas().values:
        posts_rated_by_user.add(post_id[0])

    return posts_rated_by_user
current_user_already_rated_posts = extract_posts_already_rated(ratingsPysparkDf)

In [11]:
#This posts have already been rated by the user
user_recommendations = model.recommendForAllUsers(3)
user_recommendations.filter(user_recommendations.userId==294).show(truncate=False)

+------+-------------------------------------------------------+
|userId|recommendations                                        |
+------+-------------------------------------------------------+
|294   |[{344, 1.005191}, {1088, 1.0026215}, {2322, 1.0008016}]|
+------+-------------------------------------------------------+



In [22]:
from pyspark.sql.functions import col,lit

def generate_matrix_factorization_recs(ratingsPysparkDf,user_id = 294,number_recs=15,similar=True):
    
    # Generate recommendations for a specific user
    user_ratings = ratingsPysparkDf.filter(col("userId") == user_id).select("postId").distinct()
    #user_ratings.show()
    all_posts = ratingsPysparkDf.select("postId").distinct()
    posts_not_rated_by_user = all_posts.join(user_ratings, on="postId", how="left_anti")

    #posts_not_rated_by_user.show()

    # Recommend top number_recs posts
    posts_not_rated_by_user = posts_not_rated_by_user.withColumn("userId", lit(user_id))
    recommendations = model.transform(posts_not_rated_by_user)
    if similar:
        top_recommendations = recommendations.orderBy(col("prediction").desc()).select("postId", "prediction").limit(number_recs)
    else:
        top_recommendations = recommendations.orderBy(col("prediction")).select("postId", "prediction").limit(number_recs)

    #top_recommendations.show()


    #extract them in a list
    postIds = [p[0] for p in top_recommendations.select("postId").toPandas().values.tolist()]
    predictions = [p[0] for p in top_recommendations.select("prediction").toPandas().values.tolist()]


    matrix_factorization_predictions = []
    for index in range(len(postIds)):
        matrix_factorization_predictions.append({'post_id':postIds[index],'prediction':predictions[index]})
    print(matrix_factorization_predictions)
    return matrix_factorization_predictions

matrix_factorization_predictions = generate_matrix_factorization_recs(ratingsPysparkDf,user_id = 294,similar=True)


[{'post_id': 1688, 'prediction': 0.9327794313430786}, {'post_id': 3602, 'prediction': 0.8363574743270874}, {'post_id': 735, 'prediction': 0.8085615634918213}, {'post_id': 1713, 'prediction': 0.7815169095993042}, {'post_id': 2860, 'prediction': 0.7676966190338135}, {'post_id': 661, 'prediction': 0.7543203830718994}, {'post_id': 2110, 'prediction': 0.7425070405006409}, {'post_id': 2410, 'prediction': 0.6914209723472595}, {'post_id': 1214, 'prediction': 0.6531306505203247}, {'post_id': 2000, 'prediction': 0.6489737033843994}, {'post_id': 1468, 'prediction': 0.6381682753562927}, {'post_id': 1381, 'prediction': 0.6366795897483826}, {'post_id': 1623, 'prediction': 0.6358826160430908}, {'post_id': 3511, 'prediction': 0.6330591440200806}, {'post_id': 1459, 'prediction': 0.6318784952163696}]


In [23]:
#Note:
# after collecting user ratings, we will add them to the current_user_already_rated_posts set

## Make recommendations for the new user based on similar and opposite users

### Make recommendations based on similar users


### Create similarity matrix of users

In [24]:
def compute_user_similarity_matrix(model):

    
    import pandas as pd
    from sklearn.metrics.pairwise import cosine_similarity


    # Extract user factors
    user_factors = model.userFactors.orderBy('id')

    # Convert to Pandas DataFrame
    user_factors_pd = user_factors.toPandas()

    # Create a matrix of user factors
    user_ids = user_factors_pd['id'].values
    user_features = pd.DataFrame(user_factors_pd['features'].tolist(), index=user_ids)

    # Compute cosine similarity
    similarity_matrix = cosine_similarity(user_features)

    # Convert the similarity matrix to a DataFrame for easier interpretation
    user_similarity_df = pd.DataFrame(similarity_matrix, index=user_ids, columns=user_ids)

    print(user_similarity_df)
    return user_similarity_df

user_similarity_df = compute_user_similarity_matrix(model)

          2         3         4         5         6         7         8    \
2    1.000000  0.689224 -0.002478  0.364748  0.464495  0.091506  0.111115   
3    0.689224  1.000000  0.315972  0.536956  0.228551 -0.020890  0.062861   
4   -0.002478  0.315972  1.000000  0.290739  0.337977  0.019709  0.149715   
5    0.364748  0.536956  0.290739  1.000000  0.158604  0.260201  0.126891   
6    0.464495  0.228551  0.337977  0.158604  1.000000 -0.160172  0.307491   
..        ...       ...       ...       ...       ...       ...       ...   
290  0.238161  0.017078  0.179156  0.323665  0.369684  0.441151 -0.151274   
291  0.346127  0.239607  0.538189 -0.038823  0.135164  0.420478  0.334786   
292  0.221272  0.416876  0.171109  0.289792  0.238637  0.226735 -0.340302   
293 -0.322204 -0.475196  0.338437 -0.268219  0.272247  0.090514 -0.271748   
294  0.532317  0.381894 -0.032160 -0.175198 -0.037670  0.196692  0.381441   

          9         10        11   ...       285       286       287  \
2  

In [25]:
#example how to access the similarity between user with itself for  id 2 
user_similarity_df.iloc[2][4]


1.0

In [27]:
user_similarity_df.iloc[292][294]

1.0

### Extract similar and opposite users

In [28]:
def extract_similar_and_opposite_users(user_similarity_df,interogated_user_id=294):

    import numpy as np

    #Note1: if you want to check if user ids are good, 
    #replace [-4:-1] with [-4:] and see if last user has a similarity score of aprox 1

    #Note2: -2 and +2 comes from the fact that we use numpy methods for a pandas dataframe, 
    # dataframe row index starts from 0; column index from 2

    user_similarity_scores_df =user_similarity_df.iloc[interogated_user_id-2]
    similar_users = []

    print(f'Similar users of user with id: {interogated_user_id}')

    for id in np.argsort(user_similarity_scores_df)[-4:-1]:
        user_id = id+2 #colum indexes start from 2 because lowest user_id value is 2
        similarity_score = user_similarity_scores_df[user_id]

        similar_users.append((user_id,similarity_score))
        try:
            print(f'user id: {user_id},similarity score: {similarity_score}')
        except:
            print(f'index error is {user_id} ')
    
    #reverse the list so that most similar user recs come first
    similar_users.reverse()

    print('-'*100)

    opposite_users = []

    print(f'Opposite users of user with id: {interogated_user_id}')
    for id in np.argsort(user_similarity_scores_df)[:3]:
        user_id = id+2
        similarity_score = user_similarity_scores_df[user_id]
        try:
        
            opposite_users.append((user_id,similarity_score))
            print(f'user id: {user_id},similarity score: {similarity_score}')
        except:
            print(f'index error is {user_id} ')
    print('-'*200)
    print(similar_users)
    print(opposite_users)

    return (similar_users,opposite_users)

similar_users,opposite_users = extract_similar_and_opposite_users(user_similarity_df)

Similar users of user with id: 294
user id: 61,similarity score: 0.6957035138603461
user id: 60,similarity score: 0.7540770037835002
user id: 120,similarity score: 0.7710613628470725
----------------------------------------------------------------------------------------------------
Opposite users of user with id: 294
user id: 203,similarity score: -0.7777278511934795
user id: 37,similarity score: -0.6149076628818244
user id: 247,similarity score: -0.6051832605515592
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[(120, 0.7710613628470725), (60, 0.7540770037835002), (61, 0.6957035138603461)]
[(203, -0.7777278511934795), (37, -0.6149076628818244), (247, -0.6051832605515592)]


### Extract and filter recommendations based on similar and opposite users 

Similar users flow

In [29]:
def generate_sim_recs(ratings_file, baseline_users,current_user_already_rated_posts,num_recs = 30,similar=True):

    
    num_users = len(baseline_users)
    for index in range(num_users):
        sim_user_id,similarity_score = baseline_users[index]

        print(f'user_id = {sim_user_id}')
        print(f'similarity_score ={similarity_score}')

        sim_user_recs = generate_matrix_factorization_recs(ratings_file,user_id=sim_user_id,number_recs=num_recs,similar=similar)
        current_user_recs = []
        for sim_user_rec_struct in sim_user_recs:

            if sim_user_rec_struct['post_id'] in current_user_already_rated_posts:
                #print('found already rated post')
                continue
            else:
                current_user_rec = {'post_id':sim_user_rec_struct['post_id'],
                                    'prediction':sim_user_rec_struct['prediction'] * similarity_score,
                                    'baseline_user_id':sim_user_id,
                                    'similarity_score': similarity_score,
                                    'baseline_user_prediction':sim_user_rec_struct['prediction'] }

                current_user_recs.append(current_user_rec)
    return current_user_recs
similar_users_recs = generate_sim_recs(ratings_file=ratingsPysparkDf,baseline_users=similar_users,current_user_already_rated_posts=current_user_already_rated_posts)

user_id = 120
similarity_score =0.7710613628470725
[{'post_id': 1791, 'prediction': 1.022874116897583}, {'post_id': 3602, 'prediction': 0.9346370100975037}, {'post_id': 3145, 'prediction': 0.7468541860580444}, {'post_id': 1088, 'prediction': 0.7215915322303772}, {'post_id': 344, 'prediction': 0.6749767065048218}, {'post_id': 1909, 'prediction': 0.6694530248641968}, {'post_id': 3102, 'prediction': 0.6545191407203674}, {'post_id': 2322, 'prediction': 0.6522638201713562}, {'post_id': 1333, 'prediction': 0.6422619819641113}, {'post_id': 182, 'prediction': 0.6071485877037048}, {'post_id': 735, 'prediction': 0.6027434468269348}, {'post_id': 3555, 'prediction': 0.5821658968925476}, {'post_id': 1713, 'prediction': 0.5777302980422974}, {'post_id': 2119, 'prediction': 0.5575791597366333}, {'post_id': 3403, 'prediction': 0.5493278503417969}, {'post_id': 1398, 'prediction': 0.5472486019134521}, {'post_id': 2427, 'prediction': 0.5319478511810303}, {'post_id': 1555, 'prediction': 0.5315414667129517}

Opposite users flow

In [30]:
opposite_users_recs = generate_sim_recs(ratings_file=ratingsPysparkDf,baseline_users=opposite_users,current_user_already_rated_posts=current_user_already_rated_posts,similar=False)

user_id = 203
similarity_score =-0.7777278511934795
[{'post_id': 1088, 'prediction': -0.940750777721405}, {'post_id': 2427, 'prediction': -0.8550181984901428}, {'post_id': 3602, 'prediction': -0.8071874976158142}, {'post_id': 2322, 'prediction': -0.7492169141769409}, {'post_id': 3102, 'prediction': -0.6862250566482544}, {'post_id': 2493, 'prediction': -0.6812204718589783}, {'post_id': 1791, 'prediction': -0.6640809774398804}, {'post_id': 486, 'prediction': -0.6570506691932678}, {'post_id': 1320, 'prediction': -0.6324908137321472}, {'post_id': 1909, 'prediction': -0.6243507266044617}, {'post_id': 3487, 'prediction': -0.5904501676559448}, {'post_id': 661, 'prediction': -0.5581014156341553}, {'post_id': 2373, 'prediction': -0.5576084852218628}, {'post_id': 386, 'prediction': -0.5543782711029053}, {'post_id': 1581, 'prediction': -0.5524340867996216}, {'post_id': 1623, 'prediction': -0.5490610599517822}, {'post_id': 3442, 'prediction': -0.538173258304596}, {'post_id': 2620, 'prediction': -0

Testing on real users logic 

matrix factorization

In [31]:
#Showing the 3 types of recomendations the user will encounter
for rec in matrix_factorization_predictions:
    print(rec)
    break
for rec in similar_users_recs:
    print(rec)
    break
for rec in opposite_users_recs:
    print(rec)
    break

{'post_id': 1688, 'prediction': 0.9327794313430786}
{'post_id': 3373, 'prediction': 0.5286839080942827, 'baseline_user_id': 61, 'similarity_score': 0.6957035138603461, 'baseline_user_prediction': 0.7599270343780518}
{'post_id': 3618, 'prediction': 0.5328360608636782, 'baseline_user_id': 247, 'similarity_score': -0.6051832605515592, 'baseline_user_prediction': -0.8804540634155273}


In [32]:
def test_matrix_factorization(recs=matrix_factorization_predictions,tolerance_limit=5):
    import data_processing
    num_rated_posts = 0
    matrix_factorization_results = {}
    for data in recs:
        id = data['post_id']
        res =data_processing.get_resource_info(id)
        print(res['title'])
        print(res['url'])
        is_valid = input('Does the url provide enough info for you to understand the basic ideas expressed in this post?(Y/N):')
        if is_valid.lower() == 'y':
            rec_score = float(input('How much did this recommended post make you see things trough a new perspective?:'))
            matrix_factorization_results[id]={'user_score': rec_score, 'predicted_score':data['prediction'],'validity':True}
            num_rated_posts +=1
        else:
            matrix_factorization_results[id]={'validity':False, 'predicted_score':data['prediction']}

        current_user_already_rated_posts.add(id)
        if num_rated_posts == tolerance_limit:
            return matrix_factorization_results
    return matrix_factorization_results


matrix_factorization_results = test_matrix_factorization(matrix_factorization_predictions) 

Lawns are stupid, wasteful, and vain.
https://www.reddit.com//r/changemyview/comments/gjqxef/cmv_lawns_are_stupid_wasteful_and_vain/
Valid input
Making student loans bankruptcy dischargeable is a terrible idea and regress...
https://www.reddit.com//r/changemyview/comments/14vmd6z/cmv_making_student_loans_bankruptcy_dischargeable/
Valid input
All classified govt material should be unclassified after 100 years
https://www.reddit.com//r/changemyview/comments/9rk89i/cmv_all_classified_govt_material_should_be/
Valid input
If you believe a boyfriend/husband has no say in an abortion, than it is hyp...
https://www.reddit.com//r/changemyview/comments/gqf338/cmv_if_you_believe_a_boyfriendhusband_has_no_say/
Valid input
Definition of fascism is being used incorrectly. Both right and left can be ...
https://www.reddit.com//r/changemyview/comments/t2ru6g/cmv_definition_of_fascism_is_being_used/
Valid input


In [33]:
len(current_user_already_rated_posts)

15

similar users

In [34]:
import data_processing
def test_user_similarity_recommendations(baseline_users_recs,tolerance_limit =5):
    num_rated_posts = 0
    baseline_user_recs_results = {}
    for data in baseline_users_recs:
        res_id = data['post_id']
        if res_id in current_user_already_rated_posts:
            print('found already recommended post')
            continue
        
        res = data_processing.get_resource_info(res_id)
        baseline_user_id = data['baseline_user_id']
        print(res['title'])
        print(res['url'])
        is_valid = input('Does the url provide enough info for you to understand the basic ideas expressed in this post?(Y/N):')
        if is_valid.lower() == 'y':
            print("Valid input")
            rec_score = float(input('How much did this recommended post make you see things trough a new perspective?:'))
            baseline_user_recs_results[baseline_user_id]={'user_score': rec_score, 'rec_info':data,'validity':True}
            num_rated_posts +=1
        else:
            baseline_user_recs_results[baseline_user_id]={'validity':False, 'rec_info':data}

        current_user_already_rated_posts.add(res_id)
        if num_rated_posts==tolerance_limit:
            return baseline_user_recs_results
    return baseline_user_recs_results


similar_user_recs_results = test_user_similarity_recommendations(similar_users_recs)

The stock market is government sanctioned gambling that suppresses the poor
https://www.reddit.com//r/changemyview/comments/10gsh70/cmv_the_stock_market_is_government_sanctioned/
Valid input
Criticizing the Chinese government does not make you Sinophobic, Criticizing...
https://www.reddit.com//r/changemyview/comments/loakbf/cmv_criticizing_the_chinese_government_does_not/
Valid input
racist jokes are bad and people who tell them are racist
https://www.reddit.com//r/changemyview/comments/ct2hk5/cmv_racist_jokes_are_bad_and_people_who_tell_them/
Valid input
Jada's and Wills reaction was unacceptable.
https://www.reddit.com//r/changemyview/comments/tqpqvh/cmv_jadas_and_wills_reaction_was_unacceptable/
Valid input
All binary transgender people should press "The Button" assuming they would ...
https://www.reddit.com//r/changemyview/comments/dg8ozy/cmv_all_binary_transgender_people_should_press/
Valid input


In [35]:
len(current_user_already_rated_posts)

20

Opposite users

In [36]:
opposite_user_recs_results = test_user_similarity_recommendations(opposite_users_recs)

The Israeli military operation is counterproductive
https://www.reddit.com//r/changemyview/comments/180iel8/cmv_the_israeli_military_operation_is/
Valid input
Historically, the only successful long-term alternatives to Strongman Author...
https://www.reddit.com//r/changemyview/comments/k9r1ip/cmv_historically_the_only_successful_longterm/
Valid input
found already recommended post
Trading Card Games are a business model that hurts the game's experience mor...
https://www.reddit.com//r/changemyview/comments/bcg9wc/cmv_trading_card_games_are_a_business_model_that/
Valid input
Math should be assessed like English, allowing creativity through open ended answer
https://www.reddit.com//r/changemyview/comments/a1pifu/cmv_math_should_be_assessed_like_english_allowing/
Valid input
Abolishing the electoral college is a selfish ambition and further marginali...
https://www.reddit.com//r/changemyview/comments/b31329/cmv_abolishing_the_electoral_college_is_a_selfish/
Valid input


In [37]:
len(current_user_already_rated_posts)

25

saving results


In [38]:
import json

print(opposite_user_recs_results)
print(similar_user_recs_results)
print(matrix_factorization_results)
print(name)




{247: {'user_score': 1.0, 'rec_info': {'post_id': 969, 'prediction': 0.40599652016496535, 'baseline_user_id': 247, 'similarity_score': -0.6051832605515592, 'baseline_user_prediction': -0.6708654165267944}, 'validity': True}}
{61: {'user_score': 1.0, 'rec_info': {'post_id': 1280, 'prediction': 0.44513292832056395, 'baseline_user_id': 61, 'similarity_score': 0.6957035138603461, 'baseline_user_prediction': 0.6398313641548157}, 'validity': True}}
{1688: {'user_score': 1.0, 'predicted_score': 0.9327794313430786, 'validity': True}, 3602: {'user_score': 1.0, 'predicted_score': 0.8363574743270874, 'validity': True}, 735: {'user_score': 1.0, 'predicted_score': 0.8085615634918213, 'validity': True}, 1713: {'user_score': 1.0, 'predicted_score': 0.7815169095993042, 'validity': True}, 2860: {'user_score': 1.0, 'predicted_score': 0.7676966190338135, 'validity': True}}
1


In [39]:
import os 

RESULTS_FOLDER_LOCATION = "collaborative_filtering_results"
results_file_location = os.path.join(RESULTS_FOLDER_LOCATION,name+'_collaborative_filtering.json')

results_file = open(results_file_location,'w')

final_user_result={
    'name': name,
    'id':user_id,
    'matrix_factorization':matrix_factorization_results,
    'similar_users':similar_user_recs_results,
    'opposite_users':opposite_user_recs_results
}
json.dump(final_user_result,results_file)


loading results

In [40]:
results_file = open(results_file_location,'r')

print(json.load(results_file))

{'name': '1', 'id': 294, 'matrix_factorization': {'1688': {'user_score': 1.0, 'predicted_score': 0.9327794313430786, 'validity': True}, '3602': {'user_score': 1.0, 'predicted_score': 0.8363574743270874, 'validity': True}, '735': {'user_score': 1.0, 'predicted_score': 0.8085615634918213, 'validity': True}, '1713': {'user_score': 1.0, 'predicted_score': 0.7815169095993042, 'validity': True}, '2860': {'user_score': 1.0, 'predicted_score': 0.7676966190338135, 'validity': True}}, 'similar_users': {'61': {'user_score': 1.0, 'rec_info': {'post_id': 1280, 'prediction': 0.44513292832056395, 'baseline_user_id': 61, 'similarity_score': 0.6957035138603461, 'baseline_user_prediction': 0.6398313641548157}, 'validity': True}}, 'opposite_users': {'247': {'user_score': 1.0, 'rec_info': {'post_id': 969, 'prediction': 0.40599652016496535, 'baseline_user_id': 247, 'similarity_score': -0.6051832605515592, 'baseline_user_prediction': -0.6708654165267944}, 'validity': True}}}
