# Collaborative Filtering

## Load data

In [28]:
import data_processing


data_processing.load_resources_from_raw_delta_logs(min_num_interactions=6)
data_processing.load_users(min_num_interactions=10)
users,resources,ratings,num_interactions = data_processing.compute_ratings()

293 users
2995 rated resources from initially 3727 loaded resources
6052 interactions
feedback matrix is 0.6896591019161629% populated


## Fit model

In [3]:

def compute_matrix_factorization():
    
    '''
        Aproximate data_processing_session/ratings file with pyspark framework, using ALS algo
       
        returns (ratings pyspark dataframe,model object,rmse aproximation score)
    '''

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col 
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.sql import Row
    import os
    import sys

    
    # uncomment these in case you have problems with environment variables
    # os.environ['PYSPARK_PYTHON'] = sys.executable
    # os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

    #initialize spark
    import findspark
    findspark.init()

    spark = SparkSession.builder.appName("Collaborative Filtering Flow").getOrCreate()

    #preprocess ratings file into pyspark dataframe
    path = os.path.join('data processing session','extracted_ratings')
    lines = spark.read.text(path).rdd

    parts = lines.map(lambda row: row.value.split(" "))
    ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), postId=int(p[1]),
                                        rating=float(p[2]) ))


    ratings = spark.createDataFrame(ratingsRDD)

    (training, test) = ratings, ratings

    # # Build the recommendation model using ALS on the training data
    # # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="postId", ratingCol="rating",
            coldStartStrategy="drop")
    model = als.fit(training)

    # # Evaluate the model by computing the RMSE on the test (in this case also train) data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))
    return (ratings,model,rmse)

ratingsPysparkDf,model,rmse = compute_matrix_factorization()

Root-mean-square error = 0.018965150366215943


## Extract resources relevant to the latent factors

In [4]:
def extract_baseline_posts(model,cached=True):

    '''
        Extracts and returns baseline posts for solving cold start problem 
         
        based on top posts for every latent factor of the model
        
        or return hardcoded posts
        
    '''

    import data_processing

    #these hardcoded posts are extracted by taking the top 10 posts from every latent score of the model
    #and picking the ones that have a functional link/a novel topic
    titles =[
        "Infidelity should not happen when divorce is possible",
         "The default world lingua franca should be Spanish",
         "Buying clothes or goods from factories in the developing world is moral, eve...",
         "The legal owner of a firearm should be responsible for the weapon and anythi...",
         "Airport security screenings do very little to stop deliberate terrorism such as the attacks of 9/11. They are a show put on to make passengers think something is being done.",
         "Luxury watches are useless in the practical sense",
         "The concept of an omniscient (*) and capable creator is not compatible with ...",
         "If whatever makes your character different (sexual identity/disability etc) ...",
         "Cutlery should be placed at the end of a buffet line",
         "If an animal has a big enough population, hunting of it should be allowed"
]
    
    if cached:
        baseline_posts = []
        for title in titles:
            #print(title)
            id = data_processing.res_id[title]
            #print(id)
            baseline_posts.append(data_processing.get_resource_info(id))
        return baseline_posts
    else:    
        '''After advisor approval, these posts shall be hardcoded '''

        from pyspark.sql.functions import expr

        # Extract item factors
        item_factors = model.itemFactors

        # Find the top 5 highest scores for each latent factor
        num_factors = model.rank  # Number of latent factors
        top_scores_per_factor = []

        for i in range(num_factors):
            factor_col = expr(f"features[{i}]")
            top_rows = (item_factors
                        .select("id", factor_col.alias("factor"))
                        .orderBy("factor", ascending=False)
                        .limit(10)
                        .collect())
            top_scores_per_factor.append((i, [(row.id, row.factor) for row in top_rows]))

        #for the final version, the resources will be predefined, to ensure that the reddit posts links are not deprecated
        baseline_posts = []
        for factor, top_scores in top_scores_per_factor:
            for rank, (post_id, factor_score) in enumerate(top_scores, start=1):
                if rank ==1:
                    baseline_posts.append({'id':post_id,'score':factor_score,'content':data_processing.id_res[post_id]})
                print(f'title:{data_processing.id_res[post_id]["title"]},url:{data_processing.id_res[post_id]["url"]},factor {factor},score:{factor_score}')
            print("-"*100)
        return baseline_posts

baseline_posts = extract_baseline_posts(model,cached=True)

In [5]:
#example of a post object content
print(baseline_posts[0])

{'title': 'Infidelity should not happen when divorce is possible', 'pos_feedback': ['Nitrousoxide72', 'RedditExplorer89'], 'neg_feedback': ['ripcelinedionhusband', 'Melodic_Echidna', 'joopface', 'dublea', 'ripcelinedionhusband', 'WelfareBear', 'JimboMan1234', '', 'muyamable', 'Nephisimian', 'USoverthem'], 'url': 'https://www.reddit.com//r/changemyview/comments/imh6iw/cmv_infidelity_should_not_happen_when_divorce_is/'}


In [6]:
print(len(baseline_posts),baseline_posts)

10 [{'title': 'Infidelity should not happen when divorce is possible', 'pos_feedback': ['Nitrousoxide72', 'RedditExplorer89'], 'neg_feedback': ['ripcelinedionhusband', 'Melodic_Echidna', 'joopface', 'dublea', 'ripcelinedionhusband', 'WelfareBear', 'JimboMan1234', '', 'muyamable', 'Nephisimian', 'USoverthem'], 'url': 'https://www.reddit.com//r/changemyview/comments/imh6iw/cmv_infidelity_should_not_happen_when_divorce_is/'}, {'title': 'The default world lingua franca should be Spanish', 'pos_feedback': ['NicholasLeo', 'BrotherItsInTheDrum'], 'neg_feedback': ['Igor_Furman', 'muyamable', '', 'MontiBurns', '', 'parentheticalobject'], 'url': 'https://www.reddit.com//r/changemyview/comments/hf49v5/cmv_the_default_world_lingua_franca_should_be/'}, {'title': 'Buying clothes or goods from factories in the developing world is moral, eve...', 'pos_feedback': ['mr-logician', 'thedobya'], 'neg_feedback': ['AnythingApplied', 'AnythingApplied', 'MercurianAspirations', 'StellaAthena'], 'url': 'https://

## Collect data for cold start problem

In [7]:
#helper functions for user input#
def is_valid(input,inf,sup,categorical=False):
    try:
        if categorical:
            rating  = int(input)
        else:
            rating  = float(input)
        if rating < inf or rating > sup:
            return False
        else:
            return True
    except:
        return False

def collect_feedback(input_text,error_text,inf_bound,sup_bound,categorical=False):
    feedback = input(input_text)
    while is_valid(feedback,inf_bound,sup_bound,categorical) == False:
        print(error_text)
        feedback = input(input_text)
    return feedback

In [8]:
def init_cold_start(baseline_posts,cached=True):

    '''
    Feed user baseline posts and collect feedback in order to compute recommendations
    '''

    import data_processing

    name = input("Please write your name:")
    description = "Dummy description of the task"

    baseline_results =[]

    if cached:
        for res in baseline_posts:
            print(f'Title:{res["title"]}')
            print(f'url: {res["url"]}')
            
            feedback = collect_feedback('How much did this post make you see things trough a new perspective?:',
                         "Input should be a continuous value between -1 and 1",
                         inf_bound=-1,
                         sup_bound=1)
            baseline_score = float(feedback)
            baseline_results.append({'id':data_processing.res_id[res['title']],'user_score':baseline_score})
        return (baseline_results,name)
    else:
        for baseline_post in baseline_posts:
            res = baseline_post["content"]
            print(f'Title:{res["title"]}')
            print(f'url: {res["url"]}')
            feedback = collect_feedback('How much did this post make you see things trough a new perspective?:',
                         "Input should be a continuous value between -1 and 1",
                         inf_bound=-1,
                         sup_bound=1)
            baseline_score = float(feedback)
            baseline_results.append({'id':baseline_post['id'],'factor_score':baseline_post['score'],'user_score':baseline_score})
        return (baseline_results,name)


baseline_results,name = init_cold_start(baseline_posts)

Title:Infidelity should not happen when divorce is possible
url: https://www.reddit.com//r/changemyview/comments/imh6iw/cmv_infidelity_should_not_happen_when_divorce_is/
Title:The default world lingua franca should be Spanish
url: https://www.reddit.com//r/changemyview/comments/hf49v5/cmv_the_default_world_lingua_franca_should_be/
Title:Buying clothes or goods from factories in the developing world is moral, eve...
url: https://www.reddit.com//r/changemyview/comments/dwebfc/cmv_buying_clothes_or_goods_from_factories_in_the/
Title:The legal owner of a firearm should be responsible for the weapon and anythi...
url: https://www.reddit.com//r/changemyview/comments/i4f917/cmv_the_legal_owner_of_a_firearm_should_be/
Title:Airport security screenings do very little to stop deliberate terrorism such as the attacks of 9/11. They are a show put on to make passengers think something is being done.
url: https://www.reddit.com/r/changemyview/comments/8a9r3h/cmv_airport_security_screenings_do_very_l

## Fit the model with the new user's data

In [9]:
import os

def add_new_user_ratings(baseline_results):

    '''
        Append user feedback to ratings file
        
    '''
    ratings_path_file = os.path.join('data processing session','extracted_ratings')

    ratings_file = open(ratings_path_file,'r')


    for line in ratings_file:
        last_line = line
    last_line = last_line.split(" ")
    new_user_id = int(last_line[0]) + 1
    print(new_user_id)
    ratings_file.close()
    ratings_file = open(ratings_path_file,'a')

    for result in baseline_results:
        #ratings_file.write()
        res_id = result['id']
        score = result['user_score']
        ratings_file.write(f'{new_user_id} {res_id} {score}\n')
        #print(f'{new_user_id} {res_id} {score}\n')
    ratings_file.close()
    return new_user_id
user_id = add_new_user_ratings(baseline_results)

294


In [10]:
ratingsPysparkDf,model,rmse = compute_matrix_factorization()

Root-mean-square error = 0.018959604211848585


## Make recommendations for the new user based on matrix factorization

### Extract ids of already rated posts by user

In [11]:
def extract_posts_already_rated(ratings,user_id = 294):
    from pyspark.sql.functions import col 
    user_ratings = ratings.filter(col("userId") == 294).select("postId").distinct()
    posts_rated_by_user = set()

    for post_id in user_ratings.toPandas().values:
        posts_rated_by_user.add(post_id[0])

    return posts_rated_by_user
current_user_already_rated_posts = extract_posts_already_rated(ratingsPysparkDf)

In [12]:
#This posts have already been rated by the user
user_recommendations = model.recommendForAllUsers(3)
user_recommendations.filter(user_recommendations.userId==294).show(truncate=False)

+------+----------------------------------------------------------+
|userId|recommendations                                           |
+------+----------------------------------------------------------+
|294   |[{1252, 1.0259914}, {3102, 0.9951712}, {2427, 0.99407864}]|
+------+----------------------------------------------------------+



In [13]:
from pyspark.sql.functions import col,lit

def generate_matrix_factorization_recs(ratingsPysparkDf,user_id = 294,number_recs=15,similar=True):
    
    # Generate recommendations for a specific user
    user_ratings = ratingsPysparkDf.filter(col("userId") == user_id).select("postId").distinct()
    #user_ratings.show()
    all_posts = ratingsPysparkDf.select("postId").distinct()
    posts_not_rated_by_user = all_posts.join(user_ratings, on="postId", how="left_anti")

    #posts_not_rated_by_user.show()

    # Recommend top number_recs posts
    posts_not_rated_by_user = posts_not_rated_by_user.withColumn("userId", lit(user_id))
    recommendations = model.transform(posts_not_rated_by_user)
    if similar:
        top_recommendations = recommendations.orderBy(col("prediction").desc()).select("postId", "prediction").limit(number_recs)
    else:
        top_recommendations = recommendations.orderBy(col("prediction")).select("postId", "prediction").limit(number_recs)

    #top_recommendations.show()


    #extract them in a list
    postIds = [p[0] for p in top_recommendations.select("postId").toPandas().values.tolist()]
    predictions = [p[0] for p in top_recommendations.select("prediction").toPandas().values.tolist()]


    matrix_factorization_predictions = []
    for index in range(len(postIds)):
        matrix_factorization_predictions.append({'post_id':postIds[index],'prediction':predictions[index]})
    print(matrix_factorization_predictions)
    return matrix_factorization_predictions

matrix_factorization_predictions = generate_matrix_factorization_recs(ratingsPysparkDf,user_id = 294,similar=True)


[{'post_id': 1252, 'prediction': 1.025991439819336}, {'post_id': 1605, 'prediction': 0.9424614906311035}, {'post_id': 3462, 'prediction': 0.8485513925552368}, {'post_id': 2220, 'prediction': 0.8133757710456848}, {'post_id': 114, 'prediction': 0.8082108497619629}, {'post_id': 3619, 'prediction': 0.8057456612586975}, {'post_id': 2043, 'prediction': 0.8053984045982361}, {'post_id': 2601, 'prediction': 0.7940481901168823}, {'post_id': 2737, 'prediction': 0.7828012704849243}, {'post_id': 620, 'prediction': 0.7738643288612366}, {'post_id': 1599, 'prediction': 0.773391604423523}, {'post_id': 1443, 'prediction': 0.7569259405136108}, {'post_id': 277, 'prediction': 0.7462075352668762}, {'post_id': 2998, 'prediction': 0.7305809855461121}, {'post_id': 2335, 'prediction': 0.7199878692626953}]


In [23]:
#Note:
# after collecting user ratings, we will add them to the current_user_already_rated_posts set

## Make recommendations for the new user based on similar and opposite users

### Make recommendations based on similar users


### Create similarity matrix of users

In [14]:
def compute_user_similarity_matrix(model):

    
    import pandas as pd
    from sklearn.metrics.pairwise import cosine_similarity


    # Extract user factors
    user_factors = model.userFactors.orderBy('id')

    # Convert to Pandas DataFrame
    user_factors_pd = user_factors.toPandas()

    # Create a matrix of user factors
    user_ids = user_factors_pd['id'].values
    user_features = pd.DataFrame(user_factors_pd['features'].tolist(), index=user_ids)

    # Compute cosine similarity
    similarity_matrix = cosine_similarity(user_features)

    # Convert the similarity matrix to a DataFrame for easier interpretation
    user_similarity_df = pd.DataFrame(similarity_matrix, index=user_ids, columns=user_ids)

    print(user_similarity_df)
    return user_similarity_df

user_similarity_df = compute_user_similarity_matrix(model)

          2         3         4         5         6         7         8    \
2    1.000000  0.501149  0.687818  0.046454  0.045648 -0.351726 -0.547974   
3    0.501149  1.000000  0.532797  0.078803 -0.220489  0.004581 -0.401920   
4    0.687818  0.532797  1.000000  0.159743 -0.320948 -0.480992 -0.603157   
5    0.046454  0.078803  0.159743  1.000000 -0.515379 -0.535386 -0.500516   
6    0.045648 -0.220489 -0.320948 -0.515379  1.000000  0.281414  0.359716   
..        ...       ...       ...       ...       ...       ...       ...   
290  0.125160  0.450687 -0.235402 -0.450344  0.223754  0.187425  0.267485   
291  0.157260  0.383757  0.335712  0.225980 -0.426012 -0.088554 -0.492939   
292 -0.359224  0.036293 -0.255834  0.161450 -0.641699 -0.173836  0.076878   
293 -0.158616  0.011091  0.178531 -0.221737 -0.335482 -0.374536  0.151570   
294  0.042420  0.376775 -0.021965 -0.178982 -0.062401  0.237455 -0.297423   

          9         10        11   ...       285       286       287  \
2  

In [16]:
#example how to access the similarity between user with itself for  id 2 
user_similarity_df.iloc[2][4]


1.0000000000000002

In [17]:
user_similarity_df.iloc[292][294]

1.0

### Extract similar and opposite users

In [18]:
def extract_similar_and_opposite_users(user_similarity_df,interogated_user_id=294):

    import numpy as np

    #Note1: if you want to check if user ids are good, 
    #replace [-4:-1] with [-4:] and see if last user has a similarity score of aprox 1

    #Note2: -2 and +2 comes from the fact that we use numpy methods for a pandas dataframe, 
    # dataframe row index starts from 0; column index from 2

    user_similarity_scores_df =user_similarity_df.iloc[interogated_user_id-2]
    similar_users = []

    print(f'Similar users of user with id: {interogated_user_id}')

    for id in np.argsort(user_similarity_scores_df)[-4:-1]:
        user_id = id+2 #colum indexes start from 2 because lowest user_id value is 2
        similarity_score = user_similarity_scores_df[user_id]

        similar_users.append((user_id,similarity_score))
        try:
            print(f'user id: {user_id},similarity score: {similarity_score}')
        except:
            print(f'index error is {user_id} ')
    
    #reverse the list so that most similar user recs come first
    similar_users.reverse()

    print('-'*100)

    opposite_users = []

    print(f'Opposite users of user with id: {interogated_user_id}')
    for id in np.argsort(user_similarity_scores_df)[:3]:
        user_id = id+2
        similarity_score = user_similarity_scores_df[user_id]
        try:
        
            opposite_users.append((user_id,similarity_score))
            print(f'user id: {user_id},similarity score: {similarity_score}')
        except:
            print(f'index error is {user_id} ')
    print('-'*200)
    print(similar_users)
    print(opposite_users)

    return (similar_users,opposite_users)

similar_users,opposite_users = extract_similar_and_opposite_users(user_similarity_df)

Similar users of user with id: 294
user id: 114,similarity score: 0.7231622139903068
user id: 50,similarity score: 0.7407005325844904
user id: 36,similarity score: 0.7454345503480371
----------------------------------------------------------------------------------------------------
Opposite users of user with id: 294
user id: 98,similarity score: -0.8322027528008487
user id: 93,similarity score: -0.7611366903602812
user id: 210,similarity score: -0.683925147191811
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[(36, 0.7454345503480371), (50, 0.7407005325844904), (114, 0.7231622139903068)]
[(98, -0.8322027528008487), (93, -0.7611366903602812), (210, -0.683925147191811)]


### Extract and filter recommendations based on similar and opposite users 

Similar users flow

In [19]:
def generate_sim_recs(ratings_file, baseline_users,current_user_already_rated_posts,num_recs = 30,similar=True):

    
    num_users = len(baseline_users)
    for index in range(num_users):
        sim_user_id,similarity_score = baseline_users[index]

        print(f'user_id = {sim_user_id}')
        print(f'similarity_score ={similarity_score}')

        sim_user_recs = generate_matrix_factorization_recs(ratings_file,user_id=sim_user_id,number_recs=num_recs,similar=similar)
        current_user_recs = []
        for sim_user_rec_struct in sim_user_recs:

            if sim_user_rec_struct['post_id'] in current_user_already_rated_posts:
                #print('found already rated post')
                continue
            else:
                current_user_rec = {'post_id':sim_user_rec_struct['post_id'],
                                    'prediction':sim_user_rec_struct['prediction'] * similarity_score,
                                    'baseline_user_id':sim_user_id,
                                    'similarity_score': similarity_score,
                                    'baseline_user_prediction':sim_user_rec_struct['prediction'] }

                current_user_recs.append(current_user_rec)
    return current_user_recs
similar_users_recs = generate_sim_recs(ratings_file=ratingsPysparkDf,baseline_users=similar_users,current_user_already_rated_posts=current_user_already_rated_posts)

user_id = 36
similarity_score =0.7454345503480371
[{'post_id': 3102, 'prediction': 0.9692136645317078}, {'post_id': 1909, 'prediction': 0.9673794507980347}, {'post_id': 1088, 'prediction': 0.9397230744361877}, {'post_id': 2574, 'prediction': 0.9280208945274353}, {'post_id': 3271, 'prediction': 0.9001338481903076}, {'post_id': 3462, 'prediction': 0.8703027367591858}, {'post_id': 2122, 'prediction': 0.8347253203392029}, {'post_id': 2493, 'prediction': 0.8104441165924072}, {'post_id': 1650, 'prediction': 0.7950367331504822}, {'post_id': 1599, 'prediction': 0.7686772346496582}, {'post_id': 2875, 'prediction': 0.7583449482917786}, {'post_id': 2895, 'prediction': 0.7471024990081787}, {'post_id': 3618, 'prediction': 0.7337385416030884}, {'post_id': 344, 'prediction': 0.7314099669456482}, {'post_id': 1688, 'prediction': 0.7276389598846436}, {'post_id': 2427, 'prediction': 0.702218770980835}, {'post_id': 2601, 'prediction': 0.6742368936538696}, {'post_id': 1252, 'prediction': 0.6559600830078125

Opposite users flow

In [20]:
opposite_users_recs = generate_sim_recs(ratings_file=ratingsPysparkDf,baseline_users=opposite_users,current_user_already_rated_posts=current_user_already_rated_posts,similar=False)

user_id = 98
similarity_score =-0.8322027528008487
[{'post_id': 1971, 'prediction': -1.0209319591522217}, {'post_id': 1443, 'prediction': -0.9009338021278381}, {'post_id': 2322, 'prediction': -0.8177556395530701}, {'post_id': 3102, 'prediction': -0.7578437328338623}, {'post_id': 3510, 'prediction': -0.7507975697517395}, {'post_id': 1599, 'prediction': -0.7448555827140808}, {'post_id': 344, 'prediction': -0.7424383759498596}, {'post_id': 2493, 'prediction': -0.7211824059486389}, {'post_id': 1909, 'prediction': -0.7153779864311218}, {'post_id': 3148, 'prediction': -0.696463942527771}, {'post_id': 2236, 'prediction': -0.6873149275779724}, {'post_id': 2335, 'prediction': -0.6669315695762634}, {'post_id': 620, 'prediction': -0.6666615009307861}, {'post_id': 2694, 'prediction': -0.6618669629096985}, {'post_id': 2601, 'prediction': -0.6427597999572754}, {'post_id': 1605, 'prediction': -0.6400316953659058}, {'post_id': 3462, 'prediction': -0.6363513469696045}, {'post_id': 3271, 'prediction': -

Testing on real users logic 

matrix factorization

In [21]:
#Showing the 3 types of recomendations the user will encounter
for rec in matrix_factorization_predictions:
    print(rec)
    break
for rec in similar_users_recs:
    print(rec)
    break
for rec in opposite_users_recs:
    print(rec)
    break

{'post_id': 1252, 'prediction': 1.025991439819336}
{'post_id': 114, 'prediction': 0.5659663504961598, 'baseline_user_id': 114, 'similarity_score': 0.7231622139903068, 'baseline_user_prediction': 0.7826271057128906}
{'post_id': 2083, 'prediction': 0.5749493946604028, 'baseline_user_id': 210, 'similarity_score': -0.683925147191811, 'baseline_user_prediction': -0.8406612873077393}


In [22]:
def test_matrix_factorization(recs=matrix_factorization_predictions,tolerance_limit=5):
    import data_processing
    num_rated_posts = 0
    matrix_factorization_results = {}
    for data in recs:
        id = data['post_id']
        res =data_processing.get_resource_info(id)
        print(res['title'])
        print(res['url'])
        is_valid = ""
        while is_valid.lower() !='y' and is_valid.lower() !='n':
            is_valid = input('Does the url provide enough info for you to understand the basic ideas expressed in this post?(Y/N):')

        if is_valid.lower() == 'y':
      
            feedback = collect_feedback('How much did this post make you see things trough a new perspective?:',
                "Input should be a continuous value between -1 and 1",
                inf_bound=-1,
                sup_bound=1)
            rec_score = float(feedback)
            matrix_factorization_results[id]={'user_score': rec_score, 'predicted_score':data['prediction'],'validity':True}
            num_rated_posts +=1
        else:
            print('Invalid post')
            matrix_factorization_results[id]={'validity':False, 'predicted_score':data['prediction']}

        current_user_already_rated_posts.add(id)
        if num_rated_posts == tolerance_limit:
            return matrix_factorization_results
    return matrix_factorization_results


matrix_factorization_results = test_matrix_factorization(matrix_factorization_predictions) 

The future of transportation is based on self-driving, electrical and 5G-pow...
https://www.reddit.com//r/changemyview/comments/d58liu/cmv_the_future_of_transportation_is_based_on/
FDR was a terrible president, and it's hypocritical for democrats to use him...
https://www.reddit.com//r/changemyview/comments/g05bve/cmv_fdr_was_a_terrible_president_and_its/
Invalid post
People who refuse to get a job should not be able to continue drawing from s...
https://www.reddit.com//r/changemyview/comments/11run29/cmv_people_who_refuse_to_get_a_job_should_not_be/
Invalid post
I don't feel bad for people in Texas. I feel like this disaster was the pred...
https://www.reddit.com//r/changemyview/comments/lw7x2a/cmv_i_dont_feel_bad_for_people_in_texas_i_feel/
Invalid post
America should be ruled by scholars with specified knowledge, and voting rights should be limited to relevantly educated citizens
https://www.reddit.com/r/changemyview/comments/7nnndc/cmv_america_should_be_ruled_by_scholars_with/
Ther

In [23]:
len(current_user_already_rated_posts)

20

similar users

In [24]:
import data_processing
def test_user_similarity_recommendations(baseline_users_recs,tolerance_limit =5):
    num_rated_posts = 0
    baseline_user_recs_results = {}
    for data in baseline_users_recs:
        res_id = data['post_id']
        if res_id in current_user_already_rated_posts:
            print('found already recommended post')
            continue
        
        res = data_processing.get_resource_info(res_id)
        baseline_user_id = data['baseline_user_id']
        print(res['title'])
        print(res['url'])
        is_valid = ""
        while is_valid.lower() !='y' and is_valid.lower() !='n':
            is_valid = input('Does the url provide enough info for you to understand the basic ideas expressed in this post?(Y/N):')
        if is_valid.lower() == 'y':
            print("Valid input")
            feedback = collect_feedback('How much did this post make you see things trough a new perspective?:',
                "Input should be a continuous value between -1 and 1",
                inf_bound=-1,
                sup_bound=1)
            rec_score = float(feedback)
            baseline_user_recs_results[baseline_user_id]={'user_score': rec_score, 'rec_info':data,'validity':True}
            num_rated_posts +=1
        else:
            print("Invalid post")
            baseline_user_recs_results[baseline_user_id]={'validity':False, 'rec_info':data}

        current_user_already_rated_posts.add(res_id)
        if num_rated_posts==tolerance_limit:
            return baseline_user_recs_results
    return baseline_user_recs_results


similar_user_recs_results = test_user_similarity_recommendations(similar_users_recs)

found already recommended post
Every single person caught driving drunk should be immediately charged with ...
https://www.reddit.com//r/changemyview/comments/n6bxfa/cmv_every_single_person_caught_driving_drunk/
Valid input
Piracy is wrong except possibly for college textbooks.
https://www.reddit.com//r/changemyview/comments/clwldz/cmv_piracy_is_wrong_except_possibly_for_college/
Invalid post
field-specific research or practical work should be part of all four-year de...
https://www.reddit.com//r/changemyview/comments/nymijb/cmv_fieldspecific_research_or_practical_work/
Valid input
found already recommended post
found already recommended post
all fines (or other monetary punishments) should be determined by your incom...
https://www.reddit.com//r/changemyview/comments/minlt1/cmv_all_fines_or_other_monetary_punishments/
Valid input
found already recommended post
found already recommended post
Giving your child a hard-to-spell first name is vain and selfish because it places a lifelong b

In [25]:
len(current_user_already_rated_posts)

26

Opposite users

In [26]:
opposite_user_recs_results = test_user_similarity_recommendations(opposite_users_recs)

We shouldn't worry if Trump will refuse the results of the election, we shou...
https://www.reddit.com//r/changemyview/comments/jnh757/cmv_we_shouldnt_worry_if_trump_will_refuse_the/
Valid input
found already recommended post
found already recommended post
found already recommended post
Religions never solved the problem of evil and Epicurus argument holds fast.
https://www.reddit.com//r/changemyview/comments/920s2w/cmv_religions_never_solved_the_problem_of_evil/
Invalid post
When I have a kid, I will not tell them that Santa (or any other mythical figure) is real
https://www.reddit.com//r/changemyview/comments/9s71jh/cmv_when_i_have_a_kid_i_will_not_tell_them_that/
Valid input
Input should be a continuous value between -1 and 1
The space industry should be nationalized
https://www.reddit.com//r/changemyview/comments/acm689/cmv_the_space_industry_should_be_nationalized/
Valid input
The changelings/shapeshifters on Star Trek make no sense from a biological s...
https://www.reddit.com//r

In [27]:
len(current_user_already_rated_posts)

32

saving results


In [38]:
import json

print(opposite_user_recs_results)
print(similar_user_recs_results)
print(matrix_factorization_results)
print(name)




{247: {'user_score': 1.0, 'rec_info': {'post_id': 969, 'prediction': 0.40599652016496535, 'baseline_user_id': 247, 'similarity_score': -0.6051832605515592, 'baseline_user_prediction': -0.6708654165267944}, 'validity': True}}
{61: {'user_score': 1.0, 'rec_info': {'post_id': 1280, 'prediction': 0.44513292832056395, 'baseline_user_id': 61, 'similarity_score': 0.6957035138603461, 'baseline_user_prediction': 0.6398313641548157}, 'validity': True}}
{1688: {'user_score': 1.0, 'predicted_score': 0.9327794313430786, 'validity': True}, 3602: {'user_score': 1.0, 'predicted_score': 0.8363574743270874, 'validity': True}, 735: {'user_score': 1.0, 'predicted_score': 0.8085615634918213, 'validity': True}, 1713: {'user_score': 1.0, 'predicted_score': 0.7815169095993042, 'validity': True}, 2860: {'user_score': 1.0, 'predicted_score': 0.7676966190338135, 'validity': True}}
1


In [39]:
import os 

RESULTS_FOLDER_LOCATION = "collaborative_filtering_results"
results_file_location = os.path.join(RESULTS_FOLDER_LOCATION,name+'_collaborative_filtering.json')

results_file = open(results_file_location,'w')

final_user_result={
    'name': name,
    'id':user_id,
    'matrix_factorization':matrix_factorization_results,
    'similar_users':similar_user_recs_results,
    'opposite_users':opposite_user_recs_results
}
json.dump(final_user_result,results_file)


loading results

In [40]:
results_file = open(results_file_location,'r')

print(json.load(results_file))

{'name': '1', 'id': 294, 'matrix_factorization': {'1688': {'user_score': 1.0, 'predicted_score': 0.9327794313430786, 'validity': True}, '3602': {'user_score': 1.0, 'predicted_score': 0.8363574743270874, 'validity': True}, '735': {'user_score': 1.0, 'predicted_score': 0.8085615634918213, 'validity': True}, '1713': {'user_score': 1.0, 'predicted_score': 0.7815169095993042, 'validity': True}, '2860': {'user_score': 1.0, 'predicted_score': 0.7676966190338135, 'validity': True}}, 'similar_users': {'61': {'user_score': 1.0, 'rec_info': {'post_id': 1280, 'prediction': 0.44513292832056395, 'baseline_user_id': 61, 'similarity_score': 0.6957035138603461, 'baseline_user_prediction': 0.6398313641548157}, 'validity': True}}, 'opposite_users': {'247': {'user_score': 1.0, 'rec_info': {'post_id': 969, 'prediction': 0.40599652016496535, 'baseline_user_id': 247, 'similarity_score': -0.6051832605515592, 'baseline_user_prediction': -0.6708654165267944}, 'validity': True}}}
