# Collaborative Filtering

## Load data

In [4]:
import data_processing

#load data
data_processing.load_resources_from_raw_delta_logs(min_num_interactions=6)
data_processing.load_users(min_num_interactions=10)
data_processing.compute_ratings()



KeyboardInterrupt: 

## Fit model

In [2]:

def compute_matrix_factorization():
    import os
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col 
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.sql import Row
    import os
    import sys

    # os.environ['PYSPARK_PYTHON'] = sys.executable
    # os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

    import findspark

    findspark.init()

    path = os.path.join('data processing session','extracted_ratings')
    spark = SparkSession.builder.appName("Collaborative Filtering Flow").getOrCreate()


    lines = spark.read.text(path).rdd

    parts = lines.map(lambda row: row.value.split(" "))
    ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), postId=int(p[1]),
                                        rating=float(p[2]) ))


    ratings = spark.createDataFrame(ratingsRDD)
    (training, test) = ratings, ratings
    # # Build the recommendation model using ALS on the training data
    # # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="postId", ratingCol="rating",
            coldStartStrategy="drop")
    model = als.fit(training)

    # # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))
    return (ratings,model,rmse)

ratingsPysparkDf,model,rmse = compute_matrix_factorization()

Root-mean-square error = 0.018822509641530854


## Extract resources relevant to the latent factors

In [5]:

def extract_baseline_posts(model,cached=True):
    import data_processing
    titles =[
        "Infidelity should not happen when divorce is possible",
         "The default world lingua franca should be Spanish",
         "Buying clothes or goods from factories in the developing world is moral, eve...",
         "The legal owner of a firearm should be responsible for the weapon and anythi...",
         "Airport security screenings do very little to stop deliberate terrorism such as the attacks of 9/11. They are a show put on to make passengers think something is being done.",
         "Luxury watches are useless in the practical sense",
         "The concept of an omniscient (*) and capable creator is not compatible with ...",
         "If whatever makes your character different (sexual identity/disability etc) ...",
         "Cutlery should be placed at the end of a buffet line",
         "If an animal has a big enough population, hunting of it should be allowed"
]
    
    if cached:
        baseline_posts = []
        for title in titles:
            #print(title)
            id = data_processing.res_id[title]
            #print(id)
            baseline_posts.append(data_processing.get_resource_info(id))
        return baseline_posts
    else:    
        '''After advisor approval, these posts shall be hardcoded '''

        from pyspark.sql.functions import expr

        # Extract item factors
        item_factors = model.itemFactors

        # Find the top 5 highest scores for each latent factor
        num_factors = model.rank  # Number of latent factors
        top_scores_per_factor = []

        for i in range(num_factors):
            factor_col = expr(f"features[{i}]")
            top_rows = (item_factors
                        .select("id", factor_col.alias("factor"))
                        .orderBy("factor", ascending=False)
                        .limit(10)
                        .collect())
            top_scores_per_factor.append((i, [(row.id, row.factor) for row in top_rows]))

        #for the final version, the resources will be predefined, to ensure that the reddit posts are not deprecated
        baseline_posts = []
        for factor, top_scores in top_scores_per_factor:
            for rank, (post_id, factor_score) in enumerate(top_scores, start=1):
                if rank ==1:
                    baseline_posts.append({'id':post_id,'score':factor_score,'content':data_processing.id_res[post_id]})
                print(f'title:{data_processing.id_res[post_id]["title"]},url:{data_processing.id_res[post_id]["url"]},factor {factor},score:{factor_score}')
            print("-"*100)
        return baseline_posts

baseline_posts = extract_baseline_posts(model,cached=True)

In [6]:
titles =["Infidelity should not happen when divorce is possible"]
data_processing.get_resource_id("The default world lingua franca should be Spanish")
data_processing.get_resource_id("Buying clothes or goods from factories in the developing world is moral, eve...")
data_processing.get_resource_id("The legal owner of a firearm should be responsible for the weapon and anythi...")
data_processing.get_resource_id("Airport security screenings do very little to stop deliberate terrorism such as the attacks of 9/11. They are a show put on to make passengers think something is being done.")


344

In [7]:
print(baseline_posts[0])

{'title': 'Infidelity should not happen when divorce is possible', 'pos_feedback': ['Nitrousoxide72', 'RedditExplorer89'], 'neg_feedback': ['ripcelinedionhusband', 'Melodic_Echidna', 'joopface', 'dublea', 'ripcelinedionhusband', 'WelfareBear', 'JimboMan1234', '', 'muyamable', 'Nephisimian', 'USoverthem'], 'url': 'https://www.reddit.com//r/changemyview/comments/imh6iw/cmv_infidelity_should_not_happen_when_divorce_is/'}


In [8]:
print(len(baseline_posts),baseline_posts)

10 [{'title': 'Infidelity should not happen when divorce is possible', 'pos_feedback': ['Nitrousoxide72', 'RedditExplorer89'], 'neg_feedback': ['ripcelinedionhusband', 'Melodic_Echidna', 'joopface', 'dublea', 'ripcelinedionhusband', 'WelfareBear', 'JimboMan1234', '', 'muyamable', 'Nephisimian', 'USoverthem'], 'url': 'https://www.reddit.com//r/changemyview/comments/imh6iw/cmv_infidelity_should_not_happen_when_divorce_is/'}, {'title': 'The default world lingua franca should be Spanish', 'pos_feedback': ['NicholasLeo', 'BrotherItsInTheDrum'], 'neg_feedback': ['Igor_Furman', 'muyamable', '', 'MontiBurns', '', 'parentheticalobject'], 'url': 'https://www.reddit.com//r/changemyview/comments/hf49v5/cmv_the_default_world_lingua_franca_should_be/'}, {'title': 'Buying clothes or goods from factories in the developing world is moral, eve...', 'pos_feedback': ['mr-logician', 'thedobya'], 'neg_feedback': ['AnythingApplied', 'AnythingApplied', 'MercurianAspirations', 'StellaAthena'], 'url': 'https://

## Collect data for cold start problem

In [3]:
def init_cold_start(baseline_posts,cached=True):

    import data_processing

    name = input("Please write your name:")
    description = "Dummy description of the task"

    baseline_results =[]

    if cached:
        for res in baseline_posts:
            print(f'Title:{res["title"]}')
            print(f'url: {res["url"]}')
            baseline_score = float(input('How much did this post make you see things trough a new perspective?:'))
            baseline_results.append({'id':data_processing.res_id[res['title']],'user_score':baseline_score})
        return (baseline_results,name)
    else:
        for baseline_post in baseline_posts:
            res = baseline_post["content"]
            print(f'Title:{res["title"]}')
            print(f'url: {res["url"]}')
            baseline_score = float(input('How much did this post make you see things trough a new perspective?:'))
            baseline_results.append({'id':baseline_post['id'],'factor_score':baseline_post['score'],'user_score':baseline_score})
        return (baseline_results,name)


baseline_results,name = init_cold_start(baseline_posts)

NameError: name 'baseline_posts' is not defined

## Fit the model with the new user's data

In [10]:
import os

os.path.join("data processing session","extracted_ratings")
def add_new_user_ratings(baseline_results):
    ratings_path_file = r'C:\Users\A&A\Downloads\Date Personale Laptop Nagarro\Proiect Licenta\data pipeline\data processing workflow\data processing session\extracted_ratings'

    ratings_file = open(ratings_path_file,'r')


    for line in ratings_file:
        last_line = line
    last_line = last_line.split(" ")
    new_user_id = int(last_line[0]) + 1
    print(new_user_id)
    ratings_file.close()
    ratings_file = open(ratings_path_file,'a')

    for result in baseline_results:
        #ratings_file.write()
        res_id = result['id']
        score = result['user_score']
        ratings_file.write(f'{new_user_id} {res_id} {score}\n')
        #print(f'{new_user_id} {res_id} {score}\n')
    ratings_file.close()
    return new_user_id
user_id = add_new_user_ratings(baseline_results)

295


In [11]:
def compute_matrix_factorization():
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col 
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.sql import Row
    import os
    import sys

    # os.environ['PYSPARK_PYTHON'] = sys.executable
    # os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

    import data_processing
    import findspark

    findspark.init()

    path = r'C:\Users\A&A\Downloads\Date Personale Laptop Nagarro\Proiect Licenta\data pipeline\data processing workflow\data processing session\extracted_ratings'

    spark = SparkSession.builder.appName("Collaborative Filtering Flow").getOrCreate()


    lines = spark.read.text(path).rdd

    parts = lines.map(lambda row: row.value.split(" "))
    ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), postId=int(p[1]),
                                        rating=float(p[2]) ))


    ratings = spark.createDataFrame(ratingsRDD)
    (training, test) = ratings, ratings
    # # Build the recommendation model using ALS on the training data
    # # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="postId", ratingCol="rating",
            coldStartStrategy="drop")
    model = als.fit(training)

    # # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))
    return (ratings,model,rmse)

ratingsPysparkDf,model,rmse = compute_matrix_factorization()

Root-mean-square error = 0.018861102460057386


## Make recommendations for the new user based on matrix factorization

### Extract ids of already rated posts by user

In [12]:
def extract_posts_already_rated(ratings,user_id = 294):
    from pyspark.sql.functions import col 
    user_ratings = ratings.filter(col("userId") == 294).select("postId").distinct()
    posts_rated_by_user = set()

    for post_id in user_ratings.toPandas().values:
        posts_rated_by_user.add(post_id[0])

    #print(posts_rated_by_user)
    return posts_rated_by_user
current_user_already_rated_posts = extract_posts_already_rated(ratingsPysparkDf)

In [13]:
user_recommendations = model.recommendForAllUsers(3)

In [14]:
#This posts have already been rated by the user

user_recommendations.filter(user_recommendations.userId==294).show(truncate=False)

+------+---------------------------------------------------------+
|userId|recommendations                                          |
+------+---------------------------------------------------------+
|294   |[{1972, 1.3016436}, {1279, 1.1852648}, {2322, 1.0094897}]|
+------+---------------------------------------------------------+



In [13]:
from pyspark.sql.functions import col,lit

def generate_matrix_factorization_recs(ratingsPysparkDf,user_id = 294,number_recs=15,similar=True):
    
    # Generate recommendations for a specific user
    user_ratings = ratingsPysparkDf.filter(col("userId") == user_id).select("postId").distinct()
    #user_ratings.show()
    all_posts = ratingsPysparkDf.select("postId").distinct()
    posts_not_rated_by_user = all_posts.join(user_ratings, on="postId", how="left_anti")

    #posts_not_rated_by_user.show()

    # Recommend top number_recs posts
    posts_not_rated_by_user = posts_not_rated_by_user.withColumn("userId", lit(user_id))
    recommendations = model.transform(posts_not_rated_by_user)
    if similar:
        top_recommendations = recommendations.orderBy(col("prediction").desc()).select("postId", "prediction").limit(number_recs)
    else:
        top_recommendations = recommendations.orderBy(col("prediction")).select("postId", "prediction").limit(number_recs)

    #top_recommendations.show()


    #extract them in a list
    postIds = [p[0] for p in top_recommendations.select("postId").toPandas().values.tolist()]
    predictions = [p[0] for p in top_recommendations.select("prediction").toPandas().values.tolist()]

    #print(postIds)
    #print(predictions)

    matrix_factorization_predictions = []
    for index in range(len(postIds)):
        matrix_factorization_predictions.append({'post_id':postIds[index],'prediction':predictions[index]})
    print(matrix_factorization_predictions)
    return matrix_factorization_predictions

matrix_factorization_predictions = generate_matrix_factorization_recs(ratingsPysparkDf,user_id = 294,similar=True)


[{'post_id': 1190, 'prediction': 0.7996417880058289}, {'post_id': 268, 'prediction': 0.7205122709274292}, {'post_id': 2570, 'prediction': 0.6130921244621277}, {'post_id': 3040, 'prediction': 0.6093191504478455}, {'post_id': 2992, 'prediction': 0.5738112330436707}, {'post_id': 47, 'prediction': 0.5597562193870544}, {'post_id': 2904, 'prediction': 0.5365568399429321}, {'post_id': 3148, 'prediction': 0.5309612154960632}, {'post_id': 2915, 'prediction': 0.5214865207672119}, {'post_id': 946, 'prediction': 0.5204827189445496}, {'post_id': 607, 'prediction': 0.52031010389328}, {'post_id': 1942, 'prediction': 0.4960876703262329}, {'post_id': 138, 'prediction': 0.4893319010734558}, {'post_id': 1152, 'prediction': 0.48891910910606384}, {'post_id': 2346, 'prediction': 0.477201908826828}]


In [13]:
#after rating
# for rec in matrix_factorization_predictions:
#     current_user_already_rated_posts.add(rec['post_id'])

## Make recommendations for the new user based on similar and opposite users

### Make recommendations based on similar users


### Create similarity matrix of users

In [14]:
def compute_user_similarity_matrix(model):

    from pyspark.sql import SparkSession
    from pyspark.ml.recommendation import ALS
    from pyspark.sql import Row
    import pandas as pd
    from sklearn.metrics.pairwise import cosine_similarity


    # Extract user factors
    user_factors = model.userFactors.orderBy('id')

    # Convert to Pandas DataFrame
    user_factors_pd = user_factors.toPandas()

    # Create a matrix of user factors
    user_ids = user_factors_pd['id'].values
    user_features = pd.DataFrame(user_factors_pd['features'].tolist(), index=user_ids)

    # Compute cosine similarity
    similarity_matrix = cosine_similarity(user_features)

    # Convert the similarity matrix to a DataFrame for easier interpretation
    user_similarity_df = pd.DataFrame(similarity_matrix, index=user_ids, columns=user_ids)

    print(user_similarity_df)
    return user_similarity_df

user_similarity_df = compute_user_similarity_matrix(model)

          2         3         4         5         6         7         8    \
2    1.000000  0.426558 -0.353396 -0.517619  0.293009 -0.233070 -0.428046   
3    0.426558  1.000000 -0.437803 -0.665493  0.367526  0.224347  0.009407   
4   -0.353396 -0.437803  1.000000 -0.002396  0.264755  0.524021  0.389853   
5   -0.517619 -0.665493 -0.002396  1.000000 -0.637277 -0.609434  0.055064   
6    0.293009  0.367526  0.264755 -0.637277  1.000000  0.390253  0.217179   
..        ...       ...       ...       ...       ...       ...       ...   
291  0.215823  0.287657  0.012634 -0.101011 -0.006236 -0.088646  0.183040   
292 -0.015122  0.421971 -0.176516 -0.219887  0.561426  0.066851  0.429446   
293  0.131947  0.273390 -0.558932 -0.109349 -0.180771 -0.269134  0.035844   
294 -0.200416  0.005570 -0.460710  0.399972 -0.267628 -0.225177  0.157379   
295 -0.200416  0.005570 -0.460710  0.399972 -0.267628 -0.225177  0.157379   

          9         10        11   ...       286       287       288  \
2  

In [15]:
#example how to access the similarity between user with itself for  id 2 
user_similarity_df.iloc[2][4]


1.0

In [16]:
user_similarity_df.iloc[292][294]

1.0000000000000002

### Extract similar and opposite users

In [17]:
def extract_similar_and_opposite_users(user_similarity_df,interogated_user_id=294):
    import numpy as np

    #Note1: if you want to check if user ids are good, 
    #replace [-4:-1] with [-4:] and see if last user has a similarity score of aprox 1

    #Note2: -2 and +2 comes from the fact that we use numpy methods for a pandas dataframe, 
    # dataframe row index starts from 0; column index from 2

    user_similarity_scores_df =user_similarity_df.iloc[interogated_user_id-2]
    similar_users = []

    print(f'Similar users of user with id: {interogated_user_id}')

    for id in np.argsort(user_similarity_scores_df)[-4:-1]:
        user_id = id+2 #colum indexes start from 2 because lowest user_id value is 2
        similarity_score = user_similarity_scores_df[user_id]

        similar_users.append((user_id,similarity_score))
        try:
            print(f'user id: {user_id},similarity score: {similarity_score}')
        except:
            print(f'index error is {user_id} ')
    
    #reverse the list so that most similar user recs come first
    similar_users.reverse()

    print('-'*100)

    opposite_users = []

    print(f'Opposite users of user with id: {interogated_user_id}')
    for id in np.argsort(user_similarity_scores_df)[:3]:
        user_id = id+2
        similarity_score = user_similarity_scores_df[user_id]
        try:
        
            opposite_users.append((user_id,similarity_score))
            print(f'user id: {user_id},similarity score: {similarity_score}')
        except:
            print(f'index error is {user_id} ')
    print('-'*200)
    print(similar_users)
    print(opposite_users)

    return (similar_users,opposite_users)

similar_users,opposite_users = extract_similar_and_opposite_users(user_similarity_df)

Similar users of user with id: 294
user id: 37,similarity score: 0.7810509035373925
user id: 97,similarity score: 0.7860258314912422
user id: 294,similarity score: 1.0000000000000002
----------------------------------------------------------------------------------------------------
Opposite users of user with id: 294
user id: 44,similarity score: -0.778018459774237
user id: 242,similarity score: -0.759221556823808
user id: 226,similarity score: -0.7421612527584794
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[(294, 1.0000000000000002), (97, 0.7860258314912422), (37, 0.7810509035373925)]
[(44, -0.778018459774237), (242, -0.759221556823808), (226, -0.7421612527584794)]


### Extract and filter recommendations based on similar and opposite users 

Similar users flow

In [18]:
def generate_sim_recs(ratings_file, baseline_users,current_user_already_rated_posts,num_recs = 30,similar=True):
    num_users = len(baseline_users)
    #print(num_users)
    for index in range(num_users):
        sim_user_id,similarity_score = baseline_users[index]

        print(f'user_id = {sim_user_id}')
        print(f'similarity_score ={similarity_score}')

        sim_user_recs = generate_matrix_factorization_recs(ratings_file,user_id=sim_user_id,number_recs=num_recs,similar=similar)
        current_user_recs = []
        for sim_user_rec_struct in sim_user_recs:

            if sim_user_rec_struct['post_id'] in current_user_already_rated_posts:
                #print('found already rated post')
                continue
            else:
                current_user_rec = {'post_id':sim_user_rec_struct['post_id'],
                                    'prediction':sim_user_rec_struct['prediction'] * similarity_score,
                                    'baseline_user_id':sim_user_id,
                                    'similarity_score': similarity_score,
                                    'baseline_user_prediction':sim_user_rec_struct['prediction'] }

                #print(f'current user recommendation :{current_user_rec}')

                current_user_recs.append(current_user_rec)
    return current_user_recs
similar_users_recs = generate_sim_recs(ratings_file=ratingsPysparkDf,baseline_users=similar_users,current_user_already_rated_posts=current_user_already_rated_posts)

user_id = 294
similarity_score =1.0000000000000002
[{'post_id': 1190, 'prediction': 0.7996417880058289}, {'post_id': 268, 'prediction': 0.7205122709274292}, {'post_id': 2570, 'prediction': 0.6130921244621277}, {'post_id': 3040, 'prediction': 0.6093191504478455}, {'post_id': 2992, 'prediction': 0.5738112330436707}, {'post_id': 47, 'prediction': 0.5597562193870544}, {'post_id': 2904, 'prediction': 0.5365568399429321}, {'post_id': 3148, 'prediction': 0.5309612154960632}, {'post_id': 2915, 'prediction': 0.5214865207672119}, {'post_id': 946, 'prediction': 0.5204827189445496}, {'post_id': 607, 'prediction': 0.52031010389328}, {'post_id': 1942, 'prediction': 0.4960876703262329}, {'post_id': 138, 'prediction': 0.4893319010734558}, {'post_id': 1152, 'prediction': 0.48891910910606384}, {'post_id': 2346, 'prediction': 0.477201908826828}, {'post_id': 1222, 'prediction': 0.4736446440219879}, {'post_id': 72, 'prediction': 0.466156542301178}, {'post_id': 1700, 'prediction': 0.4648670554161072}, {'pos

Opposite users flow

In [19]:
opposite_users_recs = generate_sim_recs(ratings_file=ratingsPysparkDf,baseline_users=opposite_users,current_user_already_rated_posts=current_user_already_rated_posts,similar=False)

user_id = 44
similarity_score =-0.778018459774237
[{'post_id': 2427, 'prediction': -1.3995832204818726}, {'post_id': 1320, 'prediction': -1.2892322540283203}, {'post_id': 1909, 'prediction': -1.2672719955444336}, {'post_id': 344, 'prediction': -1.1839497089385986}, {'post_id': 1971, 'prediction': -1.1302331686019897}, {'post_id': 2493, 'prediction': -1.1251966953277588}, {'post_id': 268, 'prediction': -1.0672991275787354}, {'post_id': 3040, 'prediction': -1.0038970708847046}, {'post_id': 1088, 'prediction': -0.9969940781593323}, {'post_id': 2322, 'prediction': -0.8950653076171875}, {'post_id': 2904, 'prediction': -0.8632669448852539}, {'post_id': 3102, 'prediction': -0.8436636924743652}, {'post_id': 2043, 'prediction': -0.8325833678245544}, {'post_id': 262, 'prediction': -0.8186514377593994}, {'post_id': 2992, 'prediction': -0.7720634937286377}, {'post_id': 2925, 'prediction': -0.7716626524925232}, {'post_id': 607, 'prediction': -0.7671437859535217}, {'post_id': 1859, 'prediction': -0.

Testing on real users logic 

matrix factorization

In [20]:
for rec in matrix_factorization_predictions:
    print(rec)
    break
for rec in similar_users_recs:
    print(rec)
    break
for rec in opposite_users_recs:
    print(rec)
    break

def test_matrix_factorization(recs=matrix_factorization_predictions,tolerance_limit=5):
    import data_processing
    num_rated_posts = 0
    matrix_factorization_results = {}
    for data in recs:
        id = data['post_id']
        res =data_processing.get_resource_info(id)
        print(res['title'])
        print(res['url'])
        is_valid = input('Does the url provide enough info for you to understand the basic ideas expressed in this post?(Y/N):')
        if is_valid.lower() == 'y':
            print("Valid input")
            rec_score = float(input('How much did this recommended post make you see things trough a new perspective?:'))
            matrix_factorization_results[id]={'user_score': rec_score, 'predicted_score':data['prediction'],'validity':True}
            num_rated_posts +=1
        else:
            matrix_factorization_results[id]={'validity':False, 'predicted_score':data['prediction']}

        current_user_already_rated_posts.add(id)
        if num_rated_posts == tolerance_limit:
            return matrix_factorization_results
    return matrix_factorization_results


matrix_factorization_results = test_matrix_factorization(matrix_factorization_predictions) 

{'post_id': 1190, 'prediction': 0.7996417880058289}
{'post_id': 1190, 'prediction': 0.7017064735623817, 'baseline_user_id': 37, 'similarity_score': 0.7810509035373925, 'baseline_user_prediction': 0.8984132409095764}
{'post_id': 268, 'prediction': 0.7620485471941373, 'baseline_user_id': 226, 'similarity_score': -0.7421612527584794, 'baseline_user_prediction': -1.0267964601516724}
Necromancy within D&amp;amp;D isn't evil
https://www.reddit.com//r/changemyview/comments/cl6iby/cmv_necromancy_within_dd_isnt_evil/
Valid input
Biological sex is not a social construct.
https://www.reddit.com/r/changemyview/comments/82xw1t/cmv_biological_sex_is_not_a_social_construct/
Valid input
There are no negative effects because of the overall decline of spirituality...
https://www.reddit.com//r/changemyview/comments/o0hbvq/cmv_there_are_no_negative_effects_because_of_the/
Valid input
Puberty blocks and gender reassignment surgery should not be given to kids u...
https://www.reddit.com//r/changemyview/comm

In [21]:
len(current_user_already_rated_posts)

15

similar users

In [22]:
import data_processing
def test_user_similarity_recommendations(baseline_users_recs,tolerance_limit =5):
    num_rated_posts = 0
    baseline_user_recs_results = {}
    for data in baseline_users_recs:
        res_id = data['post_id']
        if res_id in current_user_already_rated_posts:
            print('found already recommended post')
            continue
        
        res = data_processing.get_resource_info(res_id)
        baseline_user_id = data['baseline_user_id']
        print(res['title'])
        print(res['url'])
        is_valid = input('Does the url provide enough info for you to understand the basic ideas expressed in this post?(Y/N):')
        if is_valid.lower() == 'y':
            print("Valid input")
            rec_score = float(input('How much did this recommended post make you see things trough a new perspective?:'))
            baseline_user_recs_results[baseline_user_id]={'user_score': rec_score, 'rec_info':data,'validity':True}
            num_rated_posts +=1
        else:
            baseline_user_recs_results[baseline_user_id]={'validity':False, 'rec_info':data}

        current_user_already_rated_posts.add(res_id)
        if num_rated_posts==tolerance_limit:
            return baseline_user_recs_results
    return baseline_user_recs_results


similar_user_recs_results = test_user_similarity_recommendations(similar_users_recs)

found already recommended post
The police should use a hoplite phalanx formation to better deal with large ...
https://www.reddit.com//r/changemyview/comments/jex8u5/cmv_the_police_should_use_a_hoplite_phalanx/
Valid input
found already recommended post
Criticizing the Chinese government does not make you Sinophobic, Criticizing...
https://www.reddit.com//r/changemyview/comments/loakbf/cmv_criticizing_the_chinese_government_does_not/
Valid input
having a messy room serves as a practice for walking through debris in emergency
https://www.reddit.com/r/changemyview/comments/8g9eth/cmv_having_a_messy_room_serves_as_a_practice_for/
Valid input
found already recommended post
Germany would be better off today if they won WW2
https://www.reddit.com//r/changemyview/comments/an7dxl/cmv_germany_would_be_better_off_today_if_they_won/
Valid input
All content creators on the internet earn way too much money
https://www.reddit.com//r/changemyview/comments/m1ao10/cmv_all_content_creators_on_the_intern

In [23]:
len(current_user_already_rated_posts)

20

Opposite users

In [24]:
opposite_user_recs_results = test_user_similarity_recommendations(opposite_users_recs)

found already recommended post
Eating dogs is NOT more immoral than eating conventional meats.
https://www.reddit.com//r/changemyview/comments/fw1uxi/cmv_eating_dogs_is_not_more_immoral_than_eating/
Valid input
The “my body, my choice” slogan for pro-choice advocates does not benefit th...
https://www.reddit.com//r/changemyview/comments/jb2gut/cmv_the_my_body_my_choice_slogan_for_prochoice/
Valid input
found already recommended post
Online dating has made finding a relationship impossible for all but the top...
https://www.reddit.com//r/changemyview/comments/tvnps6/cmv_online_dating_has_made_finding_a_relationship/
Valid input
The US legislative and executive branches should be replaced by a randomly s...
https://www.reddit.com//r/changemyview/comments/cuaj7z/cmv_the_us_legislative_and_executive_branches/
Valid input
California bill SB-827 would solve the state's housing crisis by allowing new housing to be built near transit hubs without local height, size, and design restrictions.
ht

In [25]:
len(current_user_already_rated_posts)

25

saving results


In [26]:
import json

print(opposite_user_recs_results)
print(similar_user_recs_results)
print(matrix_factorization_results)
print(name)




{226: {'user_score': 1.0, 'rec_info': {'post_id': 264, 'prediction': 0.4841961438596068, 'baseline_user_id': 226, 'similarity_score': -0.7421612527584794, 'baseline_user_prediction': -0.6524136662483215}, 'validity': True}}
{37: {'user_score': 1.0, 'rec_info': {'post_id': 2236, 'prediction': 0.48958630986930535, 'baseline_user_id': 37, 'similarity_score': 0.7810509035373925, 'baseline_user_prediction': 0.6268302202224731}, 'validity': True}}
{1190: {'user_score': 1.0, 'predicted_score': 0.7996417880058289, 'validity': True}, 268: {'user_score': 1.0, 'predicted_score': 0.7205122709274292, 'validity': True}, 2570: {'user_score': 1.0, 'predicted_score': 0.6130921244621277, 'validity': True}, 3040: {'user_score': 1.0, 'predicted_score': 0.6093191504478455, 'validity': True}, 2992: {'user_score': 1.0, 'predicted_score': 0.5738112330436707, 'validity': True}}
tes


In [27]:
import os 

RESULTS_FOLDER_LOCATION = "collaborative_filtering_results"
results_file_location = os.path.join(RESULTS_FOLDER_LOCATION,name+'_collaborative_filtering.json')

results_file = open(results_file_location,'w')

final_user_result={
    'name': name,
    'id':user_id,
    'matrix_factorization':matrix_factorization_results,
    'similar_users':similar_user_recs_results,
    'opposite_users':opposite_user_recs_results
}
json.dump(final_user_result,results_file)


loading results

In [28]:
results_file = open(results_file_location,'r')

print(json.load(results_file))

{'name': 'tes', 'id': 295, 'matrix_factorization': {'1190': {'user_score': 1.0, 'predicted_score': 0.7996417880058289, 'validity': True}, '268': {'user_score': 1.0, 'predicted_score': 0.7205122709274292, 'validity': True}, '2570': {'user_score': 1.0, 'predicted_score': 0.6130921244621277, 'validity': True}, '3040': {'user_score': 1.0, 'predicted_score': 0.6093191504478455, 'validity': True}, '2992': {'user_score': 1.0, 'predicted_score': 0.5738112330436707, 'validity': True}}, 'similar_users': {'37': {'user_score': 1.0, 'rec_info': {'post_id': 2236, 'prediction': 0.48958630986930535, 'baseline_user_id': 37, 'similarity_score': 0.7810509035373925, 'baseline_user_prediction': 0.6268302202224731}, 'validity': True}}, 'opposite_users': {'226': {'user_score': 1.0, 'rec_info': {'post_id': 264, 'prediction': 0.4841961438596068, 'baseline_user_id': 226, 'similarity_score': -0.7421612527584794, 'baseline_user_prediction': -0.6524136662483215}, 'validity': True}}}
