# Graph implementation of the courses

In [1]:
import numpy as np
import pandas as pd

import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from scipy.stats import pearsonr

In [2]:
# Load the dataset
df=pd.read_csv('data/Coursera_reviews.csv')

## Meta-Data

In [3]:
# Print out Basic Meta-Data about the dataset  

# Print the number of rows and columns
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

# Print the column names
print("Column names:", df.columns.tolist())

# Print the data types of each column
print("Data types:")
print(df.dtypes)

# Print the summary statistics of numerical columns
print("Summary statistics:")
print(df.describe())

# Print the first few rows of the DataFrame
df.head()

Number of rows: 1454711
Number of columns: 5
Column names: ['reviews', 'reviewers', 'date_reviews', 'rating', 'course_id']
Data types:
reviews         object
reviewers       object
date_reviews    object
rating           int64
course_id       object
dtype: object
Summary statistics:
             rating
count  1.454711e+06
mean   4.696649e+00
std    6.983271e-01
min    1.000000e+00
25%    5.000000e+00
50%    5.000000e+00
75%    5.000000e+00
max    5.000000e+00


Unnamed: 0,reviews,reviewers,date_reviews,rating,course_id
0,"Pretty dry, but I was able to pass with just t...",By Robert S,"Feb 12, 2020",4,google-cbrs-cpi-training
1,would be a better experience if the video and ...,By Gabriel E R,"Sep 28, 2020",4,google-cbrs-cpi-training
2,Information was perfect! The program itself wa...,By Jacob D,"Apr 08, 2020",4,google-cbrs-cpi-training
3,A few grammatical mistakes on test made me do ...,By Dale B,"Feb 24, 2020",4,google-cbrs-cpi-training
4,Excellent course and the training provided was...,By Sean G,"Jun 18, 2020",4,google-cbrs-cpi-training


remove the "By" in the reviewers name

In [4]:
df['reviewers'] = df['reviewers'].str[3:]

## Pre-processing

In [5]:
print(len(df['reviewers'].unique()))

287808


Only keep a certain amount of students

In [6]:
# Determine the filtering criteria and desired number of users
desired_user_count = 10000

# Retrieve user information and reviews from the database
desired_users = df['reviewers'].unique()[:desired_user_count]

# Keep the selected users along with their reviews
filtered_reviews = df[df['reviewers'].isin(desired_users)]
filtered_reviews.reset_index(inplace=True,drop=True)

# Print the filtered reviews
print(filtered_reviews)

                                                  reviews     reviewers   
0       Pretty dry, but I was able to pass with just t...      Robert S  \
1       would be a better experience if the video and ...   Gabriel E R   
2       Information was perfect! The program itself wa...       Jacob D   
3       A few grammatical mistakes on test made me do ...        Dale B   
4       Excellent course and the training provided was...        Sean G   
...                                                   ...           ...   
149903                                   excellent course  PRASHANT K G   
149904                                               Nice      Daniel V   
149905                                               Nice      Daniel V   
149906                                                fun       Felix D   
149907                                                fun       Felix D   

        date_reviews  rating                 course_id  
0       Feb 12, 2020       4  google-cbrs-

Add the sentiment analysis to each review

In [14]:
filtered_reviews = filtered_reviews.dropna(subset=['reviews'])
print(filtered_reviews['reviews'].dtype)
print(filtered_reviews['reviews'].apply(type).value_counts())
filtered_reviews['reviews'] = filtered_reviews['reviews'].astype(str)
print(filtered_reviews['reviews'].dtype)
filtered_reviews = filtered_reviews.dropna(subset=['reviews'])

object
reviews
<class 'str'>    149902
Name: count, dtype: int64
object


In [15]:
from sentiment_analysis import Sentiment_model

filtered_reviews['positive'] = 0.0
filtered_reviews['neutral'] = 0.0
filtered_reviews['negative'] = 0.0

sentiment_model = Sentiment_model()
filtered_reviews = filtered_reviews.apply(sentiment_model.apply_review_sentiment, axis=1)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Cluster students by common courses to optimize the code

In [16]:
# Create sets for each unique course containing users who reviewed the course

course_user_sets = {}
for course in filtered_reviews['course_id'].unique():
    users = filtered_reviews.loc[filtered_reviews['course_id'] == course, 'reviewers'].tolist()
    course_user_sets[course] = set(users)

all_courses = filtered_reviews['course_id'].unique()
all_users = filtered_reviews['reviewers'].unique()

In [17]:
# Remove user pairs occurring in one set from other sets
for course1, users1 in course_user_sets.items():
    for course2, users2 in course_user_sets.items():
        if course1 != course2:
            course_user_sets[course2] -= users1

In [18]:
# Using a list comprehension to make a list of the keys to be deleted
# (keys having value in 3.)
delete = [key for key in course_user_sets if len(course_user_sets[key])==0]
 
# delete the key/s
for key in delete:
    del course_user_sets[key]

In case a user has several reviews per course, only keep the last review

In [19]:
# order by date_reviews to keep most recent duplicated review
filtered_reviews = filtered_reviews.sort_values(by='date_reviews')

# delete duplicate reviews from same user for the same course and keep the most recent one
duplicate_rows = filtered_reviews[filtered_reviews.duplicated(['reviewers', 'course_id'], keep=False)]

filtered_reviews = filtered_reviews.drop_duplicates(['reviewers', 'course_id'],keep='last')
filtered_reviews.reset_index(drop=True,inplace=True)
filtered_reviews.head()

Unnamed: 0,reviews,reviewers,date_reviews,rating,course_id,positive,neutral,negative
0,The part on predictive analytics and the case ...,Roberto C,"Apr 01, 2016",3,wharton-customer-analytics,0.194603,0.402684,0.402713
1,"Great Course, looking forward to the rest of t...",Omar K,"Apr 01, 2016",4,introduction-to-software-product-management,0.977196,0.018081,0.004723
2,I struggled with some of the instruction as I ...,Janny v R J,"Apr 01, 2016",3,python,0.009783,0.173361,0.816856
3,Concepts are explained in clear and efficient ...,Shivam S,"Apr 01, 2017",4,principles-of-macroeconomics,0.346211,0.598413,0.055376
4,Very informative. I like the additional resour...,Sharon D,"Apr 01, 2017",5,childrens-rights,0.978591,0.018083,0.003326


## Graph creation

In [None]:
# Initialize an empty graph with user nodes
graph = nx.Graph()

for i in range(len(all_users)):
    user_id = i
    user_name = all_users[i]
    courses = filtered_reviews['course_id'].loc[filtered_reviews['reviewers']==user_name]
    
    # ratings = filtered_reviews['rating'].loc[(filtered_reviews['reviewers'] == user_name) & (filtered_reviews['course_id'].isin(courses))].tolist()
    # positive_sentiments = filtered_reviews['positive'].loc[(filtered_reviews['reviewers'] == user_name) & (filtered_reviews['course_id'].isin(courses))].tolist()
    # neutral_sentiments = filtered_reviews['neutral'].loc[(filtered_reviews['reviewers'] == user_name) & (filtered_reviews['course_id'].isin(courses))].tolist()
    # negative_sentiments = filtered_reviews['negative'].loc[(filtered_reviews['reviewers'] == user_name) & (filtered_reviews['course_id'].isin(courses))].tolist()   
    
    ratings = []
    positive_sentiments = []
    neutral_sentiments = []
    negative_sentiments = []
    
    for course in courses:
        rating = filtered_reviews['rating'].loc[(filtered_reviews['reviewers'] == user_name) & (filtered_reviews['course_id'] == course)].tolist()
        positive_sentiment = filtered_reviews['positive'].loc[(filtered_reviews['reviewers'] == user_name) & (filtered_reviews['course_id'] == course)].tolist()
        neutral_sentiment = filtered_reviews['neutral'].loc[(filtered_reviews['reviewers'] == user_name) & (filtered_reviews['course_id'] == course)].tolist()
        negative_sentiment = filtered_reviews['negative'].loc[(filtered_reviews['reviewers'] == user_name) & (filtered_reviews['course_id'] == course)].tolist()

        if rating:
            ratings.append(rating[0])
        if positive_sentiment:
            positive_sentiments.append(positive_sentiment[0])
        if neutral_sentiment:
            neutral_sentiments.append(neutral_sentiment[0])
        if negative_sentiment:
            negative_sentiments.append(negative_sentiment[0])

    graph.add_node(user_name,ratings = ratings, courses=courses,positive_sentiments=positive_sentiments,
                   neutral_sentiments=neutral_sentiments,negative_sentiments=negative_sentiments)

threshold_rating = 0.05
threshold_positive = 0.05
threshold_neutral = 0.05
threshold_negative = 0.05
# Set the minimum number of threshold crossings required
min_threshold_crossings = 3

similarity_thresholds = [threshold_rating,
                         threshold_positive,
                         threshold_neutral,
                         threshold_negative]

# Calculate similarity matrix and add edges to the graph
for course in course_user_sets.items():
    
    course_name = course[0]
    users = list(course[1])
    
    df_course = filtered_reviews[filtered_reviews['reviewers'].isin(users)]
    
    rating_pivot_table = pd.pivot_table(df_course, values='rating', index='reviewers', columns='course_id', fill_value=0)
    positive_pivot_table = pd.pivot_table(df_course, values='positive', index='reviewers', columns='course_id', fill_value=0)
    negative_pivot_table = pd.pivot_table(df_course, values='negative', index='reviewers', columns='course_id', fill_value=0)
    neutral_pivot_table = pd.pivot_table(df_course, values='neutral', index='reviewers', columns='course_id', fill_value=0)
    
    rating_similarity_matrix = cdist(rating_pivot_table, rating_pivot_table, metric='canberra')
    positive_similarity_matrix = cosine_similarity(positive_pivot_table)
    negative_similarity_matrix = cosine_similarity(negative_pivot_table)
    neutral_similarity_matrix = cosine_similarity(neutral_pivot_table)
    
    users_in_this_course = list(rating_pivot_table.index)
    # Add the edges to the graph based on similarity scores
    for i in range(len(users)-1):
        for j in range(i + 1, len(users)):
            # Check if the nodes already exist in the graph
            if users[i] in graph.nodes() and users[j] in graph.nodes():
                similarity_scores = [1/(1+rating_similarity_matrix[i, j]),
                                     positive_similarity_matrix[i][j],
                                     negative_similarity_matrix[i][j],
                                     neutral_similarity_matrix[i][j]]
                weights = np.array([0.5, 0.166, 0.166, 0.166])  # assume rating similarity is twice as important
                overall_similarity = np.average(similarity_scores, weights=weights)

                # print(1/(1+rating_similarity_matrix[i, j]),
                #                      positive_similarity_matrix[i][j],
                #                      negative_similarity_matrix[i][j],
                #                      neutral_similarity_matrix[i][j])

                # Count the number of threshold crossings
                threshold_crossings = sum(score > threshold for score, threshold in zip(similarity_scores, similarity_thresholds))
                # print(threshold_crossings)
                if threshold_crossings >= min_threshold_crossings:
                    graph.add_edge(users_in_this_course[i], users_in_this_course[j], weight=overall_similarity)
                    print("Weighted overall similarity: ", overall_similarity)

In [21]:
print(graph)

def get_sparsity(G):
    num_nodes = len(G.nodes())
    num_edges = len(G.edges())
    return 1.0 - (2.0 * num_edges) / (num_nodes * (num_nodes - 1))

print("Sparsity:", get_sparsity(graph))

Graph with 10000 nodes and 12961725 edges
Sparsity: 0.7407395739573958


In [22]:
for course_node, course_attrs in graph.nodes(data=True):
    if len(course_attrs)==0:
        print(course_attrs,course_node)

In [23]:
import pickle

file_path = "graphs/user_graph.pickle"

with open(file_path, "wb") as file:
    pickle.dump(graph, file)

### Check correctness of graph

In [24]:
# Specify the node for which you want to find the neighborhood
node = 'Robert S'

print(graph.nodes[node])


{'ratings': [4, 5, 2, 5, 5, 5, 4, 3, 5, 5, 5, 5, 5, 5, 5, 2, 5, 4, 4, 4, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 4, 5, 1, 5, 5, 3, 5, 5, 5, 5], 'courses': 70                              indigenous-canada
2274                      python-operating-system
2484     wharton-introduction-spreadsheets-models
3211              python-programming-introduction
5117                               nanotechnology
6010                             machine-learning
6723              linear-algebra-machine-learning
9116                        big-data-introduction
9121                            probability-intro
10989                       programming-languages
11572                                smart-cities
11616                        duke-programming-web
11755                                      python
13088                         grammar-punctuation
13721                        quantitative-methods
14462                   python-data-visualization
17350                       

In [25]:
reviews=pd.read_csv('data/Coursera_reviews.csv')

In [26]:
print(reviews[['course_id','rating']].loc[(reviews['reviewers']=='By Robert S')].drop_duplicates(['course_id'],keep='last'))

                                        course_id  rating
68                       google-cbrs-cpi-training       5
61346                         computer-networking       4
99731                         python-crash-course       5
146632              neural-networks-deep-learning       5
166021                        what-is-datascience       5
202955     html-css-javascript-for-web-developers       5
230765                                negotiation       5
265706                      learning-how-to-learn       5
303775                                python-data       5
346606                              os-power-user       4
397584                           project-planning       5
433492        classification-vector-spaces-in-nlp       4
438562            linear-algebra-machine-learning       4
445247                       python-data-analysis       4
497017                             childnutrition       1
541210                      data-scientists-tools       4
583184        