In [1]:
from cdhf.data import Data

ModuleNotFoundError: No module named 'cdhf'

In [None]:
#Build the dataset wrapper around the JSON export and call load_all()
#(presumably this populates data.teams, data.channel_members, ... - confirm against cdhf).
data = Data("../input/mmdata.json")
data.load_all()

#Report how many teams the loaded dataset holds.
team_count = len(data.teams)
print(team_count)

## Import Necessary Libraries 

In [None]:
import pandas as pd
import numpy as np

### Convert Channel Members to Pandas DataFrame


In [None]:
#One row per channel-member object: the object's attributes (via vars) become the columns.
membership_records = [vars(member) for member in data.channel_members]
df = pd.DataFrame.from_records(membership_records)

#Index every row by a "<channel_id>-<user_id>" key.
df["index"] = df["channel_id"] + "-" + df["user_id"]
df = df.set_index("index")

### Remove channels with 3 or fewer users

In [None]:
#Count the non-null values of every column per channel; the user_id count is the
#number of membership rows the channel has.
df_grouped_users = df.groupby(["channel_id"]).count()

#Keep channels with strictly more than 3 members (channels with exactly 3 are dropped too).
allowed_channels = df_grouped_users[df_grouped_users["user_id"] > 3].index.array

#.copy() makes the filtered frame an independent object, so adding columns to it
#later does not trigger pandas' SettingWithCopyWarning.
df = df[df["channel_id"].isin(allowed_channels)].copy()

### Scaling message count to be a value between 1 and 5

In [None]:
from sklearn.preprocessing import MinMaxScaler

#Rescale the message counts linearly into the range [1, 5].
scaler = MinMaxScaler(feature_range=(1, 5))
df_scale = df["msg_count"].to_frame()
arr_scaled = scaler.fit_transform(df_scale)

#Wrap the scaled array in a frame that shares df's index so the assignment below lines up.
df_scaled = pd.DataFrame(arr_scaled, index=df.index, columns=["msg_count"])
df["rating_scaled"] = df_scaled

#Every row also gets a constant rating of 1.0 in its own column.
df["rating"] = 1.0

In [None]:
#Drop the activity counters and the scaled rating; all remaining columns are kept.
unused_columns = ["msg_count", "mention_count", "rating_scaled"]
ratings = df.drop(columns=unused_columns)

## Implicit Framework Example

Collaborative Filtering Article --> http://yifanhu.net/PUB/cf.pdf

Framework Documentation --> https://benfred.github.io/implicit/tutorial_lastfm.html

In [None]:
import implicit
from implicit.datasets.lastfm import get_lastfm

#get_lastfm() hands back three objects; only the third one is inspected here.
lastfm_parts = get_lastfm()
artists, users, artist_user_plays = lastfm_parts

#Show its .data attribute (presumably the stored play counts of a sparse matrix - confirm).
print(artist_user_plays.data)

## Splitting data into training and test data

In [None]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#X is a copy of the full ratings frame; y is its user_id column.
X = ratings.copy()
y = ratings["user_id"]
print(X.shape)
print(len(y))

#Random 75/25 train/test split with a fixed seed. No stratify= argument is passed,
#so the split is NOT stratified by user_id.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Evaluation

In [None]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root mean squared error (RMSE) between y_true and y_pred.

    Both arguments are handed unchanged to mean_squared_error; the square root
    of its result is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
#Define the baseline model to always return 0.0.

def baseline(user_id, movie_id):
    """Return the constant prediction 0.0, ignoring both arguments.

    The second parameter is named movie_id, but score() passes a channel id
    in that position.
    """
    return 0.0

In [None]:
def score(cf_model):
    """Return the RMSE of cf_model on the test split X_test.

    cf_model is called as cf_model(user_id, channel_id) once per row of X_test
    and must return a numeric prediction. NaN predictions are counted as 0.
    The predictions are compared with the 'rating' column of X_test.
    """
    test_users = X_test['user_id']
    test_channels = X_test['channel_id']

    #One prediction per (user, channel) pair, in test-set row order
    predictions = []
    for user, channel in zip(test_users, test_channels):
        predictions.append(cf_model(user, channel))
    y_pred = np.array(predictions)

    #A model may return NaN when it has nothing to go on; treat that as 0
    y_pred = np.where(np.isnan(y_pred), 0, y_pred)

    #Actual ratings of the same rows
    y_true = X_test['rating'].to_numpy()

    return rmse(y_true, y_pred)

### Score Baseline

In [None]:
#NOTE(review): hard-coded literal, presumably an RMSE copied from an earlier run - it is
#not computed from the current data; confirm it is still meaningful or remove it.
print(1.9969011004768398)
#Evaluate the constant baseline model on the test split.
score(baseline)

### User-based collaborative filtering

In [None]:
#Pivot the training rows into a user x channel matrix of ratings: users are the rows,
#channels the columns, and pairs absent from the training data are left as NaN.
r_matrix = X_train.pivot_table(
    index="user_id",
    columns="channel_id",
    values="rating",
)
r_matrix.head()

#### Simple collaborative filter - Mean

This simply takes in a user_id and a channel_id and outputs the mean rating given to the channel by all the users who have rated it. No distinction is made between the users; in other words, the rating of each user is assigned equal weight.

In [None]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, channel_id):
    """Predict a rating as the plain mean of the training ratings of channel_id.

    user_id is accepted so the function can be passed to score(), but it is not
    used: every user's rating carries the same weight. Returns 0.0 when
    channel_id has no column in r_matrix.
    """
    #A channel without a column in r_matrix has no ratings to average
    if channel_id not in r_matrix:
        return 0.0

    #Mean of all the "ratings" given to the channel
    return r_matrix[channel_id].mean()
    

#Compute the RMSE of the unweighted-mean model (cf_user_mean) on the test split
score(cf_user_mean)

#### Simple collaborative filter -  Weighted mean

In [None]:
#Dummy ratings matrix: same shape as r_matrix with every missing rating imputed to 0.
#fillna returns a new frame, so r_matrix itself keeps its NaN entries.
r_matrix_dummy = r_matrix.fillna(0)

# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Pairwise cosine similarity between the rows (users) of the dummy ratings matrix
similarity_values = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

#Label both axes with the user ids so a user's scores can be looked up by id
user_ids = r_matrix.index
cosine_sim = pd.DataFrame(similarity_values, index=user_ids, columns=user_ids)

cosine_sim.head(10)

##### User Based Collaborative Filter using Weighted Mean Ratings

In [None]:

def cf_user_wmin(user_id, channel_id):
    """Predict a rating for (user_id, channel_id) as a similarity-weighted mean.

    Despite the "wmin" in its name, this computes a weighted *mean*: the
    training ratings of channel_id (a column of r_matrix) are averaged with
    weights taken from the user's column of cosine_sim.

    Returns 0.0 when there is nothing to base a prediction on:
    - channel_id has no column in r_matrix,
    - user_id has no column in cosine_sim (this lookup used to raise KeyError
      for users that only occur in the test split), or
    - the similarity weights sum to zero (this used to be a division by zero).
    """
    #Without the channel or the user in the training data there is nothing to weight
    if channel_id not in r_matrix or user_id not in cosine_sim:
        return 0.0

    #Ratings given to the channel, restricted to the users who actually rated it
    m_ratings = r_matrix[channel_id].dropna()

    #Similarity of the target user to exactly those raters, in the same order
    sim_scores = cosine_sim[user_id].loc[m_ratings.index]

    #No usable weight at all: fall back to the default instead of dividing by zero
    total_sim = sim_scores.sum()
    if total_sim == 0:
        return 0.0

    #Compute the final weighted mean
    return np.dot(sim_scores, m_ratings) / total_sim

#Compute the RMSE of the similarity-weighted mean model (cf_user_wmin) on the test split
score(cf_user_wmin)
