## Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd


# lightFm imports
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# re for text cleaning
import re
from datetime import datetime, timedelta

# ignore warnings
import warnings
warnings.filterwarnings('ignore')



## Reading Dataset

In [2]:
# Folder that holds the CareerVillage CSV exports.
base_path="./careervillageDataset/"


def _load_csv(file_name, **read_kwargs):
    """Read one CSV file located under ``base_path`` into a DataFrame."""
    return pd.read_csv(base_path+file_name, **read_kwargs)


# Tables with a timestamp column get that column parsed as datetime on load.
df_answer_scores = _load_csv('answer_scores.csv')
df_answers = _load_csv('answers.csv', parse_dates=['answers_date_added'])
df_comments = _load_csv('comments.csv')
df_emails = _load_csv('emails.csv')
df_group_memberships = _load_csv('group_memberships.csv')
df_groups = _load_csv('groups.csv')
df_matches = _load_csv('matches.csv')
df_professionals = _load_csv('professionals.csv', parse_dates=['professionals_date_joined'])
df_question_scores = _load_csv('question_scores.csv')
df_questions = _load_csv('questions.csv', parse_dates=['questions_date_added'])
df_school_memberships = _load_csv('school_memberships.csv')
df_students = _load_csv('students.csv', parse_dates=['students_date_joined'])
df_tag_questions = _load_csv('tag_questions.csv')
df_tag_users = _load_csv('tag_users.csv')
df_tags = _load_csv('tags.csv')


## Defining necessary functions

In [3]:
def generate_int_id(dataframe,id_col_name):
    """
    Attach a sequential integer id column (0, 1, 2, ...) to a dataframe.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. It is not modified.
    id_col_name : String
        Name of the new integer id column.

    Returns
    -------
    Dataframe
        A copy of the dataframe with a fresh 0..n-1 index and the new
        id column. Ids follow the current row order.
    """
    # reset_index returns a new frame, so the caller's dataframe stays untouched.
    new_dataframe = dataframe.reset_index(drop=True)

    # Write directly into the requested column. Going through a fixed
    # placeholder column name and renaming it afterwards would silently
    # overwrite an input column that happens to carry the placeholder name.
    new_dataframe[id_col_name] = np.arange(len(new_dataframe))

    return new_dataframe

In [4]:
def create_features(dataframe, features_name, id_col_name):
    """
    Pair every row id with its list of feature tokens for lightfm.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe which contains the feature columns and the id column.
    features_name : List
        Names of the feature columns available in dataframe.
    id_col_name: String
        Name of the column holding the id each feature list maps to,
        e.g. questions_id_num or professionals_id_num.

    Returns
    -------
    List
        One (id, [token, token, ...]) tuple per row, in row order.
        The values of the feature columns are converted to strings,
        joined with ',' and split on ',' again, so a cell that itself
        contains commas contributes several tokens.
        Ex. -> (1, ['military', 'army', '5'])
    """
    feature_columns = dataframe[features_name]

    # one comma-separated string per row, built from all feature columns
    joined_rows = feature_columns.apply(
        lambda row: ','.join(str(value) for value in row),
        axis=1)

    # back to a list of individual tokens per row
    tokens_per_row = joined_rows.str.split(',')

    return list(zip(dataframe[id_col_name], tokens_per_row))

In [5]:
def generate_feature_list(dataframe, features_name):
    """
    Flatten the feature columns into one long sequence of feature tokens.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A.
    features_name : List
        Names of the feature columns available in dataframe.

    Returns
    -------
    Pandas Series
        Every feature token of every row, in row order, with a fresh
        0..n-1 index. Duplicates are kept.
    """
    # one comma-separated string per row, built from all feature columns
    joined_rows = dataframe[features_name].apply(
        lambda row: ','.join(row.map(str)), axis=1)
    tokens_per_row = joined_rows.str.split(',')

    # explode() emits one output row per token. The previous
    # apply(pd.Series).stack() approach first built a dense
    # rows x longest-token-list frame padded with NaN and then relied
    # on stack() dropping that padding again.
    return tokens_per_row.explode().reset_index(drop=True)

In [6]:
def calculate_auc_score(lightfm_model, interactions_matrix,
                        question_features, professional_features,
                        num_threads=4):
    """
    Measure the mean ROC AUC metric for a model.
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model: LightFM model
        A fitted lightfm model
    interactions_matrix :
        A lightfm interactions matrix to evaluate against
    question_features, professional_features:
        Lightfm item (question) and user (professional) feature matrices
    num_threads: int, optional
        Number of threads handed to lightfm's auc_score. Defaults to 4,
        the value that used to be hard-coded.

    Returns
    -------
    The mean of the scores returned by lightfm's auc_score
    """
    scores = auc_score(
        lightfm_model, interactions_matrix,
        item_features=question_features,
        user_features=professional_features,
        num_threads=num_threads)
    return scores.mean()

## Data preprocessing and Feature Creation

**Generate numeric identifiers:** The LightFM Python library works with numeric ids, but our data identifies professionals, students, questions and answers by UUID. In this step, I create a unique integer identifier for each professional, student, question and answer.

In [7]:
# give every professional, student, question and answer a sequential integer id
df_professionals = generate_int_id(
    dataframe=df_professionals, id_col_name='professionals_id_num')
df_students = generate_int_id(
    dataframe=df_students, id_col_name='students_id_num')
df_questions = generate_int_id(
    dataframe=df_questions, id_col_name='questions_id_num')
df_answers = generate_int_id(
    dataframe=df_answers, id_col_name='answers_id_num')

### Merging Dataset

This is one of the most important steps of our solution. Our professionals, students, Q&A and tags are stored in separate datasets. For the model, we have to merge these datasets very carefully so that they are useful to it.

1. All tags (q&a) are stored in a separate dataset. So firstly we merge those tags with questions and answers datasets.
2. Then, we merge answers with questions, because one question can have multiple answers.

In [8]:
# discard tag rows with missing values, then remove every '#' from the tag names
df_tags = df_tags.dropna()
df_tags = df_tags.assign(
    tags_tag_name=df_tags['tags_tag_name'].str.replace('#', ''))

In [9]:
# attach tag names to the question/tag links, collapse the tags of each
# question into one comma-separated string, and name that column
# 'questions_tag_name'
df_tags_question = (
    df_tag_questions
    .merge(
        df_tags, how='inner',
        left_on='tag_questions_tag_id', right_on='tags_tag_id')
    .groupby(['tag_questions_question_id'])['tags_tag_name']
    .apply(','.join)
    .reset_index()
    .rename(columns={'tags_tag_name': 'questions_tag_name'})
)

In [10]:
# attach tag names to the user/tag links, collapse the tags of each user
# into one comma-separated string, and name that column
# 'professionals_tag_name'
df_tags_pro = (
    df_tag_users
    .merge(
        df_tags, how='inner',
        left_on='tag_users_tag_id', right_on='tags_tag_id')
    .groupby(['tag_users_user_id'])['tags_tag_name']
    .apply(','.join)
    .reset_index()
    .rename(columns={'tags_tag_name': 'professionals_tag_name'})
)

In [11]:
# attach the aggregated tag strings to questions and to professionals;
# left joins keep rows that have no tags at all
df_questions = pd.merge(
    df_questions, df_tags_question,
    how='left',
    left_on='questions_id', right_on='tag_questions_question_id')

df_professionals = pd.merge(
    df_professionals, df_tags_pro,
    how='left',
    left_on='professionals_id', right_on='tag_users_user_id')

In [12]:
# attach each question's score; the left join keeps questions without a score row
df_questions = pd.merge(
    df_questions, df_question_scores,
    how='left',
    left_on='questions_id', right_on='id')

In [13]:
# attach the asking student's columns to each question
df_questions = pd.merge(
    df_questions, df_students,
    how='left',
    left_on='questions_author_id', right_on='students_id')

In [14]:
# one row per answer: join each answer to its question, then to the
# professional who wrote it; inner joins drop answers without a match.
# Question scores are already part of df_questions, so no extra merge is needed.
df_merge = (
    df_answers
    .merge(
        df_questions, how='inner',
        left_on='answers_question_id', right_on='questions_id')
    .merge(
        df_professionals, how='inner',
        left_on='answers_author_id', right_on='professionals_id')
)

In [15]:
# show which columns the merged answers/questions/professionals frame carries
df_merge.columns

Index(['answers_id', 'answers_author_id', 'answers_question_id',
       'answers_date_added', 'answers_body', 'answers_id_num', 'questions_id',
       'questions_author_id', 'questions_date_added', 'questions_title',
       'questions_body', 'questions_id_num', 'tag_questions_question_id',
       'questions_tag_name', 'id', 'score', 'students_id', 'students_location',
       'students_date_joined', 'students_id_num', 'professionals_id',
       'professionals_location', 'professionals_industry',
       'professionals_headline', 'professionals_date_joined',
       'professionals_id_num', 'tag_users_user_id', 'professionals_tag_name'],
      dtype='object')

**Generate some features:** In this step, we generate some features: the number of answers per professional, the number of answers per question, the number of tags per professional and the number of tags per question. I will not use all of these features in this model, but I will use the number of answers per question to weight the interactions, so that the model pays less attention to questions that already have many answers.

In [16]:
# group counts, broadcast back onto every answer row
for new_col, key_col, counted_col in (
    ('num_of_answers_by_professional', 'answers_author_id', 'questions_id'),
    ('num_ans_per_ques', 'questions_id', 'answers_id'),
):
    df_merge[new_col] = df_merge.groupby([key_col])[counted_col].transform('count')

# number of comma-separated tags per row (stays NaN where the tag string is missing)
for new_col, tags_col in (
    ('num_tags_professional', 'professionals_tag_name'),
    ('num_tags_question', 'questions_tag_name'),
):
    df_merge[new_col] = df_merge[tags_col].str.split(",").str.len()

In [17]:
# report the largest value of each count feature
for label, column in (
    ("Maximum number of answer per question : ", 'num_ans_per_ques'),
    ("Maximum number of tags per professional : ", 'num_tags_professional'),
    ("Maximum number of tags per question : ", 'num_tags_question'),
):
    print(label + str(df_merge[column].max()))

Maximum number of answer per question : 58
Maximum number of tags per professional : 82.0
Maximum number of tags per question : 54.0


**Merge answered questions' tags with professionals' tags:** Professionals can follow tags, but not every professional does, and the EDA shows that professionals sometimes answer questions that are unrelated to the tags they follow. For that reason, I merge the tags of the questions each professional has answered with the professional's own tags. This makes our model more robust and context-aware.

In [18]:
# tags of the questions each professional has answered (one row per answer)
professionals_prev_ans_tags = df_merge[['professionals_id', 'questions_tag_name']]

# answers to untagged questions carry no tag information
professionals_prev_ans_tags = professionals_prev_ans_tags.dropna()

# collapse all answered-question tags of a professional into a single row
professionals_prev_ans_tags = professionals_prev_ans_tags.groupby(
    ['professionals_id'])['questions_tag_name'].apply(
        ','.join).reset_index()

# drop duplicate tags within each professional's row.
# dict.fromkeys keeps the first-seen order, so the resulting string is the
# same on every run; deduplicating through set() gives an order that depends
# on string hashing and can differ between interpreter sessions.
professionals_prev_ans_tags['questions_tag_name'] = (
    professionals_prev_ans_tags['questions_tag_name']
    .str.split(',')
    .apply(lambda tags: ','.join(dict.fromkeys(tags))))

# attach the answered-question tags to the professionals dataframe
df_professionals = df_professionals.merge(professionals_prev_ans_tags, how='left', on='professionals_id')

# join followed tags and answered-question tags into one string;
# missing parts are skipped, so a professional with neither gets ""
df_professionals['professional_all_tags'] = (
    df_professionals[['professionals_tag_name', 'questions_tag_name']].apply(
        lambda x: ','.join(x.dropna()),
        axis=1))

**Handling null and duplicate values:** Now we clean our data a little. We handle null and duplicate values, because leaving them in would cause errors and wrong predictions. Null values are replaced with a generic name or value.

In [19]:
def _dedupe_tags(tag_string):
    """Drop repeated tags from a comma-separated string, keeping first-seen order."""
    # dict.fromkeys preserves insertion order, so the result is the same on
    # every run; set() ordering depends on string hashing and can differ
    # between interpreter sessions, which changes the order of the tags.
    return ','.join(dict.fromkeys(tag_string.split(',')))


# questions: missing score -> 0, missing tags -> 'No Tag'
df_questions['score'] = df_questions['score'].fillna(0).astype(int)
df_questions['questions_tag_name'] = df_questions['questions_tag_name'].fillna('No Tag')
# remove duplicate tags from each question
df_questions['questions_tag_name'] = df_questions['questions_tag_name'].map(_dedupe_tags)


# professionals: fill nan with 'No Tag' if any
df_professionals['professional_all_tags'] = df_professionals['professional_all_tags'].fillna('No Tag')
# the tag-joining step yields "" for professionals without any tags
df_professionals['professional_all_tags'] = df_professionals['professional_all_tags'].replace('', 'No Tag')
df_professionals['professionals_location'] = df_professionals['professionals_location'].fillna('No Location')
df_professionals['professionals_industry'] = df_professionals['professionals_industry'].fillna('No Industry')

# remove duplicate tags from each professional
df_professionals['professional_all_tags'] = df_professionals['professional_all_tags'].map(_dedupe_tags)


# replace missing counts in df_merge with 0
df_merge['num_ans_per_ques']  = df_merge['num_ans_per_ques'].fillna(0)
df_merge['num_tags_professional'] = df_merge['num_tags_professional'].fillna(0)
df_merge['num_tags_question'] = df_merge['num_tags_question'].fillna(0)

## Building model in LightFM

The LightFM Dataset class makes it really easy to create the interaction matrix, the weights and the user/item features.

- interaction matrix: a matrix that contains the user/item interactions, here the professional/question interactions.
- weights: the weights of the interaction matrix. A lower weight gives the corresponding interaction less importance.
- user/item features: user/item features supplied like this: (user_id, ['feature_1', 'feature_2', 'feature_3'])

Creating the feature lists for the Dataset class: the LightFM library has a Dataset class that makes it really easy to build the information the model needs. We have to feed it the set of all unique professional and question ids, together with the lists of all question and professional features. From these it creates the internal mappings that LightFM uses.

In [20]:
# every feature token that occurs on questions / on professionals,
# used below to build lightfm's feature mapping
question_feature_list = generate_feature_list(
    dataframe=df_questions,
    features_name=['questions_tag_name'])

professional_feature_list = generate_feature_list(
    dataframe=df_professionals,
    features_name=['professional_all_tags'])

In [21]:
# interaction weight = 1 / (number of answers on the question),
# so answers to heavily answered questions count less
df_merge['total_weights'] = df_merge['num_ans_per_ques'].rdiv(1)


# (id, [feature tokens]) pairs in the form lightfm expects
df_questions['question_features'] = create_features(
    dataframe=df_questions,
    features_name=['questions_tag_name'],
    id_col_name='questions_id_num')


df_professionals['professional_features'] = create_features(
    dataframe=df_professionals,
    features_name=['professional_all_tags'],
    id_col_name='professionals_id_num')

**LightFM Dataset:** building lightfm datasets. Building our interactions matrix, weights and professional/question features.

In [22]:
# Register every professional id (user), every question id (item) and every
# feature token with the lightfm Dataset so it can build its internal mappings.
dataset = Dataset()
dataset.fit(
    users=set(df_professionals['professionals_id_num']),
    items=set(df_questions['questions_id_num']),
    user_features=professional_feature_list,
    item_features=question_feature_list)

# One (professional_id, question_id, weight) triple per answer row;
# lightfm turns these into the interactions matrix and the weights matrix.
df_merge['author_question_id_tuple'] = list(zip(
    df_merge['professionals_id_num'],
    df_merge['questions_id_num'],
    df_merge['total_weights']))


interactions, weights = dataset.build_interactions(
    df_merge['author_question_id_tuple'])

# Convert the (id, [feature tokens]) pairs into lightfm feature matrices
# for questions (items) and professionals (users).
questions_features = dataset.build_item_features(
    df_questions['question_features'])
professional_features = dataset.build_user_features(
    df_professionals['professional_features'])


## Model building and training

In [23]:
# hyper-parameters of the lightfm model; training happens in the next cell
model_hyperparams = {
    'no_components': 150,
    'learning_rate': 0.05,
    'loss': 'bpr',
    'random_state': 2019,
}
model = LightFM(**model_hyperparams)

In [None]:
# train on the weighted professional/question interactions together with
# the question (item) and professional (user) feature matrices
model.fit(
    interactions=interactions,
    user_features=professional_features,
    item_features=questions_features,
    sample_weight=weights,
    epochs=5,
    num_threads=4,
    verbose=True
)

Epoch:   0%|                                                                                     | 0/5 [00:00<?, ?it/s]

In [None]:
# Stand-alone LightFM example on MovieLens; it does not use the CareerVillage data.
# LightFM and precision_at_k are already imported in the first cell;
# fetch_movielens is the only new name brought in here.
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k

# Load the MovieLens 100k dataset
data = fetch_movielens()

# Create and train the model
# NOTE(review): this rebinds `model`, so the BPR model defined earlier in the
# notebook is no longer reachable under that name after this cell runs;
# confirm that no later cell expects the CareerVillage model in `model`.
model = LightFM(loss='warp')
model.fit(data['train'], epochs=10, num_threads=1)

# Evaluate the model: mean precision@10 on the train and the test interactions
train_precision = precision_at_k(model, data['train'], k=10).mean()
test_precision = precision_at_k(model, data['test'], k=10).mean()

print(f'Train precision: {train_precision}')
print(f'Test precision: {test_precision}')


