In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/data-science-for-good-careervillage/professionals.csv
/kaggle/input/data-science-for-good-careervillage/matches.csv
/kaggle/input/data-science-for-good-careervillage/comments.csv
/kaggle/input/data-science-for-good-careervillage/tag_users.csv
/kaggle/input/data-science-for-good-careervillage/groups.csv
/kaggle/input/data-science-for-good-careervillage/school_memberships.csv
/kaggle/input/data-science-for-good-careervillage/group_memberships.csv
/kaggle/input/data-science-for-good-careervillage/answers.csv
/kaggle/input/data-science-for-good-careervillage/emails.csv
/kaggle/input/data-science-for-good-careervillage/questions.csv
/kaggle/input/data-science-for-good-careervillage/tags.csv
/kaggle/input/data-science-for-good-careervillage/tag_questions.csv
/kaggle/input/data-science-for-good-careervillage/answer_scores.csv
/kaggle/input/data-science-for-good-careervillage/question_scores.csv
/kaggle/input/data-science-for-good-careervillage/students.csv


# Question Recommender: Recommend question based on tag

File path of the dataset

In [2]:
data_dir = "/kaggle/input/data-science-for-good-careervillage/"
questions_file_path = os.path.join(data_dir, 'questions.csv')
tags_file_path = os.path.join(data_dir, 'tags.csv')
professionals_file_path = os.path.join(data_dir, 'professionals.csv')
tag_questions_file_path = os.path.join(data_dir, 'tag_questions.csv')
tag_users_file_path = os.path.join(data_dir, 'tag_users.csv')

### Extraction of each user tags

#### Load dataset and inspection the dataset

In [3]:
# Load dataset and show the dataset information
tags_df = pd.read_csv(tags_file_path)
print(tags_df.info())
tags_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16269 entries, 0 to 16268
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tags_tag_id    16269 non-null  int64 
 1   tags_tag_name  16268 non-null  object
dtypes: int64(1), object(1)
memory usage: 254.3+ KB
None


Unnamed: 0,tags_tag_id,tags_tag_name
0,27490,college
1,461,computer-science


In [4]:
# Load dataset and show the dataset information
tag_users_df = pd.read_csv(tag_users_file_path)
print(tag_users_df.info())
tag_users_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136663 entries, 0 to 136662
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   tag_users_tag_id   136663 non-null  int64 
 1   tag_users_user_id  136663 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.1+ MB
None


Unnamed: 0,tag_users_tag_id,tag_users_user_id
0,593,c72ab38e073246e88da7e9a4ec7a4472
1,1642,8db519781ec24f2e8bdc67c2ac53f614


#### Merge and Extraction

Since `tag_users_df` and `tags_df` has common data `tag_id`, we can merge them and extract each user tag by grouping by `user_id`

In [5]:
# Merge dataframe
temp_df = pd.merge(tag_users_df, tags_df, 
                left_on='tag_users_tag_id', 
                right_on='tags_tag_id', 
                how='inner'
               )
# Remove unnecesary columns
temp_df.drop(columns=['tag_users_tag_id','tags_tag_id' ], inplace=True)

# reaname columns name
temp_df.rename(columns={'tag_users_user_id':'user_id', 'tags_tag_name':'tag'},inplace=True)
temp_df.head()

Unnamed: 0,user_id,tag
0,c72ab38e073246e88da7e9a4ec7a4472,computer-software
1,8db519781ec24f2e8bdc67c2ac53f614,computer-software
2,9ab6b54d55b24299a4795584508db4ff,computer-software
3,e327399c48584fcf81e433828a6d8715,computer-software
4,92494d9dc2124507972c5306badc6727,computer-software


In [6]:
# Extration of user tag by grouping
user_tag_df = temp_df.groupby(['user_id'])['tag'].apply(set).reset_index()
user_tag_df.head(2)

Unnamed: 0,user_id,tag
0,00009a0f9bda43eba47104e9ac62aff5,"{content-creation, script-writing, digital-media}"
1,000196ef8db54b9a86ae70ad31745d04,{accounting}


### Extraction of each question tags

#### Load and inspect the dataset

In [7]:
# Read the dataset
tag_questions_df = pd.read_csv(tag_questions_file_path)

# Dataset information
print(tag_questions_df.info())
tag_questions_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76553 entries, 0 to 76552
Data columns (total 2 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   tag_questions_tag_id       76553 non-null  int64 
 1   tag_questions_question_id  76553 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.2+ MB
None


Unnamed: 0,tag_questions_tag_id,tag_questions_question_id
0,28930,cb43ebee01364c68ac61d347a393ae39
1,28930,47f55e85ce944242a5a347ab85a8ffb4


In [8]:
# Read the dataset
questions_df = pd.read_csv(questions_file_path)

# Dataset information
print(questions_df.info())
questions_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23931 entries, 0 to 23930
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   questions_id          23931 non-null  object
 1   questions_author_id   23931 non-null  object
 2   questions_date_added  23931 non-null  object
 3   questions_title       23931 non-null  object
 4   questions_body        23931 non-null  object
dtypes: object(5)
memory usage: 934.9+ KB
None


Unnamed: 0,questions_id,questions_author_id,questions_date_added,questions_title,questions_body
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,2016-04-26 11:14:26 UTC+0000,Teacher career question,What is a maths teacher? what is a ma...
1,eb80205482e4424cad8f16bc25aa2d9c,acccbda28edd4362ab03fb8b6fd2d67b,2016-05-20 16:48:25 UTC+0000,I want to become an army officer. What can I d...,I am Priyanka from Bangalore . Now am in 10th ...


#### Merge and Extraction

`question_id` of the`questions_df` and `tag_questions_question_id` of the `tag_questions_df` both has same info. So we can merge them and extract id of the tag from the merged dataframe.

In [9]:
# Merge dataframe
temp_df = pd.merge(questions_df, tag_questions_df, 
                   left_on='questions_id', 
                   right_on = 'tag_questions_question_id',
                   how='inner'
                  ).drop(columns=['tag_questions_question_id','questions_date_added'])
temp_df.head(2)

Unnamed: 0,questions_id,questions_author_id,questions_title,questions_body,tag_questions_tag_id
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,Teacher career question,What is a maths teacher? what is a ma...,14147
1,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,Teacher career question,What is a maths teacher? what is a ma...,27490


Now we can merge `temp_df` and `tags_df` since `tag_questions_tag_id` column and `tags_tag_id` column has same information. And then we can extract tags of each question by groping by `questions_id`.

In [10]:
# Merge dataframe and remove unnecessary columns
temp_df = temp_df.merge(tags_df, 
              left_on='tag_questions_tag_id',
              right_on = 'tags_tag_id',
              how='inner'
             ).drop(columns=['tag_questions_tag_id','tags_tag_id'])
temp_df.head(2)

Unnamed: 0,questions_id,questions_author_id,questions_title,questions_body,tags_tag_name
0,332a511f1569444485cf7a7a556a5e54,8f6f374ffd834d258ab69d376dd998f5,Teacher career question,What is a maths teacher? what is a ma...,lecture
1,6c98bbce49714e53a488a27e95ca4132,1bdfc4d56b0641df8e6de875776e4e00,Aeronatical career question,What are the benifits of Aeronatical engineeri...,lecture


In [11]:
# Extract question tag by grouping
question_tag_df = temp_df.groupby('questions_id')['tags_tag_name'].apply(set).reset_index()
question_tag_df.rename(columns={'tags_tag_name':'tag'}, inplace=True)
question_tag_df.head(2)

Unnamed: 0,questions_id,tag
0,0003e7bf48f24b5c985f8fce96e611f3,"{time, information-technology, majoring, inter..."
1,0006609dd4da40dcaa5a83e0499aba14,"{law, psychology}"


### Recommendation for a user

*Recommendation for `user_id=00009a0f9bda43eba47104e9ac62aff5`*

In [12]:
user_id = '001bd6f7f1ac4897b0b35dc665c64d2e'

#### Calculate Similarity score

Calculate similarity score with Jacard Similarity method.

In [13]:
def jacard_similarity(user_tag, question_tag):
    a_and_b = len(user_tag & question_tag)
    a_or_b = len(user_tag | question_tag)
    return (a_and_b/a_or_b)

In [14]:
#  Get tags of the user
user_tags = user_tag_df[user_tag_df.user_id==user_id]['tag'].values[0]

# Similarity score calculation for each question
sim_score = question_tag_df['tag'].apply(lambda question_tag: jacard_similarity(user_tags, question_tag) )

# Add the similarity score as a column to the dataframe
question_tag_df['sim_score'] = sim_score
question_tag_df.head()

Unnamed: 0,questions_id,tag,sim_score
0,0003e7bf48f24b5c985f8fce96e611f3,"{time, information-technology, majoring, inter...",0.0
1,0006609dd4da40dcaa5a83e0499aba14,"{law, psychology}",0.0
2,000af224bc2f4e94a19f8b62ba279cc4,"{marine, biology}",0.0
3,000b30fb534b41f7b716fa9ebf9c3f35,"{school, teaching, exercise-science, exercise}",0.0
4,0018752e44b44e26bb74a0a43232b4d6,"{math, puremathematics}",0.0


#### Remove user own questions 

*Get user own questions*

In [15]:
own_question_ids = questions_df[questions_df.questions_author_id==user_id].questions_id
print("Number of own question: ", len(own_question_ids))
own_question_ids

Number of own question:  4


2633    42337857c90f473aa0fa0b3d884d6045
6822    e8b9b90b219d4e35af16d78a44d77091
7599    f6370087fc4941c18ee174ff2dcc951d
7934    de05fd851aff4fc797e74fba4644dda1
Name: questions_id, dtype: object

Filter own questions

In [16]:
print('Number of questions before filter own question: ', len(question_tag_df))
question_tag_df = question_tag_df[~question_tag_df.questions_id.isin(own_question_ids)]
print('Number of questions after filter own question: ', len(question_tag_df))


Number of questions before filter own question:  23288
Number of questions after filter own question:  23284


#### Get 20 question with highest similarity score as recommended questions

In [17]:
# Sort by similarity score in descenging order
question_tag_df.sort_values(by='sim_score', ascending=False, inplace=True)
question_tag_df

Unnamed: 0,questions_id,tag,sim_score
20689,e40a218130834645bd74a5a196cec9e7,{college},1.0
8214,5bec833ac8da4595ba3945f9202732d1,{college},1.0
21681,ee814ad76e1245bd8d096ea82f757274,{college},1.0
8232,5c3c2d84b40f48f288740ec3980a36f5,{college},1.0
2896,20a8e38f5e3142d1aa9a3c3534718fac,{college},1.0
...,...,...,...
8436,5e5ec38774704f2c93329f2d2656f7f7,"{model, fashion, modeling, fashion-shows, mode...",0.0
8435,5e5e4275dc914ca8a2d601713ce7cd2f,{biology},0.0
8434,5e5a015467e24894abf5ab992964d7ad,"{event-management, communications, communicati...",0.0
8433,5e5832e189204de59a7516dd75faea6e,"{mathematics, school, math, higher-education}",0.0


In [18]:
# Top 20 question 
top20_question_ids = question_tag_df.questions_id[:20]
len(top20_question_ids)

20

#### User own questions

In [19]:
own_question = questions_df[questions_df.questions_id.isin(own_question_ids)]
own_question[['questions_id', 'questions_title']]

Unnamed: 0,questions_id,questions_title
2633,42337857c90f473aa0fa0b3d884d6045,"Do you prefer urban, suburban, or rural campus..."
6822,e8b9b90b219d4e35af16d78a44d77091,Where are the best internship opportunities in...
7599,f6370087fc4941c18ee174ff2dcc951d,Why do people prefer living off campus?
7934,de05fd851aff4fc797e74fba4644dda1,What are some resources to find a roommate?


#### Recommended Questions

In [20]:
top20_question = questions_df[questions_df.questions_id.isin(top20_question_ids)]
top20_question[['questions_id', 'questions_title']]

Unnamed: 0,questions_id,questions_title
3225,dd19c4e2623c450e94f46e709211aee5,How long does it take to get the full understa...
3282,95539a67cee449269346cd2e42b89014,"Is it better to graduate high school early, or..."
4603,c91a53ba77fc45b7978804de390d061a,Should I pick my college solely for its progra...
6074,ee814ad76e1245bd8d096ea82f757274,Motivational tips to keep me motivated.
6093,9d306c08820e4a6e8c91d1be625b5f3b,How do I go about finding scholarships that I ...
6851,b5044f24a92c46f2adedad06c3970745,What is a good university to attend a good soc...
7269,5d490567d3824cb8a6c2eed5c3a3e0a2,What is the longest process of getting in coll...
7733,5bec833ac8da4595ba3945f9202732d1,Which college/university is good to attend for...
8779,e40a218130834645bd74a5a196cec9e7,How do you decide what to major in?
9876,10e9c5a8760f4b319b0d4bf59d495f5d,Is it most convenient to go straight to Univer...
