In [105]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
nlp = spacy.load('en_core_web_sm')

%matplotlib inline

df = pd.read_csv('user-item-interactions.csv')
df_content = pd.read_csv('articles_community.csv')
del df['Unnamed: 0']
del df_content['Unnamed: 0']

df = df.fillna('no_email')

pre_data = dict(df.groupby(['email', 'article_id'])['article_id'].count())

list_email = []
list_article_id = []
list_nb_interactions = []

for key, val in pre_data.items():
    list_email.append(key[0])
    list_article_id.append(key[1])
    list_nb_interactions.append(val)
    
zipped_list = list(zip(list_email, list_article_id, list_nb_interactions))
interaction_user = pd.DataFrame(zipped_list, columns=['email','article_id','nb_interactions'])

df = pd.merge(df, interaction_user, on=['email', 'article_id'])

df_content = df_content.drop(labels='doc_status', axis=1)
df_content = df_content.drop_duplicates()

df = df.replace('no_email', np.nan)

def email_mapper():
    coded_dict = dict()
    cter = 1
    email_encoded = []
    
    for val in df['email']:
        if val not in coded_dict:
            coded_dict[val] = cter
            cter+=1
        
        email_encoded.append(coded_dict[val])
    return email_encoded

email_encoded = email_mapper()
del df['email']
df['user_id'] = email_encoded

dict_views = dict(df.article_id.value_counts())
df['nb_views'] = df.article_id.map(dict_views)

df = df.drop_duplicates()

df['importance_article'] = df.nb_interactions / df.nb_views

In [106]:
df.head()

Unnamed: 0,article_id,title,nb_interactions,user_id,nb_views,importance_article
0,1430.0,"using pixiedust for fast, flexible, and easier...",2,1,336,0.005952
2,1314.0,healthcare python streaming application demo,1,2,614,0.001629
3,1429.0,use deep learning for image classification,25,3,937,0.026681
28,1338.0,ml optimization using cognitive assistant,1,4,382,0.002618
29,1276.0,deploy your python model as a restful api,1,5,347,0.002882


In [107]:
df_content.head()

Unnamed: 0,doc_body,doc_description,doc_full_name,article_id
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,Detect Malfunctioning IoT Sensors with Streami...,0
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",Communicating data science: A guide to present...,1
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,"This Week in Data Science (April 18, 2017)",2
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,DataLayer Conference: Boost the performance of...,3
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,Analyze NY Restaurant data using Spark in DSX,4


#### Engineering the feature Using NLP

In [108]:
def nlp_task(data):
    sent = data
    doc=nlp(sent)
    sub_toks = [tok for tok in doc if ((tok.dep_ == "compound") or (tok.dep_ == "dobj") or (tok.dep_ == "pobj")) ]
    insight_words = [str(i) for i in sub_toks]
    return insight_words

df_content['insights'] = df_content.doc_full_name.apply(nlp_task)
df_content.head()

Unnamed: 0,doc_body,doc_description,doc_full_name,article_id,insights
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,Detect Malfunctioning IoT Sensors with Streami...,0,"[Detect, Malfunctioning, IoT, Streaming, Analy..."
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",Communicating data science: A guide to present...,1,"[data, science, work]"
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,"This Week in Data Science (April 18, 2017)",2,"[Data, Science]"
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,DataLayer Conference: Boost the performance of...,3,"[DataLayer, performance, database]"
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,Analyze NY Restaurant data using Spark in DSX,4,"[Analyze, NY, Restaurant, Spark, DSX]"


##### Fill df_content with columns of insight

In [109]:
#concat all words
insight_sum = np.sum(df_content['insights'])

#remove dupplicates
list_unique_insight = list(set(insight_sum))

#remove all non character from the list
regex = re.compile('[^a-zA-Z]')
list_unique_insight_clean = [regex.sub('non_charac', i) for i in list_unique_insight]
list_unique_insight_clean = set(list_unique_insight_clean)
list_unique_insight_clean.remove('non_charac')

#create a dataframe with the list as columns
df_temporary = pd.DataFrame(columns=list_unique_insight_clean)

df_content = pd.merge(df_content, df_temporary, how='outer', left_index=True, right_index=True)
del df_temporary

df_content.head()

Unnamed: 0,doc_body,doc_description,doc_full_name,article_id,insights_x,Indexes,Recurrent,storage,Jennifer,Python,...,Push,Twitter,Service,Group,scientist,authentication,Management,Hugo,Analytics,Leveraging
0,Skip navigation Sign in SearchLoading...\r\n\r...,Detect bad readings in real time using Python ...,Detect Malfunctioning IoT Sensors with Streami...,0,"[Detect, Malfunctioning, IoT, Streaming, Analy...",,,,,,...,,,,,,,,,,
1,No Free Hunch Navigation * kaggle.com\r\n\r\n ...,"See the forest, see the trees. Here lies the c...",Communicating data science: A guide to present...,1,"[data, science, work]",,,,,,...,,,,,,,,,,
2,☰ * Login\r\n * Sign Up\r\n\r\n * Learning Pat...,Here’s this week’s news in Data Science and Bi...,"This Week in Data Science (April 18, 2017)",2,"[Data, Science]",,,,,,...,,,,,,,,,,
3,"DATALAYER: HIGH THROUGHPUT, LOW LATENCY AT SCA...",Learn how distributed DBs solve the problem of...,DataLayer Conference: Boost the performance of...,3,"[DataLayer, performance, database]",,,,,,...,,,,,,,,,,
4,Skip navigation Sign in SearchLoading...\r\n\r...,This video demonstrates the power of IBM DataS...,Analyze NY Restaurant data using Spark in DSX,4,"[Analyze, NY, Restaurant, Spark, DSX]",,,,,,...,,,,,,,,,,


##### fill the columns added with 1 or 0 if the insight is present in the row

In [110]:
# iterate over the columns added and fill value
for i in list_unique_insight_clean:
    df_content[i] = df_content.insights_x.apply(lambda x: 1 if i in x else 0)

In [113]:
df_content.head()

0    983
1     73
Name: data, dtype: int64