In [55]:
import pandas as pd

articles_df = pd.read_csv('shared_articles.csv')
interactions_df = pd.read_csv('users_interactions.csv')

df_raw = pd.merge(articles_df, interactions_df, on='contentId', how='inner')

df = df_raw.drop_duplicates(subset=['title'], keep='first')
df.head()


Unnamed: 0,timestamp_x,eventType_x,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang,timestamp_y,eventType_y,personId,sessionId,userAgent,userRegion,userCountry
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en,1459192561,VIEW,4340306774493623681,8940341205206233829,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5...,SP,BR
4,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en,1459539433,VIEW,8414731042150985013,4543899740167763020,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR
5,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,1459476271,VIEW,-7267769888748948232,-1350132153769633851,Android - Native Mobile App,MG,BR
18,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en,1459271169,VIEW,2652684983165843086,-5801322284239306545,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR
20,1459194599,CONTENT SHARED,4119190424078847945,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/blockchai...,Blockchain Technology Could Put Bank Auditors ...,When most people think about computers and rob...,en,1460743763,VIEW,4340306774493623681,5613188264916540247,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5...,SP,BR


In [56]:
print(df.shape)
df.isna().sum().sort_values(ascending=False)

(2946, 20)


authorRegion       2293
authorCountry      2293
authorUserAgent    2293
userRegion          415
userCountry         415
userAgent           415
eventType_x           0
timestamp_x           0
authorPersonId        0
contentId             0
contentType           0
authorSessionId       0
text                  0
title                 0
url                   0
lang                  0
personId              0
eventType_y           0
timestamp_y           0
sessionId             0
dtype: int64

In [57]:
df.dropna(axis=1, inplace=True)

df.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=1, inplace=True)


timestamp_x        0
eventType_x        0
contentId          0
authorPersonId     0
authorSessionId    0
contentType        0
url                0
title              0
text               0
lang               0
timestamp_y        0
eventType_y        0
personId           0
sessionId          0
dtype: int64

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
          
      # Create a TfidfVectorizer and Remove stopwords
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the data to a tfidf matrix
tfidf_matrix = tfidf.fit_transform(df['text'])

# Print the shape of the tfidf_matrix
print(tfidf_matrix.shape)

# Preview the matrix by placing it into a DataFrame (which we won't need later)
df_tfidf = pd.DataFrame(tfidf_matrix.T.todense(), index=tfidf.get_feature_names_out(), columns=df['text'])

(2946, 71593)


In [65]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity between each movie description
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# For easier viewing, put it in a dataframe
cosine_sim = pd.DataFrame(cosine_sim)

In [66]:
def get_recommendations(item_id, sim_matrix, n=10, messages=True):
    if not item_id in sim_matrix[:]:  # Add some error checking for robustness
        print(f"Item {item_id} is not in the similarity matrix you provided")
        return

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(sim_matrix[item_id]))

    # Sort the items based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the n most similar items; start at 1 so that it skips itself
    top_similar = sim_scores[1:n+1]

    # Put the recommended item indices and similarity scores together in a dictionary using comprehension
    rec_dict = {i[0]:i[1] for i in top_similar}

    if messages:
        print(f"The top recommended item IDs are: {list(rec_dict.keys())}")
        print(f"Their similarity scores are:\t  {list(rec_dict.values())}")

    # Return the top n most similar items
    return rec_dict

In [67]:
# Change this value to any title you'd like to get recommendations
title = "Apple launches Safari Technology Preview, a browser for developers including experimental web features"

# Check if the title is valid; if not, suggest alternatives and use the last one for recommendations
if title in df['title'].to_list():
    id = df.index[df['title']==title][0] # Convert the title to an index (i.e. item ID)
else:
    print(f"\"{title}\" is not in the data set. Try one of these:\n")
    for row in df.sample(n=10).itertuples():  # Get a random 10 titles
        id = row[0]
        title = row.title
        print(f'\t{title}')

print(f"\nIf you like \"{title},\" then you may also like:\n")

# Call the function and return the dictionary; print out the dictionary if you want to see what it is
recommend_dict = get_recommendations(id, cosine_sim, n=10, messages=False)

# Add the dictionary to a new DataFrame; this isn't necessary, but it helps to see what movies are recommended
df_similarity = pd.DataFrame(data=recommend_dict.values(), columns=['similarity'], index=recommend_dict.keys())

# Create a subset of the original df DataFrame with only the recommended movies
df_recommendations = df.loc[df.index.isin(recommend_dict.keys()), ['title', 'text']]

# Join the original df results with the recommended movie similarity scores so that we can sort the list and view it
df_recommendations.join(df_similarity).sort_values(by=['similarity'], ascending=False)


If you like "Apple launches Safari Technology Preview, a browser for developers including experimental web features," then you may also like:



Unnamed: 0,title,text,similarity
