In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import string
import re

In [2]:
import gensim
from gensim import corpora, models, similarities

import nltk
from nltk.corpus import stopwords

In [3]:
import scipy
from scipy.spatial.distance import cdist
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from scipy.linalg import svd

In [4]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.model_selection import train_test_split

# Reading data sets

1. news_articles - Contains raw articles without user data amalgamation
2. user_interest - Contains raw articles mixed with user data as an amalgamation with ArticleID as Foreign Key

## news_articles

In [6]:
data = pd.read_csv(r'0_news_articles.csv')
data.head()

Unnamed: 0,Article_id,Title,Description,Date,Category,URL
0,0,Fire at Vaishno Devi shrine complex; cash coun...,"No one was injured in the fire, which broke ou...","June 8, 2021 7:28:32 pm",India,https://indianexpress.com/article/india/vaishn...
1,1,"Had not gone to meet Nawaz Sharif, says Uddhav...",Uddhav Thackeray led a delegation of his cabin...,"June 8, 2021 6:56:40 pm",India,https://indianexpress.com/article/india/had-no...
2,2,Corruption case: Former Haryana I-T deputy com...,It was in 2016 that the CBI had arrested Nitin...,"June 8, 2021 6:25:24 pm",India,https://indianexpress.com/article/india/corrup...
3,3,Kannur MP K Sudhakaran appointed chief of Cong...,Sudhakaran will replace Ramachandran who had a...,"June 8, 2021 5:04:40 pm",India,https://indianexpress.com/article/india/sudhak...
4,4,"Kerala girl of Class 5 writes to CJI, lauds SC...",Chief Justice N V Ramana responded to the Clas...,"June 8, 2021 4:43:10 pm",India,https://indianexpress.com/article/india/kerala...


## user_interest

In [7]:
user = pd.read_csv(r'2_user_interest.csv')
print(user.shape)
user.drop(columns= user.columns[0], 
        axis=1, 
        inplace=True)
user.head()


(1100, 11)


Unnamed: 0,Article_id,Title,Description,Date,URL,UserId,SessionId,Article Rank,Click,Time Spent (seconds)
0,1,"Had not gone to meet Nawaz Sharif, says Uddhav...",Uddhav Thackeray led a delegation of his cabin...,"June 8, 2021 6:56:40 pm",https://indianexpress.com/article/india/had-no...,1,1,2,True,24
1,2,Corruption case: Former Haryana I-T deputy com...,It was in 2016 that the CBI had arrested Nitin...,"June 8, 2021 6:25:24 pm",https://indianexpress.com/article/india/corrup...,1,1,3,True,18
2,6,Uddhav Thackeray meets PM Modi; discusses Mara...,Deputy Chief Minister and senior NCP leader Aj...,"June 8, 2021 2:25:23 pm",https://indianexpress.com/article/india/mahara...,1,1,7,True,47
3,7,"New Covid-19 vaccination guidelines out, alloc...","As per the revised guidelines, the Centre will...","June 8, 2021 4:27:29 pm",https://indianexpress.com/article/india/govt-r...,1,1,8,True,47
4,9,Fire at TMC MLA Madan Mitra’s residence in Kol...,Mitra along with his family members rushed out...,"June 8, 2021 12:53:24 pm",https://indianexpress.com/article/india/fire-m...,1,1,10,True,73


## Content Based

In [8]:
vectorizer = TfidfVectorizer(analyzer = 'word')
tfidf_matrix = vectorizer.fit_transform(user['Title'])
tfidf_matrix.shape

(1100, 4445)

In [9]:
# cosine-similarity (linear kernel)
cosine_similarity = linear_kernel(tfidf_matrix, tfidf_matrix)

# index of Title
indices = pd.Series(user['Title'].index)

In [10]:
# making predictions

def recommendation(index, method):
    id = indices[index]
    
    # fetching the top 10 articles
    
    similarity_score = list(enumerate(method[id]))
    similarity_score = sorted(similarity_score, key = lambda x: x[1], reverse = True)
    similarity_score = similarity_score[1:11]
    
    # get the article index
    news_index = [i[0] for i in similarity_score]
    
    # returning the top 10 most similar books
    return user['Title'].iloc[news_index]

In [11]:
# obtaining random input
input = user.sample()
input

Unnamed: 0,Article_id,Title,Description,Date,URL,UserId,SessionId,Article Rank,Click,Time Spent (seconds)
256,512,Government expects speedy launch of single-dos...,An application seeking regulatory approval for...,"May 28, 2021 7:03:20 am",https://indianexpress.com/article/india/govern...,502,497,3,True,82


In [12]:
# passing the input
recommendation(224, cosine_similarity)

290    Surat: Dalit youths attempt suicide over actor...
61     J&K journalist booked for WhatsApp status over...
595    Saurabh Chaudhary leads way in stellar show by...
317                       Jayant Chaudhary new RLD chief
211    Two shot by gunmen in J&K’s Anantnag; police s...
83     Ghaziabad: Swimming pool owner booked for lett...
223    Arunachal court remands Punjab YouTuber to six...
313    Black fungus in Haryana: 413 cases, no Covid i...
272    Abduction by ‘fake CBI officers’: Kolkata Poli...
622    Elavenil Valarivan, Saurabh Chaudhary put up s...
Name: Title, dtype: object