In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the WordNet corpus that the WordNetLemmatizer imported above relies on.
nltk.download('wordnet')


class PD(object):
    """Content-based item recommender using bag-of-words cosine similarity.

    Each item's text is its ``CATEGORY``, ``company`` and a cleaned,
    lemmatized, stop-word-free version of ``des``. Items are compared with
    cosine similarity over ``CountVectorizer`` term counts.

    Attributes set by the methods below:
        df: private working copy of the input frame (indexed by ``SID`` once
            ``pre_process`` has run).
        indices: Series mapping row position -> ``SID``.
        count_matrix: document-term count matrix built from ``df['final']``.
        cos_sim: square item-to-item cosine similarity matrix.
    """

    # Text inside square brackets is dropped entirely ('.' does not cross lines).
    _BRACKETED = re.compile(r'\[.*?\]')
    # Every character that is not an ASCII letter is treated as a separator.
    _NON_LETTER = re.compile(r'[^a-zA-Z]')

    def __init__(self, df_data):
        """Keep only the columns the recommender needs.

        Args:
            df_data: DataFrame containing at least the columns ``SID``,
                ``NAME``, ``CATEGORY``, ``company`` and ``des``.

        Raises:
            KeyError: if any of the required columns is missing.
        """
        # Explicit copy: new columns are added to this frame later, and writing
        # into an un-copied selection triggers pandas' SettingWithCopyWarning.
        self.df = df_data[['SID','NAME','CATEGORY','company','des']].copy()

    def lem_word(self,text):
        """Lemmatize every token as adjective, then verb, then noun.

        Args:
            text: iterable of word tokens.

        Returns:
            List of lemmatized tokens, in the same order as the input.
        """
        lem = WordNetLemmatizer()
        # Single pass; each token still goes through the 'a' -> 'v' -> 'n' chain.
        return [
            lem.lemmatize(lem.lemmatize(lem.lemmatize(word, pos = 'a'), pos = 'v'), pos = 'n')
            for word in text
        ]

    def clean_text(self, text):
        """Normalize a raw description into lowercase, space-separated words.

        Steps: lowercase, drop ``[bracketed]`` spans, turn every non-letter
        into a separator, discard one-character words.

        Digits act as separators, so a word containing digits is split rather
        than removed (``'mp3player'`` -> ``'mp player'``).

        Args:
            text: the raw description; non-string values (``None``, NaN, ...)
                are treated as an empty description.

        Returns:
            The cleaned text, possibly an empty string.
        """
        if not isinstance(text, str):
            # Missing descriptions would otherwise crash on ``.lower()``.
            return ''
        text = self._BRACKETED.sub(' ', text.lower())
        # After this substitution only letters and spaces remain, so separate
        # punctuation / newline / digit passes would have nothing left to match.
        text = self._NON_LETTER.sub(' ', text)
        # split() also collapses runs of spaces and trims both ends.
        return ' '.join(word for word in text.split() if len(word) > 1)

    def remove_sw(self,text):
        """Return the tokens of ``text`` that are not English stop words."""
        return [word for word in text if word not in ENGLISH_STOP_WORDS]

    def convert(self):
        """Build the document-term count matrix from ``self.df['final']``.

        Requires ``pre_process`` to have been run first.
        """
        count = CountVectorizer()
        return count.fit_transform(self.df['final'])

    def pre_process(self):
        """Derive the text features and index the frame by ``SID``.

        Adds the intermediate columns ``clean_des``, ``split_des``,
        ``lemmatize_text``, ``text_remove_stopwords``, ``combine`` and the
        combined ``final`` column, then sets ``self.indices``. It can be
        called more than once.
        """
        df = self.df
        df['clean_des'] = df['des'].apply(self.clean_text)
        df['split_des'] = df['clean_des'].apply(lambda x: x.split())
        df['lemmatize_text'] = df['split_des'].apply(self.lem_word)
        df['text_remove_stopwords'] = df['lemmatize_text'].apply(self.remove_sw)
        df['combine'] = df['text_remove_stopwords'].apply(' '.join)

        df['final'] = df['CATEGORY'] + ' ' + df['company'].apply(str) + ' ' + df['combine']

        # On a repeated call SID is already the index; setting it again would
        # raise KeyError, so only do it while SID is still a column.
        if 'SID' in df.columns:
            df = df.set_index('SID')
        self.df = df
        # Row position -> SID, used to translate between SIDs and matrix rows.
        self.indices = pd.Series(self.df.index)

    def calculate(self):
        """Compute, store and return the item-to-item cosine similarity matrix."""
        self.count_matrix = self.convert()
        self.cos_sim = cosine_similarity(self.count_matrix, self.count_matrix)
        return self.cos_sim

    def fit(self):
        """Run the full pipeline: text pre-processing, then similarity."""
        self.pre_process()
        self.calculate()

    def recommendations(self, sid, top = 10):
        """Return the most similar items for each requested SID.

        Args:
            sid: iterable of SIDs to get recommendations for.
            top: maximum number of similar items per SID.

        Returns:
            DataFrame with one column per requested SID; row ``k`` holds the
            SID of the ``k``-th most similar item. The requested item itself
            is never included.

        Raises:
            IndexError: if a requested SID is not present in the fitted data.
        """
        cosine_sim = self.cos_sim
        recommend_for_users = pd.DataFrame(columns=sid)
        # Position -> SID lookup, built once instead of once per neighbour.
        sid_by_position = list(self.df.index)
        limit = max(top, 0)

        for item_id in sid:
            matches = self.indices.index[self.indices == item_id]
            if len(matches) == 0:
                raise IndexError('SID %r is not present in the fitted data' % (item_id,))
            idx = matches[0]

            # Positions of all items ordered by descending similarity.
            ranked = pd.Series(cosine_sim[idx]).sort_values(ascending = False).index

            # Exclude the query item by position rather than assuming it is
            # ranked first: with ties or an all-zero row (empty text) it may
            # appear anywhere in the ordering.
            neighbours = [pos for pos in ranked if pos != idx][:limit]

            recommend_for_users[item_id] = [sid_by_position[pos] for pos in neighbours]

        return recommend_for_users

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Load the crawled items from JSON. PD (defined above) needs the columns
# SID, NAME, CATEGORY, company and des.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm it exists in your environment.
df = pd.read_json('/content/drive/MyDrive/Training AI/crawl/all.json')

In [None]:
# Wrap the loaded frame in the recommender; only SID, NAME, CATEGORY, company and des are kept.
test = PD(df)

In [None]:
# Clean and lemmatize the descriptions, then build the item-to-item cosine similarity matrix.
test.fit()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [None]:
# Top-5 similar items for several SIDs; each result column is one requested SID.
# SID 3 is requested twice, so its column appears twice (shown as `3` and `3.1` in the saved output).
test.recommendations([2,3,4,5,6,20,3], 5)

Unnamed: 0,2,3,4,5,6,20,3.1
0,14,22,18,22,31,33,22
1,225,40,11,39,29,15,40
2,194,11,40,29,26,25,11
3,195,13,13,38,10,16,13
4,207,33,22,3,9,14,33


In [None]:
# Request the 20 most similar items for SID 140.
# NOTE(review): the saved output below lists only 10 rows although 20 are requested —
# it was presumably produced by an earlier run; re-execute the cell to refresh it.
test.recommendations([140], 20)

Unnamed: 0,140
0,173
1,155
2,145
3,143
4,152
5,147
6,146
7,176
8,169
9,161
