# Data Recommender Engine


## Pre-processing

In [None]:
# import
import pandas as pd 
import numpy as np

In [None]:
# load the cleaned review data from disk
data=pd.read_csv('Data/data_cleaned.csv')

# work on a copy so the frame held in `data` stays untouched
df=data.copy()

In [None]:
# drop the first column (the positional slice keeps every column from position 1 onward)
# NOTE(review): presumably a leftover index column saved with the CSV -- confirm

df = df.iloc[: , 1:]

In [None]:
# preview the first rows after dropping the first column
df.head()

In [None]:
# de-dummy the category variables
# the categories were turned into dummy variables so duplicate reviews could be
# aggregated; here they are collapsed back into one list-valued 'tags' column

CATEGORY_PREFIX = 'category_'

# capture the dummy columns BEFORE 'tags' is added, otherwise the positional
# slice would also pick up the new 'tags' column itself
# (the first 7 columns are the non-dummy features selected further down)
dummy_cols = list(df.columns[7:])

# start every row with an empty string, then append the name of each active dummy
df['tags'] = ''
for col_name in dummy_cols:
    df.loc[df[col_name] == 1, 'tags'] = df['tags'] + ' ' + col_name

# keep only the non-dummy features; .copy() makes the selection an independent
# frame so the assignment below does not raise SettingWithCopyWarning
df = df[['podcast_id', 'author_id', 'created_at', 'podcast', 'title', 'rating',
         'content', 'tags']].copy()

# split the accumulated string into a list and strip the "category_" prefix;
# a name without the prefix is kept whole instead of losing its first 9 characters
# NOTE(review): splitting on whitespace assumes dummy column names contain no spaces
df['tags'] = df['tags'].apply(
    lambda x: [
        w[len(CATEGORY_PREFIX):] if w.startswith(CATEGORY_PREFIX) else w
        for w in x.split()
    ]
)

In [None]:
# reduce the date+time value in the 'created_at' column to a 'year' string

# parse the raw values into datetimes
df['created_at'] = pd.to_datetime(df['created_at'])

# format every timestamp as its four-digit year
df['year'] = df['created_at'].map(lambda stamp: stamp.strftime('%Y'))

In [None]:
# cast rating to string so it can be concatenated with the other text features later on
# (the same cast is repeated after the column selection further down)
df['rating'] = df.rating.astype(str)

In [None]:
# pre-processing content

# importing all packages
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize.treebank import TreebankWordDetokenizer
import nltk
nltk.download('wordnet')

# pseudo-pipeline
# the tokenizer, stop-word set and lemmatizer are identical for every review,
# so they are built once (on first use) instead of once per call
_CLEANING_TOOLS = None


def _get_cleaning_tools():
    """Return the shared (tokenizer, stop-word set, lemmatizer), creating them on first use."""
    global _CLEANING_TOOLS
    if _CLEANING_TOOLS is None:
        _CLEANING_TOOLS = (
            # \w+ keeps runs of word characters, which drops punctuation
            RegexpTokenizer(r'\w+'),
            # a frozenset gives constant-time membership tests
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _CLEANING_TOOLS


def cleaning_content(x):
    """Tokenize, lowercase, remove stop words from, and lemmatize one review text.

    Parameters
    ----------
    x : str
        Raw review content. Any non-string value (for example a missing
        value read as NaN) is treated as having no content.

    Returns
    -------
    list of str
        The lemmatized tokens in their original order; an empty list when
        `x` is not a string.

    Notes
    -----
    NOTE(review): this needs the NLTK 'stopwords' corpus in addition to
    'wordnet'; only 'wordnet' is downloaded in this file -- confirm the
    stop-word corpus is installed.
    """
    if not isinstance(x, str):
        return []

    tokenizer, stop_words, lemmatizer = _get_cleaning_tools()

    # tokenize (removing punctuation) and turn all characters into lowercase
    lowered = [w.lower() for w in tokenizer.tokenize(x)]

    # drop stop words, then lemmatize what is left
    return [lemmatizer.lemmatize(token) for token in lowered if token not in stop_words]

# run the cleaning pipeline over every review text
df['lemmatized'] = df['content'].map(cleaning_content)



In [None]:
# keep only the necessary columns
# .copy() detaches the selection from the previous frame, so overwriting or
# adding columns afterwards does not raise SettingWithCopyWarning
df = df[['podcast_id', 'author_id', 'podcast', 'year', 'rating', 'lemmatized', 'tags']].copy()

# make sure rating is a string so it can be concatenated into the soup
df['rating'] = df['rating'].astype(str)

In [None]:
# create a soup column which combines all the features
# since the dataframe has some columns which are strings and others which are lists of strings, additional processing needs to be performed

# text shared by both string soups: author, year and podcast separated by single spaces
base_soup = df['author_id'] + ' ' + df['year'] + ' ' + df['podcast']

# 'string_soup' additionally carries the rating as its last token
df['string_soup'] = base_soup + ' ' + df['rating']

# 'string_soup_r' leaves the rating out, so the effect of the rating can be tested separately later
df['string_soup_r'] = base_soup


# combine all lists 
def extender(x):
    """Join a row's 'tags' and 'lemmatized' token lists into one space-separated string."""
    tokens = x['tags'] + x['lemmatized']
    return ' '.join(tokens)
# apply extender to every row (axis=1 passes each row to the function)
df['lst_soup']=df.apply(extender, axis=1)


# combine a string soup and the list soup into one soup
# a single parameterised function replaces two definitions that shared one name
def extender_soup(x, string_col='string_soup_r'):
    """Join one of a row's string soups with its list soup.

    Parameters
    ----------
    x : row (pandas Series or mapping) holding `string_col` and 'lst_soup'.
    string_col : str
        Name of the string-soup column to use. The default is
        'string_soup_r' (no rating), so that a plain ``extender_soup(row)``
        behaves like the definition that used to remain bound once this
        cell had run.

    Returns
    -------
    str
        ``x[string_col] + ' ' + x['lst_soup']``.
    """
    return x[string_col] + ' ' + x['lst_soup']


# soup with the rating included (extra keyword arguments of apply are passed to the function)
df['soup'] = df.apply(extender_soup, axis=1, string_col='string_soup')

# soup without the rating
df['soup_r'] = df.apply(extender_soup, axis=1, string_col='string_soup_r')

In [None]:
# build the two frames the recommenders work on, keeping only the needed columns

# podcast id plus the soup that contains the rating as one of its tokens
df_soup = df[['podcast_id', 'soup']].copy()

# podcast id, the soup WITHOUT the rating, and the rating as its own column.
# The text comes from 'soup_r' so the rating is not counted twice (once as a
# token and once as the separate column); it is renamed to 'soup' so code that
# reads row['soup'] keeps working.
df_soup_r = df[['podcast_id', 'soup_r', 'rating']].rename(columns={'soup_r': 'soup'})

In [None]:
# preview the frame that keeps rating as a separate column
df_soup_r.head()

In [None]:
# preview the frame holding only podcast_id and soup
df_soup.head()

## Models

### Cosine Similarity

In [None]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Run Vectorizer
# max_features=5000 caps the size of the vocabulary
count = CountVectorizer(max_features=5000)
# learn the vocabulary from the soups and turn each soup into a row of token counts
count_matrix = count.fit_transform(df_soup['soup'])
# show the dimensions of the resulting matrix
count_matrix.shape

In [None]:
# display the matrix object itself
count_matrix

In [None]:
# turn matrix into dataframe with a single 'vectors' column
# NOTE(review): the cosine recommender reads row['vectors'] as one vector per review,
# so this relies on pandas placing one row of count_matrix in each cell -- confirm this
# holds for the installed pandas / scipy versions
df_countmatrix=pd.DataFrame(count_matrix, columns=['vectors'])

# merge podcast_id and countmatrix; join() aligns the two frames on their index
df_countmatrix=df_soup[['podcast_id']].join(df_countmatrix)
df_countmatrix.head()

In [None]:
# import cosine_similarity from sklearn.metrics.pairwise
from sklearn.metrics.pairwise import cosine_similarity

def recommend_podcast_cosine(df, index):
    """Recommend the review most similar to the one at `index`, by cosine similarity.

    Only reviews belonging to a different podcast than the input review are
    considered.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'podcast_id' column and a 'vectors' column.
        NOTE(review): assumes each 'vectors' cell is a 1 x n_features scipy
        sparse row -- confirm against how the frame is built.
    index : int
        Position (as used by ``df.iloc``) of the input review.

    Returns
    -------
    tuple
        ``(label, score)`` where `label` is the index label of the best
        matching row and `score` its cosine similarity as a float.
        ``(None, None)`` when no row belongs to a different podcast.
    """
    # local import keeps this function self-contained
    from scipy.sparse import vstack

    row = df.iloc[index]

    # candidates are all reviews of other podcasts
    candidates = df[df['podcast_id'] != row['podcast_id']]
    if candidates.empty:
        return None, None

    # stack the candidate rows and score them all in one call instead of
    # calling cosine_similarity once per row
    candidate_matrix = vstack(candidates['vectors'].tolist())
    scores = cosine_similarity(row['vectors'], candidate_matrix).ravel()

    # cosine similarity grows with likeness, so the best match is the LARGEST score
    best = int(scores.argmax())
    return candidates.index[best], float(scores[best])

In [None]:
# recommend a podcast for one sample review, then show the input and the recommended row
i = 1234
imax2 = recommend_podcast_cosine(df_countmatrix, i)
x = imax2[0]
for position in (i, x):
    print(list(df_soup.iloc[position]))

### Jaccard Similarity

#### Jaccard Similarity

In [None]:
# function to compute the Jaccard similarity score
# there is no prebuilt Jaccard function that takes both floating-point numbers and strings, so I created my own

def jaccard_similarity(a, b):
    """Return the Jaccard similarity of two whitespace-separated token strings.

    Each string is split on whitespace and reduced to its set of distinct
    tokens; the score is ``len(intersection) / len(union)``.

    Parameters
    ----------
    a, b : str
        Token strings (for example two 'soup' values).

    Returns
    -------
    float
        A value between 0.0 and 1.0. When neither string contains a token
        the ratio would be 0/0; 0.0 is returned so that two empty soups are
        never reported as a match.
    """
    # convert to sets of distinct tokens
    tokens_a = set(a.split())
    tokens_b = set(b.split())

    union = tokens_a | tokens_b
    if not union:
        return 0.0

    # calculate jaccard similarity
    return float(len(tokens_a & tokens_b)) / len(union)

In [None]:
# Return recommended Podcast ID for input index of a review
def recommend_podcast_jaccard(df, index):
    """Recommend the review most similar to the one at `index`, by Jaccard score.

    Only reviews belonging to a different podcast than the input review are
    considered.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'podcast_id' and 'soup' columns.
    index : int
        Position (as used by ``df.iloc``) of the input review.

    Returns
    -------
    tuple
        ``(label, score)`` where `label` is the index label of the row with
        the greatest Jaccard score and `score` that score. ``(None, 0)``
        when no other podcast's review scores above 0.
    """
    row = df.iloc[index]
    target_id = row['podcast_id']
    target_soup = row['soup']

    smax = 0
    # defined up front so the return below cannot hit an unbound name
    # when no candidate beats the initial score
    imax2 = None

    # walk the needed columns side by side; this avoids building a Series per row
    for index2, podcast_id, soup in zip(df.index, df['podcast_id'], df['soup']):
        # skip reviews of the same podcast
        if podcast_id == target_id:
            continue

        s = jaccard_similarity(target_soup, soup)

        # keep the greatest jaccard score seen so far
        if s > smax:
            smax = s
            imax2 = index2
    return imax2, smax

In [None]:
# results

# recommend a podcast for one sample review, then show the input and the recommended row
i = 12345
imax2 = recommend_podcast_jaccard(df_soup, i)
x = imax2[0]
for position in (i, x):
    print(list(df_soup.iloc[position]))

#### Jaccard score with weighted rating

In [None]:
# Return recommended Podcast ID for input index of a review with a weighted rating
# instead of including the rating within the Jaccard score, the rating is multiplied by the Jaccard score

def recommend_podcast_jaccard_rating(df, index):
    """Recommend a review by Jaccard score weighted with the candidate's rating.

    Only reviews belonging to a different podcast than the input review are
    considered. Each candidate's Jaccard score is multiplied by that
    candidate's rating before the comparison.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'podcast_id', 'soup' and 'rating' columns; every rating
        must be convertible with ``float()``.
    index : int
        Position (as used by ``df.iloc``) of the input review.

    Returns
    -------
    tuple
        ``(label, score)`` where `label` is the index label of the row with
        the greatest weighted score and `score` that score. ``(None, 0)``
        when no other podcast's review scores above 0.
    """
    row = df.iloc[index]
    target_id = row['podcast_id']
    target_soup = row['soup']

    smax = 0
    # defined up front so the return below cannot hit an unbound name
    # when no candidate beats the initial score
    imax2 = None

    # walk the needed columns side by side; this avoids building a Series per row
    for index2, podcast_id, soup, rating in zip(
        df.index, df['podcast_id'], df['soup'], df['rating']
    ):
        # skip reviews of the same podcast
        if podcast_id == target_id:
            continue

        # weighted rating score: jaccard score times the candidate's rating
        s = float(jaccard_similarity(target_soup, soup)) * float(rating)

        if s > smax:
            smax = s
            imax2 = index2
    return imax2, smax

In [None]:
# no real difference in results

i = 1234
# the weighted recommender reads the 'soup' and 'rating' columns, which df_soup_r
# provides (df_countmatrix only carries 'podcast_id' and 'vectors')
imax2 = recommend_podcast_jaccard_rating(df_soup_r, i)
x = imax2[0]
print(list(df_soup_r.iloc[i]))
print(list(df_soup_r.iloc[x]))