In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('Sentiment.csv')
data = data[data.User != "None"]
data.head()

Unnamed: 0,Title,Comment,User,Sentiment
0,A Clockwork Orange,"Loved the book, movie was mediocre",19lins90,0.5994
1,A Clockwork Orange,My buddy told me it was one of his all time fa...,Anefor,0.6369
2,A Clockwork Orange,I think it's hard to put the extreme emotions ...,cheddarfire,-0.431
3,A Clockwork Orange,I haven't seen it but I'm tempted to watch it....,DanceFactory,-0.5965
4,A Clockwork Orange,"I saw it once, and really enjoyed it but I've ...",High7323,0.177


In [2]:
#remove None users
noNone = data[data.User != "None"]
noNone.head()
noNone.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 410 entries, 0 to 454
Data columns (total 4 columns):
Title        410 non-null object
Comment      410 non-null object
User         410 non-null object
Sentiment    410 non-null float64
dtypes: float64(1), object(3)
memory usage: 16.0+ KB


In [38]:
#Create a BOW representation for each title
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vec = CountVectorizer()
comment_count = count_vec.fit_transform(data['Comment'])


tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf.fit(comment_count)

item_profiles = [] #array where profiles will be stored
df_idf_array = [] #array of dataframes where the idf will be stored

titles = data['Title'].unique() #makes an array of titles
for t in titles:
    data1 = data[data['Title'] == t]
    comments = [' '.join(data1['Comment'])]#takes all topics with that title
    item_profiles.append(tfidf.transform(count_vec.transform(comments))) #append bow of all comments in the array
    df_idf = pd.DataFrame(tfidf.idf_, index=count_vec.get_feature_names(), columns=['idf_weights']) #create a dataframe with the idf_ values for the current topic
    df_idf.sort_values(by=['idf_weights'])
    df_idf_array.append(df_idf)
print(item_profiles[0].toarray())
print(len(count_vec.vocabulary_))
item_profiles[0].shape

[[0. 0. 0. ... 0. 0. 0.]]
3103


array(['A Clockwork Orange', 'A Serious Man', 'Battle Beyond the Stars',
       'Cannibal Holocaust', "Carlito's Way", 'Contagion',
       'Cowboy Bebop : The Movie', 'CRONOS', 'Dogtooth', 'Dreamcatcher',
       'Dunkirk', 'Edge of Tomorrow', 'Face/Off', 'Kenshin: Samurai X',
       'Monuments Men', 'Prince of Persia movie', 'RAN',
       'Requiem For a Dream', 'Rocky franchise', 'Shame!!!',
       'Solo: a Star Wars Story', 'Southern Comfort',
       'Star Wars The Force Awakens', 'The Godfather Part 3',
       'The Kingsman', 'The One', 'The Place Beyond the Pines',
       'The Princess Bride', 'Tombstone', 'Train to Busan',
       'Twilight Zone: The Movie', 'Vader: Resurrection', 'Wild At Heart'],
      dtype=object)

In [4]:
#get user unique ids
users = data['User'].unique()
users_id = {}

for i in range(0, len(users)):
    users_id.update({i:users[i]})

users_id[0]

#get title unique ids
titles_id = {}

for i in range(0, len(titles)):
    titles_id.update({i:titles[i]})

titles_id[0]

'A Clockwork Orange'

In [5]:
#assignign each comment author his id

u_id = []
for u in data['User']:
    for i in range(0,len(users_id)):
        if u == users_id[i]:
            u_id.append(i)


#assignign each comment author his id

t_id = []
for t in data['Title']:
    for i in range(0,len(titles_id)):
        if t == titles_id[i]:
            t_id.append(i)
            
data['User_id'] = u_id
data['Title_id'] = t_id

data


Unnamed: 0,Title,Comment,User,Sentiment,User_id,Title_id
0,A Clockwork Orange,"Loved the book, movie was mediocre",19lins90,0.5994,0,0
1,A Clockwork Orange,My buddy told me it was one of his all time fa...,Anefor,0.6369,1,0
2,A Clockwork Orange,I think it's hard to put the extreme emotions ...,cheddarfire,-0.4310,2,0
3,A Clockwork Orange,I haven't seen it but I'm tempted to watch it....,DanceFactory,-0.5965,3,0
4,A Clockwork Orange,"I saw it once, and really enjoyed it but I've ...",High7323,0.1770,4,0
...,...,...,...,...,...,...
450,Wild At Heart,I wish this would be on a streaming sight or r...,Slashman78,0.2431,382,32
451,Wild At Heart,I used to have a giant poster of this movie ye...,surfinbird,0.0000,383,32
452,Wild At Heart,Batshit crazy but always watchable.,T_Kelly_Lee,-0.1779,384,32
453,Wild At Heart,I think this is cool if only because Su Tissue...,vivnsam,0.3182,385,32


In [6]:
#calculate user profiles as follow:
#sum all the item profiles for the films he gave a rating and multiply each of that for the corresponding sentiment

user_profile = []

for i in range (0,len(users)):
    user_profile.append(0)

#we are using this indexing so that we take the user's id from the df and insert in his corresponing item
for index, row in data.iterrows():
    user_profile[row['User_id']] += (item_profiles[row['Title_id']] * row['Sentiment'])

print(user_profile[0].shape)
print(user_profile[26].toarray())

(1, 3103)

In [15]:
#we are going to use the cosine function given by scipy library
from scipy import spatial

cd = 1 - spatial.distance.cosine(user_profile[0].toarray(), item_profiles[4].toarray())
cd

0.4720464723938671

In [75]:
#creation of a matrix of nxm which contains the cosine distance between each user and each item (n = number of users, m = number of items)
#doing that we will have a rating for each item

matrix = np.zeros(shape=(len(users), len(titles)))
matrix.shape

for i in range(0,len(users)):
    for j in range(0,len(titles)):
        matrix[i,j] = 1 - spatial.distance.cosine(user_profile[i].toarray(), item_profiles[j].toarray())

print(1 - spatial.distance.cosine(user_profile[0].toarray(), item_profiles[0].toarray()))

(1, 3103)
0.9999999999999999


In [72]:
#create a df which contains all the ratings calculated before so that we can build a utility matrix

daf = pd.DataFrame(columns = ['user_id', 'item_id', 'rating'])
us_index = []
it_index = []
rating = []

for i in range(0,len(users)):
    for j in range(0,len(titles)):
        us_index.append(i)
        it_index.append(j)
        rating.append(matrix[i,j])

daf['user_id'] = us_index
daf['item_id'] = it_index
daf['rating'] = rating
daf.head()

Unnamed: 0,user_id,item_id,rating
0,0,0,1.0
1,0,1,0.417715
2,0,2,0.262985
3,0,3,0.439898
4,0,4,0.472046


In [74]:
#this is our utility matrix

um = daf.pivot_table(index='user_id', columns='item_id', values='rating')
um.head()

item_id,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.417715,0.262985,0.439898,0.472046,0.334766,0.38724,0.114935,0.175678,0.381682,...,0.506979,0.494291,0.225825,0.329254,0.527763,0.344943,0.166801,0.266061,0.266763,0.384255
1,1.0,0.417715,0.262985,0.439898,0.472046,0.334766,0.38724,0.114935,0.175678,0.381682,...,0.506979,0.494291,0.225825,0.329254,0.527763,0.344943,0.166801,0.266061,0.266763,0.384255
2,-1.0,-0.417715,-0.262985,-0.439898,-0.472046,-0.334766,-0.38724,-0.114935,-0.175678,-0.381682,...,-0.506979,-0.494291,-0.225825,-0.329254,-0.527763,-0.344943,-0.166801,-0.266061,-0.266763,-0.384255
3,-1.0,-0.417715,-0.262985,-0.439898,-0.472046,-0.334766,-0.38724,-0.114935,-0.175678,-0.381682,...,-0.506979,-0.494291,-0.225825,-0.329254,-0.527763,-0.344943,-0.166801,-0.266061,-0.266763,-0.384255
4,1.0,0.417715,0.262985,0.439898,0.472046,0.334766,0.38724,0.114935,0.175678,0.381682,...,0.506979,0.494291,0.225825,0.329254,0.527763,0.344943,0.166801,0.266061,0.266763,0.384255


In [79]:
#compare true rating with calculated rating using users who gave more than one rating
duplicated = data[data['User'].duplicated()].sort_values(by=['User'])
duplicated


Unnamed: 0,Title,Comment,User,Sentiment,User_id,Title_id
368,The Princess Bride,The best movie ever made.,2-Shanks,0.6369,313,27
110,Dunkirk,I liked it less and less the more times I saw it.,IWW4,0.4215,26,10
150,Edge of Tomorrow,It was great until Cruise and Blunt got to the...,IWW4,0.6249,26,11
203,RAN,"It is incredible.. The scale of the battles, t...",IWW4,-0.34,26,16
448,Wild At Heart,I don't really like it all.,IWW4,-0.3241,26,32
243,Solo: a Star Wars Story,I have always found origin stories to be a was...,IWW4,-0.6486,26,20
438,Twilight Zone: The Movie,"I liked the Nightmare at 50,000 feet segment. ...",JosephFurguson,-0.2354,89,30
360,The Place Beyond the Pines,"As a remake of Synecdoche*, NY, I admire it fo...",JosephFurguson,0.713,89,26
244,Solo: a Star Wars Story,>But i won't likely see it since i ultimately ...,Kylon1138,0.1531,103,20
206,RAN,I honestly can't tell if you're trolling or if...,Sabnitron,0.5859,149,16


In [86]:
print("True score: 0.4215 --- Calculated score: {}".format(matrix[26][10]))
print("True score: 0.6249 --- Calculated score: {}".format(matrix[26][11]))
print("True score:-0.3400 --- Calculated score: {}".format(matrix[26][16]))
print("True score:-0.6486 --- Calculated score: {}".format(matrix[26][32]))

True score: 0.4215 --- Calculated score: 0.23035013351183253
True score: 0.6249 --- Calculated score: 0.4291393513552464
True score:-0.3400 --- Calculated score: -0.2687844932175829
True score:-0.6486 --- Calculated score: -0.199706856860544
