In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate
from surprise import SVD, SVDpp
from sklearn.preprocessing import MinMaxScaler

In [3]:
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

In [244]:
colNames = ['user_id','criterion1', 'criterion2', 'criterion3', 'criterion4', 'overall', 'movie_id', 'number']
data_full = pd.read_csv("E:/Research Project/data_movies.txt", sep ="\t",header= None , names= colNames )
data_full.head()

Unnamed: 0,user_id,criterion1,criterion2,criterion3,criterion4,overall,movie_id,number
0,1,6,6,8,12,8,2,1
1,1,9,11,10,9,10,26,2
2,1,6,10,9,8,7,61,3
3,1,6,6,6,5,5,86,4
4,1,10,11,10,9,10,132,5


In [245]:
data_full_1 = data_full.loc[(data_full['user_id'] >= 1) & (data_full['user_id'] <= 5000)]
data_full_1.shape
print( "The data has ", len(data_full_1.movie_id.unique()), " uniques id and", len(data_full_1.user_id.unique()),"unique users")

The data has  975  uniques id and 5000 unique users


In [246]:
data_full = data_full_1 

In [247]:
data_full.shape

(51391, 8)

In [248]:
print( "The data has ", len(data_full.movie_id.unique()), " uniques id and", len(data_full.user_id.unique()),"unique users")

The data has  975  uniques id and 5000 unique users


In [249]:
# X = data_full[['user_id','criterion1', 'criterion2', 'criterion3', 'criterion4', 'movie_id']]
# y = data_full['overall']
# from sklearn.model_selection import train_test_split
# full_train, full_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# print(full_test.shape, full_train.shape)

In [250]:
data_full.shape

(51391, 8)

In [251]:
# creating a list of individual criteria values
list_of_criteria = {}
dfs = ('df1', 'df2', 'df3', 'df4')
criterion = ('criterion1', 'criterion2', 'criterion3', 'criterion4')
for df, ctn in zip(dfs, criterion):
    print(df, ctn)
    full_dataset = data_full[['user_id',"movie_id",ctn]]
    list_of_criteria[df] = full_dataset

df1 criterion1
df2 criterion2
df3 criterion3
df4 criterion4


In [252]:
list(list_of_criteria.keys())[0:]

['df1', 'df2', 'df3', 'df4']

In [253]:
list_of_criteria['df4'].head()

Unnamed: 0,user_id,movie_id,criterion4
0,1,2,12
1,1,26,9
2,1,61,8
3,1,86,5
4,1,132,9


## Individual criteria SVD predictions

In [254]:
def compute_svd_predictions(criteria_data):
    reader = Reader()
    #dataset creation
    criteria_data.columns =["userId", "movieId", "rating"]
    
    # scaling to a scale of 5 
    min_max_scaler = MinMaxScaler()
    rate = criteria_data[['rating']].values.astype(float)
    rate_scaled = min_max_scaler.fit_transform(rate)
    criteria_data[['rating']]=rate_scaled*5
    
    reader = Reader()
    svd = SVD()
        
    data = Dataset.load_from_df(criteria_data, reader)
    fullTrainSet = data.build_full_trainset()
    
    training_data, testing_data = train_test_split(data, test_size=1)
    svd.fit(training_data)
#     print(type(training_data))
    pred = svd.test(testing_data)
    pdf = pd.DataFrame(pred)
#     pred_data = pdf[['uid',"iid","est"]]
#     pd.merge(predicted_data, pred_data)
    
    return pdf

In [258]:
new_dataset = list_of_criteria['df1'][["userId","movieId"]]
# new_dataset = new_dataset.set_index(['userId','movieId'])
new_dataset.head()

Unnamed: 0,userId,movieId
0,1,2
1,1,26
2,1,61
3,1,86
4,1,132


##  Creating new dataset based on ealuated multiple criteria

In [256]:
new_full_data = data_full.drop(columns = "number")
new_full_data=new_full_data[["user_id", "movie_id", "criterion1", "criterion2", "criterion3", "criterion4", "overall"]]
new_full_data.columns =["userId", "movieId", "df1", "df2", "df3", "df4","overall"]
new_full_data.head()

Unnamed: 0,userId,movieId,df1,df2,df3,df4,overall
0,1,2,6,6,8,12,8
1,1,26,9,11,10,9,10
2,1,61,6,10,9,8,7
3,1,86,6,6,6,5,5
4,1,132,10,11,10,9,10


In [259]:
from surprise.model_selection import train_test_split
import random
random.seed(108)
for key in list_of_criteria.keys():
    print(key)
    pred_data = compute_svd_predictions(list_of_criteria[key])
    pred_data.columns =["userId", "movieId", "Orating", key,"details"]
    predicted_data = pred_data.drop(columns=["Orating","details"])
    predicted_data = predicted_data.set_index(['userId','movieId'])
#     print(predicted_data.head())
    new_dataset = predicted_data.combine_first(new_dataset.set_index(['userId','movieId'])).reset_index()
    print(new_dataset.shape)

df1
(51391, 3)
df2
(51391, 4)
df3
(51391, 5)
df4
(51391, 6)


In [260]:
# normalise the ratings of the new_full_data df columns on 1-5 scale
min_max_scaler = MinMaxScaler()

new_full_data[['df1']] = min_max_scaler.fit_transform(new_full_data[['df1']].values.astype(float))*5
new_full_data[['df2']] = min_max_scaler.fit_transform(new_full_data[['df2']].values.astype(float))*5
new_full_data[['df3']] = min_max_scaler.fit_transform(new_full_data[['df3']].values.astype(float))*5
new_full_data[['df4']] = min_max_scaler.fit_transform(new_full_data[['df4']].values.astype(float))*5
new_full_data[['overall']] = min_max_scaler.fit_transform(new_full_data[['overall']].values.astype(float))*5

new_full_data.head()


Unnamed: 0,userId,movieId,df1,df2,df3,df4,overall
0,1,2,2.083333,2.083333,2.916667,4.583333,2.916667
1,1,26,3.333333,4.166667,3.75,3.333333,3.75
2,1,61,2.083333,3.75,3.333333,2.916667,2.5
3,1,86,2.083333,2.083333,2.083333,1.666667,1.666667
4,1,132,3.75,4.166667,3.75,3.333333,3.75


In [261]:
new_dataset = new_dataset.set_index(['userId','movieId'])
new_full_data = new_dataset.combine_first(new_full_data.set_index(['userId','movieId'])).reset_index()
new_full_data.head()

Unnamed: 0,userId,movieId,df1,df2,df3,df4,overall
0,1,2,2.083333,2.083333,2.916667,4.583333,2.916667
1,1,26,3.333333,4.166667,3.75,3.333333,3.75
2,1,61,2.083333,3.75,3.333333,2.916667,2.5
3,1,86,2.083333,2.083333,2.083333,1.666667,1.666667
4,1,132,3.75,4.166667,3.75,3.333333,3.75


In [262]:
from sklearn.model_selection import train_test_split

X=new_full_data.iloc[:,:-1] ## independent features
y=new_full_data.iloc[:,-1] ## dependent features


In [263]:
X.shape

(51391, 6)

## Lasso Regression to compute the overall criteria

In [264]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

lasso=Lasso()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
lasso_regressor=GridSearchCV(lasso,parameters,scoring='neg_mean_squared_error',cv=5)

In [265]:
lasso_regressor.fit(X,y)
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)

{'alpha': 0.001}
-0.226096059373865


## Ridge regression to compute the overall criteria values 

In [266]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV


ridge=Ridge()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
ridge_regressor=GridSearchCV(ridge,parameters,scoring='neg_mean_squared_error',cv=5)

In [267]:
ridge_regressor.fit(X,y)
print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)

{'alpha': 100}
-0.22609580649986538


In [268]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

prediction_lasso=lasso_regressor.predict(X_test)
prediction_ridge=ridge_regressor.predict(X_test)

In [269]:
y_test.shape

(15418,)

In [270]:
MSE_Lasso = np.square(np.subtract(y_test,prediction_lasso)).mean() 
RMSE_lasso = np.sqrt(np.mean((y_test-prediction_lasso)**2))
print(MSE_Lasso, ' is the MSE and ', RMSE_lasso, " is the RMSE for lasso regression")

0.22754742631309144  is the MSE and  0.4770193144025632  is the RMSE for lasso regression


In [271]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, prediction_lasso)

0.2844520254344407

In [272]:
MSE_Ridge = np.square(np.subtract(y_test,prediction_ridge)).mean() 
RMSE_ridge = np.sqrt(np.mean((y_test-prediction_ridge)**2))
print(MSE_Ridge, ' is the MSE and ', RMSE_ridge, " is the RMSE for Ridge regression")

0.2275468611354827  is the MSE and  0.47701872199682344  is the RMSE for Ridge regression


In [273]:
mean_absolute_error(y_test, prediction_ridge)

0.2844530245386916

In [274]:
type(y_test), type(prediction_lasso)

(pandas.core.series.Series, numpy.ndarray)

In [275]:
Actual = pd.DataFrame(y_test).reset_index(drop=True)
Actual['overall'] = np.where(Actual['overall'] <= 2.3, "Bad","Good")
Actual["overall"].unique(), Actual.shape

(array(['Good', 'Bad'], dtype=object), (15418, 1))

In [276]:
Predicted = pd.DataFrame({'overall': prediction_lasso})
Predicted['overall'] = np.where(Predicted['overall'] <= 2.3, "Bad","Good")
Predicted["overall"].unique(), Predicted.shape

(array(['Good', 'Bad'], dtype=object), (15418, 1))

In [277]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Actual["overall"], Predicted["overall"])

array([[ 2570,   463],
       [  193, 12192]], dtype=int64)

In [278]:
from sklearn.metrics import recall_score
recall_score(Actual["overall"], Predicted["overall"], average='macro')

0.9158812476032383

In [279]:
from sklearn.metrics import f1_score
f1_score(Actual["overall"], Predicted["overall"], average='macro')

0.9303102062235274

In [280]:
Predicted = pd.DataFrame({'overall': prediction_ridge})
Predicted['overall'] = np.where(Predicted['overall'] <= 2.3, "Bad","Good")
Predicted["overall"].unique(), Predicted.shape

(array(['Good', 'Bad'], dtype=object), (15418, 1))

In [281]:
from sklearn.metrics import f1_score
f1_score(Actual["overall"], Predicted["overall"], average='macro')

0.9302132247955739