In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# Record the wall-clock start time of the whole script; the last statements of
# the script subtract it from the end time to report the total run time.
a = datetime.datetime.now()

# compute cosine similarity of v1 to v2: (v1 dot v2) / (||v1|| * ||v2||)
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two equal-length vectors.

    Args:
        v1: First vector (1-D array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``. When either vector has zero
        length the ratio is undefined; 0.0 is returned instead of NaN.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    # 0/0 would yield NaN, and np.argsort places NaN last, so a NaN would be
    # ranked as the *most* similar neighbour by code that sorts similarities.
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

# Signed element-wise error
def _error(actual: np.ndarray, predicted: np.ndarray):
    """Return ``actual - predicted`` (positive where the prediction is too low)."""
    residual = actual - predicted
    return residual

# Mean Absolute Error
def mae(actual: np.ndarray, predicted: np.ndarray):
    """Return the mean of the absolute differences between actual and predicted."""
    absolute_errors = np.abs(_error(actual, predicted))
    return np.mean(absolute_errors)

# create similarity matrix
def pandas_sim(train_data, test_data):
    """Return the cosine similarity between every test row and every train row.

    The whole matrix is computed with one matrix product and one outer product
    of row norms, instead of one Python-level call per (test row, train row)
    pair.

    Args:
        train_data: DataFrame whose rows are the training vectors.
        test_data: DataFrame whose rows are the test vectors; it must have the
            same number of columns as ``train_data``.

    Returns:
        A float NumPy array of shape ``(len(test_data), len(train_data))``
        where entry ``[i, j]`` is the cosine similarity of test row ``i`` and
        train row ``j``. Pairs involving an all-zero row get similarity 0.0.
    """
    test_matrix = test_data.to_numpy(dtype=float)
    train_matrix = train_data.to_numpy(dtype=float)

    # Numerators: dot product of every test row with every train row.
    dot_products = test_matrix @ train_matrix.T
    # Denominators: ||test_i|| * ||train_j|| for every pair.
    norm_products = np.outer(
        np.linalg.norm(test_matrix, axis=1),
        np.linalg.norm(train_matrix, axis=1),
    )

    # Divide only where the denominator is non-zero; the remaining cells keep
    # the 0.0 they were initialised with, so no NaN enters the matrix (NaN
    # would be sorted last by np.argsort and look like the best neighbour).
    similarity = np.zeros_like(dot_products)
    np.divide(dot_products, norm_products, out=similarity, where=norm_products != 0)
    return similarity

# Weighted k-NN algorithm
def weighted(user_based_table, train_data, test_data, numpy_sim_matrix_df, sorted_sim_matrix, k):
    """Return the average MAE of similarity-weighted k-NN predictions, in percent.

    For every test row, the k training rows with the highest similarity are
    averaged with weights ``1 / (1 - similarity)`` and the result is compared
    with the test row through ``mae``. The per-row errors are averaged,
    divided by 5 and multiplied by 100 (presumably 5 is the top of the rating
    scale -- confirm against the data).

    Args:
        user_based_table: Unused; kept so existing callers keep working. The
            width of a prediction follows ``train_data``.
        train_data: DataFrame whose rows are the candidate neighbours.
        test_data: DataFrame whose rows are predicted; row ``i`` corresponds to
            row ``i`` of the two matrices below.
        numpy_sim_matrix_df: 2-D array, ``[i][j]`` is the similarity of test
            row ``i`` to train row ``j``.
        sorted_sim_matrix: 2-D integer array; each row holds train-row
            positions ordered by ascending similarity, so the most similar
            neighbours are at the end of the row.
        k: Number of neighbours to combine. If fewer than ``k`` training rows
            exist, all of them are used.

    Returns:
        The average error as a percentage (float).
    """
    similarities = np.asarray(numpy_sim_matrix_df, dtype=float)
    neighbour_order = np.asarray(sorted_sim_matrix)
    # Fetch the underlying arrays once instead of once per neighbour.
    train_values = train_data.values
    test_values = test_data.values
    # A similarity of exactly 1 (or slightly above, through rounding) would
    # make 1 / (1 - similarity) infinite or negative and turn the prediction
    # into NaN; clamp the distance to the smallest meaningful positive float.
    min_distance = np.finfo(float).eps

    error_sum = 0.0
    for i, order in enumerate(neighbour_order):
        # Most similar neighbours sit at the end of the row.
        nearest = order[::-1][:k]
        distances = np.maximum(1.0 - similarities[i, nearest], min_distance)
        weights = 1.0 / distances
        # Weighted average of the neighbours' rows.
        prediction = weights @ train_values[nearest] / weights.sum()
        error_sum += mae(test_values[i], prediction)
    # converts to % error
    return error_sum / len(neighbour_order) / 5 * 100

# Plain (unweighted) k-NN
def calc_mae(user_based_table, train_data, test_data, sorted_sim_matrix, k):
    """Return the average MAE of unweighted k-NN predictions, in percent.

    Each test row is predicted as the plain average of the k training rows
    listed last in its row of ``sorted_sim_matrix``; the prediction is scored
    with ``mae``. The per-row errors are averaged, divided by 5 and multiplied
    by 100.

    Args:
        user_based_table: Table whose number of columns sizes the prediction vector.
        train_data: DataFrame whose rows are the candidate neighbours.
        test_data: DataFrame whose rows are predicted.
        sorted_sim_matrix: Per test row, train-row positions; the last k
            entries are taken as the neighbours.
        k: Number of neighbours to average.

    Returns:
        The average error as a percentage (float).
    """
    n_items = len(user_based_table.keys())
    train_rows = train_data.values
    test_rows = test_data.values

    error_sum = 0.0
    for row_number, order in enumerate(sorted_sim_matrix):
        neighbour_sum = np.zeros(n_items, dtype=float)
        # Walk backwards from the end of the row: -1, -2, ..., -k.
        for rank in range(1, k + 1):
            neighbour_sum = neighbour_sum + train_rows[order[-rank]]
        prediction = neighbour_sum / k
        error_sum += mae(test_rows[row_number], prediction)
    # converts to % error
    return error_sum / len(sorted_sim_matrix) / 5 * 100
# Experiment driver
def main():
    """Cross-validate k-NN and weighted k-NN on ratings_train.csv and plot the errors.

    Reads the ratings, builds a user x movie table, fills each user's missing
    ratings with that user's mean, then for every fold computes the percentage
    MAE of both predictors, prints it with its calculation time, and finally
    prints the averages and shows a scatter plot of the per-fold errors.
    """
    # Load the ratings and pivot them into one row per user, one column per movie.
    ratings = pd.read_csv("ratings_train.csv", delimiter=',', encoding="utf8")
    user_based_table = pd.pivot_table(ratings, values="rating", index='userId', columns='movieId')
    # Replace every missing rating with the mean rating of the same user.
    user_means = user_based_table.mean(axis=1)
    user_based_table = user_based_table.T.fillna(user_means).T

    neighbour_count = 7  # number of neighbours (k)
    k_fold_cross_value = 40  # number of cross-validation folds
    # Rows per test fold; rows left over after the last full fold are never tested.
    fold_size = user_based_table.index.size // k_fold_cross_value

    knn_errors = []
    weighted_errors = []
    for fold in range(1, k_fold_cross_value + 1):
        start = (fold - 1) * fold_size
        stop = fold * fold_size
        # The current fold is the test set; everything before and after it is the training set.
        test_data = user_based_table[start:stop]
        train_data = pd.concat([user_based_table[:start], user_based_table[stop:]])

        similarity = pandas_sim(train_data, test_data)
        # Per test row: train-row positions ordered by ascending similarity.
        neighbour_order = np.argsort(similarity, kind='heapsort', axis=1)

        # Plain k-NN. NOTE(review): the value printed before " ms" is a timedelta, not milliseconds.
        started = datetime.datetime.now()
        knn_errors.append(calc_mae(user_based_table, train_data, test_data, neighbour_order, neighbour_count))
        elapsed = datetime.datetime.now() - started
        print("Test piece group=", fold, "k-NN finished MAE: %", knn_errors[-1], "calculation time:", elapsed, " ms", elapsed.seconds, " second")

        # Weighted k-NN.
        started = datetime.datetime.now()
        weighted_errors.append(weighted(user_based_table, train_data, test_data, similarity, neighbour_order, neighbour_count))
        elapsed = datetime.datetime.now() - started
        print("Test piece group=", fold, "Weighted k-NN finished MAE: %", weighted_errors[-1], "calculation time:", elapsed, " ms", elapsed.seconds, " second")

    print("Average k-NN MAE: %", np.average(knn_errors))
    print("Average Weighted k-NN MAE: %", np.average(weighted_errors))

    # Leave some headroom above the largest per-fold error.
    y_limit = round(np.max([knn_errors, weighted_errors])) + 5

    plt.plot(knn_errors, "ro", weighted_errors, "bs")
    plt.axis([0, k_fold_cross_value, 0, y_limit])
    plt.ylabel('percent numbers')
    plt.xlabel("Every pieces MAE errors")
    plt.legend(("k-NN MAE", "Weighted k-NN MAE"))
    plt.show()

# Run the experiment.
main()
# Stop the timer; `a` was captured at the top of the script, so b - a is the
# total run time.
b = datetime.datetime.now()
# b - a is a timedelta and .seconds is its whole-seconds component.
# NOTE(review): the " ms" label follows the timedelta itself, not a millisecond count.
print("finished", b - a, " ms", (b - a).seconds, " second")



Test piece group= 1 k-NN finished MAE 0:00:00.024019  ms 0  second
Test piece group= 1 Weighted k-NN finished MAE 0:00:00.022017  ms 0  second
Test piece group= 2 k-NN finished MAE 0:00:00.009007  ms 0  second
Test piece group= 2 Weighted k-NN finished MAE 0:00:00.012024  ms 0  second
Test piece group= 3 k-NN finished MAE 0:00:00.011009  ms 0  second
Test piece group= 3 Weighted k-NN finished MAE 0:00:00.015007  ms 0  second
Test piece group= 4 k-NN finished MAE 0:00:00.010005  ms 0  second
Test piece group= 4 Weighted k-NN finished MAE 0:00:00.011010  ms 0  second
Test piece group= 5 k-NN finished MAE 0:00:00.012009  ms 0  second
Test piece group= 5 Weighted k-NN finished MAE 0:00:00.039028  ms 0  second
Test piece group= 6 k-NN finished MAE 0:00:00.009007  ms 0  second
Test piece group= 6 Weighted k-NN finished MAE 0:00:00.009008  ms 0  second
Test piece group= 7 k-NN finished MAE 0:00:00.009009  ms 0  second
Test piece group= 7 Weighted k-NN finished MAE 0:00:00.008008  ms 0  second

# BBM409 Assignment 1
#Mehmet Taha USTA B21527472
# THEORY QUESTIONS
# k-Nearest Neighbor Classification
#Question1 -> As the dataset grows, the efficiency (speed) of the algorithm declines very fast.
#Question1 -> A key difficulty of k-NN is choosing the optimal number of neighbors to consider when classifying a new data entry.
#Question1 -> k-NN doesn’t perform well on imbalanced data. If we consider two classes, A and B, and the majority of the training data is labeled as A, then the model will ultimately give a lot of preference to A. This might result in getting the less common class B wrongly classified.
#Question1 -> The k-NN algorithm is very sensitive to outliers, as it simply chooses the neighbors based on a distance criterion.

#Question2
#Leave-one-out cross-validation is approximately unbiased, because the difference in size between the training set used in each fold and the entire dataset is only a single pattern. 
#However, while leave-one-out cross-validation is approximately unbiased, it tends to have a high variance (so you would get very different estimates if you repeated the estimate with different initial samples of data from the same distribution). As the error of the estimator is a combination of bias and variance, whether leave-one out cross-validation is better than 10-fold cross-validation depends on both quantities.

#Question3
#k=1 -> o will be negative(-) because the nearest point is negative(-).
#k=3 -> o will be negative(-) because, of the 3 nearest points, 2 are negative(-) and 1 is positive(+).
#k=5 -> o will be positive(+) because, of the 5 nearest points, 2 are negative(-) and 3 are positive(+).
#Question4
#-True
#-False
#-False

# Linear Regression
#Question1 -> x' = (x-min(x)) / ( max(x) - min(x) ) --> x' = (4489 - 2025) / (8464 - 2025) --> 2464 / 6439 = 0,3826681161671067
#Question1 -> x' = (x-mean(x)) / ( max(x) - min(x) ) --> x' = (4489 - 4900) / (8464 - 2025) --> -411 / 6439 = - 0,0638297872340426

#Question2
#-> two methods are used for least square line fit
#-> With vertical offsets, the best linear model is the one that has the smallest value for the linear model vector.
#-> The vertical offsets fitting is more simple and more often used method.
#-> A condition for finding the line with the least sum of squared perpendicular distances to a point-cloud of data is that you know / assume / demand that the line passes through the centre of gravity of the point-cloud. This assumption will not be proved here but makes good sense if you think of the point-cloud as a physical body and the deduced lines as principal axis of inertia.

#Question3
#y= a+bx
#0.5 = a + b
#1 = a + 2b
#2 = a + 4b
#0 = a + 0
#a= 0 b = 0.5
#
#Question4
#It basically helps to normalise the data within a particular range. Sometimes, it also helps in speeding up the calculations in an algorithm.
#It makes interpretation much easier. 
#It can make the analysis of coefficients easier.
#The features with high magnitudes will weigh in a lot more in the distance calculations than features with low magnitudes.To supress this effect, we need to bring all features to the same level of magnitudes.

# Movie Recommendation System
#Program starts by running main function
#file ("ratings_train.csv") is read by pandas
#create dataframe according to user and movies
#The values k and k_fold_cross_value are set manually
#The data is divided by applying k_fold_cross_validation.
#The similarities of each test data are calculated by the pandas_sim (train_data, test_data) function
#similarity matrix is sorted by np.argsort
#For k-NN, the nearest neighbors will be calculated by the calc_mae (...) function. Then the percentage average MAE will be calculated
#For weighted k-NN, the nearest neighbors will be calculated by the weighted (...) function. Then the percentage average MAE will be calculated
#The MAE result of each test piece is printed to the console
#Finally, Average k-NN MAE and Average Weighted k-NN MAE are printed to the console and graphically visualized
#Program ends and run time is printed to the console

#computer system features -> 8 gb ram, intel i5 4210U 1.7ghz to 2.4ghz (2 core 4 thread)
#Average working time = 1min
#k=1 k_fold_Cross_validation=10 time= 1min 2s MAE = %22.44
#k=1 k_fold_Cross_validation=20 time= 1min 2s MAE = %22.62
#k=1 k_fold_Cross_validation=30 time= 1min MAE = %22.63
#k=1 k_fold_Cross_validation=40 time= 59s MAE = %22.8

#k=3 k_fold_Cross_validation=10 time= 1min 1s MAE = %18.
#k=3 k_fold_Cross_validation=20 time= 1min 4s MAE = %19.
#k=3 k_fold_Cross_validation=30 time= 1min 2s MAE = %19.
#k=3 k_fold_Cross_validation=40 time= 59s MAE = %19.

#k=5 k_fold_Cross_validation=10 time= 58s MAE = %17.5
#k=5 k_fold_Cross_validation=20 time= 1min MAE = %17.6
#k=5 k_fold_Cross_validation=30 time= 1min 1s MAE = %17.7
#k=5 k_fold_Cross_validation=40 time= 59s MAE = %17.9

#k=7 k_fold_Cross_validation=10 time= 58s MAE = %16.3
#k=7 k_fold_Cross_validation=20 time= 1min MAE = %16.5
#k=7 k_fold_Cross_validation=30 time= 1min 1s MAE = %16.7
#k=7 k_fold_Cross_validation=40 time= 58s MAE = %16.8