In [48]:
import pandas as pd
import numpy as np

In [49]:
list_of_results = ['random_forest_final_predictions_0859','decision_tree_final_predictions_0859',
                   'grad_boost_final_predictions_0863','log_reg_genre_final_predictions_0869']

In [50]:
scores = []
solutions_df = pd.DataFrame()  # Initialize as an empty DataFrame
index_prediction = 0

In [51]:
for result in list_of_results:
    columns = ['TrackID', 'Predictor' + str(index_prediction)]  # append index at end of 'Predictor' column name
    file_name = 'C:/Users/navne/Python Files/Data Acquisition BIA 627/HW10_1/' + result + '.csv'
    new_result = pd.read_csv(file_name, names=columns, dtype={1:np.int64}, header=0)  # create temp df of current result file
    
    # Join the new prediction with the solutions_df
    if not solutions_df.empty:
        solutions_df = solutions_df.join(new_result.set_index('TrackID'), on='TrackID')
    else:
        solutions_df = pd.read_csv(file_name)  # If it's the first prediction, create the df
    
    # Correctly extract the numerical part of the filename to calculate the score
    score = float(result.split('_')[-1]) / (10 ** 4)  # Convert last part after underscore to float and normalize
    scores.append(score)
    index_prediction += 1  # increment index


In [52]:
solutions_df

Unnamed: 0,TrackID,Predictor,Predictor1,Predictor2,Predictor3
0,199810_208019,0,0,0,0
1,199810_9903,0,0,0,0
2,199810_242681,0,0,0,0
3,199810_74139,1,1,1,1
4,199810_18515,1,1,1,1
...,...,...,...,...,...
119995,249010_86104,0,0,0,0
119996,249010_293818,0,0,0,0
119997,249010_110470,1,1,1,1
119998,249010_186634,1,1,1,1


In [53]:
# Create initial S matrix with first Predictor column
S = np.array((solutions_df.iloc[:, 1] * 2 - 1))    # convert 0 to -1

In [54]:
S

array([-1, -1, -1, ...,  1,  1,  1], dtype=int64)

In [55]:
# Loop over the rest of the predictor columns to create the rest of the S matrix
for index in range(2, solutions_df.shape[1]):    # .shape[1] gives the # of columns in df
    S = np.c_[S, (solutions_df.iloc[:, index] * 2 - 1)]

In [56]:
S.shape

(120000, 4)

In [57]:
S

array([[-1, -1, -1, -1],
       [-1, -1, -1, -1],
       [-1, -1, -1, -1],
       ...,
       [ 1,  1,  1,  1],
       [ 1,  1,  1,  1],
       [ 1,  1,  1,  1]], dtype=int64)

In [58]:
N = len(S)    # 120,000 rows in this case
ST_x = []

# Generate S(transpose) * x -> N(2P_i - 1)
ST_x = [N * (2 * P - 1) for P in scores]    # list comprehension

# Generate S(transpose) * S
ST_S = np.dot(S.T, S).astype('float') + np.eye(S.shape[1]) * (10 ** -6)    # to prevent singular matrices

# Generate (S(transpose) * S)^-1
ST_S_inv = np.linalg.inv(ST_S)

# Generate a_LS = (S(transpose) * S)^-1 * N(2P_i - 1) 
# a_LS is the Least Squares solution
a_LS = np.dot(ST_S_inv, ST_x)

In [59]:
a_LS

array([-0.15865326, -0.15866089, -0.18534534, -0.35122798])

In [60]:
s_ensemble = np.dot(S, a_LS)

In [61]:
s_ensemble

array([ 0.85388747,  0.85388747,  0.85388747, ..., -0.85388747,
       -0.85388747, -0.85388747])

In [62]:
s_ensemble_len = len(s_ensemble)    # store length of s_ensemble
s_ensemble_len

120000

In [63]:
final_predictions = np.zeros(s_ensemble_len)    # initialize final solution with list of zeroes

In [64]:
# Loop through all 6 tracks for each user to get top 3 for each user
for index in range(s_ensemble_len // 6):    # floor division
    # Threshold is the third element in the sorted array
    user_score_threshold = np.sort(s_ensemble[index * 6 : index * 6 + 6])[2]    # sort the 6 values for each user and grab the third element
    for index_user in range(6):
        if s_ensemble[index * 6 + index_user] > user_score_threshold:
            final_predictions[index * 6 + index_user] = 1    # set top 3 to 1 (other 3 will be 0)

In [65]:
# Generate the final prediction df
final_predictions_df = pd.DataFrame(solutions_df.iloc[:,0])    # make new df using first column of solutions_df
final_predictions_df['Predictor'] = np.array(final_predictions, dtype=int)

In [66]:
final_predictions_df

Unnamed: 0,TrackID,Predictor
0,199810_208019,1
1,199810_9903,1
2,199810_242681,1
3,199810_74139,0
4,199810_18515,0
...,...,...
119995,249010_86104,1
119996,249010_293818,1
119997,249010_110470,0
119998,249010_186634,0


In [67]:
final_predictions_df.to_csv('Ensemble_Predictions.csv', index=False)