In [None]:
import pandas as pd
import numpy as np
import os
import csv

from matplotlib import pyplot as plt

############################################### Question 3.a #######################################################
'''reading the data and changing the values of Male, Female and Infant to corresponding binary values'''
def read_the_data():
    linreg_data = []
    with open('/linregdata') as file:
        data = csv.reader(file, delimiter = ',')
        for row in data:
            linreg_data.append(row)
    
    return linreg_data

def get_a_df_with_binary_values_corresponding_to_gender(linreg_data):
    
    
    binary_values = [[1, 0, 0] if item[0] == 'F' 
                          else ([0, 1, 0] if item[0] == 'I' 
                                else ([0, 0, 1] if item[0] == 'M' 
                                      else (item[0]))) 
                                           for item in linreg_data]
    
    return pd.DataFrame(binary_values)



def replace_gender_in_linreg_with_binary_values(binary_values, linreg_data):
    
    binary_values_df = pd.DataFrame(binary_values)
    linreg_df = pd.DataFrame(linreg_data)
    linreg_df.drop(0, axis = 1, inplace = True)
    
    for index in range(3):
        linreg_df.insert(loc = index, column = index + 10, value = binary_values_df.iloc[:,index])
    
    linreg_df.columns = np.arange(0, len(linreg_df.columns))
    return linreg_df


######################################## Question 3.b ######################################################

'''standardizing the data'''
def get_the_convert_data_types_dict(no_of_columns):
    convert_to = {}
    
    for i in range(3, no_of_columns):
        convert_to[i] = np.float32
        
    return convert_to


def convert_data_type_from_str_to_float(linreg_data):
    
    no_of_columns = np.shape(linreg_data)[1]
    convert_data_types = get_the_convert_data_types_dict(no_of_columns)
    linreg_data = linreg_data.astype(convert_data_types)
    return linreg_data


def get_the_mean_of_all_the_columns_in_the_data(linreg_data):
    
    mean_of_every_column = list(linreg_data.mean(axis = 0))
    standard_deviation_of_columns = list(round(linreg_data.std(axis = 0), 2))
    
    return mean_of_every_column, standard_deviation_of_columns
    



def standardize_the_data(linreg_data):
    
    standardized_linreg_data = pd.DataFrame([])
    mean_of_all_columns, standard_deviation_of_all_columns = get_the_mean_of_all_the_columns_in_the_data(linreg_data)
    column_names = list(linreg_data.columns.values)
    count = 0
    
    for column_name in column_names:
        standardized_linreg_data = linreg_data.apply(lambda column : ((mean_of_all_columns[count] - column)/standard_deviation_of_all_columns[count]), 
                                                                              axis = 0)
        count = count + 1
        
    
    return pd.DataFrame(standardized_linreg_data)
    

 ####################################### QUESTION 3. D ###############################################################

'''partitioned data into training set (80%) and test set (20%)'''
def split_data_into_features_taret_variables(linreg_data):
    input_features = linreg_data.iloc[:,0:len(list(linreg_data.columns.values)) - 1]
    target_variable = linreg_data.iloc[:, -1]
    
    return input_features, target_variable



def split_data_into_training_and_test_set(input_features, target_variable, fraction):
    no_of_rows_in_entire_set = np.shape(input_features)[0]
    
    no_of_rows_be_in_training_set = np.ceil(no_of_rows_in_entire_set * fraction)
    #no_of_rows_be_in_test_set = no_of_rows_in_entire_set - no_of_rows_be_in_training_set
    
    train_X = input_features.iloc[:int(no_of_rows_be_in_training_set), ]
    test_X = input_features.iloc[int(no_of_rows_be_in_training_set):,]
    
    train_Y = target_variable.iloc[0 : int(no_of_rows_be_in_training_set)]
    test_Y = target_variable.iloc[int(no_of_rows_be_in_training_set) : ]
    
    return [[train_X, train_Y], [test_X, test_Y]], no_of_rows_be_in_training_set




def reshape_the_size_of_W(W, to_no_of_columns, to_no_of_rows):
    
    no_of_rows_in_W, no_of_columns_in_W = np.shape(W)
    
    no_of_rows_to_be_added_to_W    = to_no_of_rows - no_of_rows_in_W
    no_of_columns_to_be_added_to_W = to_no_of_columns - no_of_columns_in_W
    
    W_row_reshaped = np.zeros((no_of_rows_to_be_added_to_W, 1), dtype=W.dtype)
    W = np.concatenate([W, W_row_reshaped])
    
    return W
    
    
    
    
    
####################################### QUESTION 3. C ###############################################################

'''performed least squares ridge regression with penalty parameter and predicted target variable by finding weights'''
def ridge_regression_cost_and_derivative(X, Y, lambda_value, W, no_of_rows_columns_to_be_added_to_W):

    temp = (np.dot(X, W) - np.array(Y).reshape(-1, 1))
    W_reshaped = reshape_the_size_of_W(W, no_of_rows_columns_to_be_added_to_W, no_of_rows_columns_to_be_added_to_W)
    
    
    '''ridge_reg_cost_func = [(y_i - w * x_i) ^ 2]+ lambda_value * w ^2
       partial derivative of ridge_reg_cost_func = 2[((y_i - w * x_i) * x_i.T)) + lambdata_value * w]'''
    partial_derivative_of_cost = 2 * ((np.dot(X.T, temp)/ np.shape(Y)[0]) + (lambda_value * W))
    
    return partial_derivative_of_cost


def mylinridgereg(X, Y, lambda_value, no_of_rows_be_in_training_set):
    
    no_of_columns_in_X = np.shape(X)[1]
    
    initial_W = np.random.uniform(-1, 1, size= no_of_columns_in_X).reshape(-1, 1)
    
    final_W = perform_gradient_descent_for_weights(X, Y, lambda_value, initial_W, no_of_rows_be_in_training_set)
    
    return final_W


def mylinridgeregval(X, W):
    dot_X_W = np.dot(X, W)
    return dot_X_W

    
def perform_gradient_descent_for_weights(X, Y, lambda_value, W, no_of_rows_be_in_training_set):
    learning_rate = 0.001
    no_of_iterations = 10
    final_weights = W
    
    for iteration_count in range(no_of_iterations):
        
        W_from_derivative = ridge_regression_cost_and_derivative(X, Y, lambda_value, 
                                                                       final_weights, no_of_rows_be_in_training_set)
        final_weights = final_weights - (np.multiply(learning_rate, W_from_derivative))
    
    return final_weights



def mean_squared_error(train_Y, predicted_Y):
    
    error = np.float64(0.0)
    len_of_Y = np.shape(predicted_Y)[0]
    train_Y = np.array(train_Y).reshape(-1, 1)

    for i in range(len_of_Y):
        error = error + pow((train_Y[i, 0] - predicted_Y[i, 0]), 2)
    
    return error/(2 * len_of_Y)
                               


def find_weights_with_different_lambda_values(train_X, train_Y, test_X, test_Y, no_of_rows_be_in_training_set):
    
    lambda_values = [value for value in range(0, 10)]

    train_Error = []
    test_Error = []

    min_error_diff_lambda_value = 0
    min_test_error = 0
    min_error_diff = 1
    predicted_train_Y_min_error_diff = np.array([])
    predicted_test_Y_min_error_diff = np.array([])
    best_weights_for_a_lambda = np.array([])
    for iteration in range(0, 10):
        
        final_weights = mylinridgereg(train_X, train_Y, lambda_values[iteration], no_of_rows_be_in_training_set)
        train_predicted_Y = mylinridgeregval(train_X, final_weights)
        train_error = mean_squared_error(train_Y, train_predicted_Y)
    
        
        test_predicted_Y = mylinridgeregval(test_X, final_weights)
        test_error = mean_squared_error(test_Y, test_predicted_Y)
        
 
        train_Error.append(train_error)
        test_Error.append(test_error)
        
#         if min_error_diff > (train_error - test_error) or iteration == 0:
#             min_error_diff = (train_error - test_error)
#             predicted_train_Y_min_error_diff = train_predicted_Y
#             predicted_test_Y_min_error_diff = test_predicted_Y
        
        if min_test_error > test_error or iteration == 0:
            min_error_diff = (train_error - test_error)
            predicted_train_Y_min_error_diff = train_predicted_Y
            predicted_test_Y_min_error_diff = test_predicted_Y
            min_test_error = test_error
            min_error_diff_lambda_value = iteration
            best_weights_for_a_lambda = final_weights
            
    best_values_predicted = [min_error_diff_lambda_value, min_error_diff, train_X, train_Y, predicted_train_Y_min_error_diff, 
                                                             test_X, test_Y, predicted_test_Y_min_error_diff, min_test_error, best_weights_for_a_lambda]
    
    return best_values_predicted, train_Error, test_Error
        

####################################### QUESTION 3. E ###############################################################  
'''Identify the λ with the best performance and examine the weights of the ridge regression model'''

def delete_least_significant_columns_from_data_using_final_weights(final_weights):
    
    indices_that_would_sort_weights_list = np.argsort(np.array(final_weights))
    indices_of_least_significant_weights = indices_that_would_sort_weights_list[3:6]
    
    return indices_of_least_significant_weights


def remove_least_weighted_columns_from_the_data(train_X, test_X, indices_of_top_3_minimum_weights):
    train_X.drop(indices_of_top_3_minimum_weights, axis = 1, inplace = True)
    test_X.drop(indices_of_top_3_minimum_weights, axis = 1, inplace = True)
    
    return train_X, test_X




####################################### QUESTION 3. F ###############################################################

'''plotting a graph between different lambda_values and corresponding MSE for both train_data and test_data'''

def plot_graph_for_lambda_and_errors(fraction_values, lambda_values, train_errors, test_errors):
    
    count = 0
    for fraction in fraction_values:
        train_error = train_errors[count]
        test_error = test_errors[count]
    
        plot_MSE(fraction, train_error, lambda_values, True)
        plot_MSE(fraction, test_error, lambda_values, False)
        count = count + 1
    

def plot_MSE(fraction, error, lambda_values, is_train_error):
    plt.plot(lambda_values, error) 
    # naming the x axis 
    plt.xlabel('lambda_values') 
    # naming the y axis 
    if is_train_error:
        plt.ylabel('train_MSE') 
        plt.savefig(str(fraction * 100) + '_TrMSE' + '.png')
    else:
        plt.ylabel('test_MSE')
        plt.savefig(str(fraction * 100) + '_TsMSE' + '.png')

    # giving a title to my graph 
    plt.title(str(fraction * 100) + ' training set ' + str(np.ceil((1 - fraction) * 100)) + ' test_set') 
    
    plt.show()
    
    

####################################### QUESTION 3. H ###############################################################

'''plotting graph between actual and predicted target values for both training and test set for various partitions'''   
def plot_graph_for_actual_and_predicted_values(best_values, fraction_values):
    
    for fraction_value, best_values_list in best_values.items():

        lambda_value = best_values_list[0]
        predicted_train_Y = best_values_list[4]
        actual_train_Y = best_values_list[3]
        
        predicted_test_Y = best_values_list[7]
        actual_test_Y = best_values_list[6]
        
        plot_actual_VS_predicted_values(fraction_value, actual_train_Y, predicted_train_Y, lambda_value, True)
        plot_actual_VS_predicted_values(fraction_value, actual_test_Y, predicted_test_Y, lambda_value, False)
        

def plot_actual_VS_predicted_values(fraction, actual, predicted, value, are_training_values):
   
    plt.plot(predicted, actual.tolist(), marker = 'o') 
    # naming the x axis 
    plt.xlabel('predicted') 
    plt.ylabel('actual') 
    # naming the y axis 
    if are_training_values:
        plt.title('training_set_predicted_VS_actual') 
        plt.savefig('lambda_' +str(value) + '_training_set' + '.png')
    else:
        plt.title('test_set_predicted_VS_actual') 
        plt.savefig('lambda_' +str(value) + '_test_set' + '.png')

    plt.show() 



    
####################################### QUESTION 3. G ###############################################################

def plot_graph_between_min_MSE_fraction_and_lambda_value(best_values_dict):
    data_partition_fraction_list = []
    lambda_values_list = []
    min_test_error_list = []
    for fraction_value, best_values_list in best_values_dict.items():
        data_partition_fraction_list.append(fraction_value)
        lambda_values_list.append(best_values_list[0])
        min_test_error_list.append(best_values_list[8])
        
    plot_min_MSE_fraction_lambda(min_test_error_list, lambda_values_list, True)
    plot_min_MSE_fraction_lambda(min_test_error_list, data_partition_fraction_list, False)
        

def plot_min_MSE_fraction_lambda(min_test_error, value, is_lambda_value):
   
    plt.plot(value, min_test_error, marker = 'o') 
    # naming the x axis 
    if is_lambda_value:
        plt.xlabel('lambda_value') 
    plt.ylabel('min_test_error') 
    # naming the y axis 
    if is_lambda_value:
        plt.title('lambda_VS_min_MSE') 
        plt.savefig('lambda_'+ 'MSE' + '.png')
    else:
        plt.title('partition_fraction_VS_MSE') 
        plt.savefig('fraction'  + 'MSE' + '.png')

    plt.show() 
        



In [5]:

################## Question 3. a

linreg_data_list = read_the_data()
binary_values = get_a_df_with_binary_values_corresponding_to_gender(linreg_data_list)
linreg_df = replace_gender_in_linreg_with_binary_values(binary_values, linreg_data_list)



In [6]:
################## Question 3. b

linreg_df = convert_data_type_from_str_to_float(linreg_df)
input_features, target_variable = split_data_into_features_taret_variables(linreg_df)
del(linreg_df)
del(binary_values)
train_data_standardized = standardize_the_data(input_features)




In [7]:
################## Question 3. d

training_test_set, no_of_rows_be_in_training_set = split_data_into_training_and_test_set(train_data_standardized, target_variable, 0.8)
train_X, train_Y = training_test_set[0]
test_X, test_Y = training_test_set[1]



In [9]:
################## Question 3. c

best_values_predicted, train_Error, test_Error = find_weights_with_different_lambda_values(train_X, 
                                                                        train_Y, test_X, test_Y, int(no_of_rows_be_in_training_set))








In [12]:
################## Question 3. e
'''best_values_predicted = [min_error_diff_lambda_value, min_error_diff, train_X, train_Y, 
                            predicted_train_Y_min_error_diff, test_X, test_Y, 
                            predicted_test_Y_min_error_diff, min_test_error]'''

#min_difference_error = min(best_values_predicted[8]
#index_of_min_difference_error  = error_Difference.index(min_difference_error)

weights_corresponding_min_test_error = best_values_predicted[9]

weights_list = [value[0] for value in weights_corresponding_min_test_error.tolist()]


indices_of_least_significant_columns = delete_least_significant_columns_from_data_using_final_weights(weights_list)

train_X_copy, test_X_copy = train_X, test_X

train_X_trimmed, test_X_trimmed = remove_least_weighted_columns_from_the_data(train_X_copy, 
                                                                              test_X_copy, indices_of_least_significant_columns)

best_values_predicted_t, train_Error_t, test_Error_t = find_weights_with_different_lambda_values(train_X_trimmed, 
                                                                    train_Y, test_X_trimmed, test_Y, int(no_of_rows_be_in_training_set))






A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [None]:
######################## question 3. g

plot_graph_between_min_MSE_fraction_and_lambda_value(best_values)


######################## question 3. h

plot_graph_for_actual_and_predicted_values(best_values, fraction_values)

In [None]:
######################## question 3. f

fraction_values = [0.5, 0.6, 0.75, 0.8, 0.9]
lambda_values_fraction = [value for value in range(0, 10)]
#final_Weights_fraction = []
train_Error_fraction = []
test_Error_fraction = []
best_values = {}

for fraction_value in fraction_values:
    training_test_set, no_of_rows_be_in_training_set = split_data_into_training_and_test_set(train_data_standardized, 
                                                                                             target_variable, fraction_value)
    train_X, train_Y = training_test_set[0]
    test_X, test_Y = training_test_set[1]
    
    best_values_predicted, train_Error, test_Error = find_weights_with_different_lambda_values(train_X, 
                                                                        train_Y, test_X, test_Y, int(no_of_rows_be_in_training_set))

    
    #final_Weights_fraction.append(final_Weights)
    train_Error_fraction.append(train_Error)
    test_Error_fraction.append(test_Error)
    best_values[fraction_value] = best_values_predicted
    
    

plot_graph_for_lambda_and_errors(fraction_values, lambda_values_fraction, train_Error_fraction, test_Error_fraction)



