# Task 1 (Fixed, half of Task 2)

# Data exploration of features obtained via transfer learning

The objective of this notebook is to:
- identify the type of data we are ingesting
- test the model with different hyperparameters and calculate each test root mean square error per cristaline system
- present the best 3 models and their performance by category(cristaline system)

In order to achieve this objectives we are going to use the notebooks made/provided by Dr.Juan Ivan Gomez and his analysis of features using transfer learnging techniques

In [None]:
import tensorflow as tf

# Verificar si se está utilizando la GPU
if tf.test.gpu_device_name():
    print('GPU encontrada.')
else:
    print("No se encontró GPU. Se utilizará la CPU.")

# Librerias adicionales:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.linear_model import LinearRegression as LR
from matplotlib.backends.backend_pdf import PdfPages
import csv

We retrieve the features obtained in the previous notebooks

In [2]:
directorio = "C:/Users/marit/Documents/UPY Estancia I/latpar_project/LatPars_SuperModel/"

In [3]:
raw_features = {}

for size in ['0050','0100','0250','macro']:
    raw_features[f'hf_{size}'] = np.load(directorio + f'hf_{size}.npy')
    y = np.load("C:/Users/marit/Documents/UPY Estancia I/latpar_project/latpars.npy")    # Not needed.
    print(raw_features[f'hf_{size}'].shape)

(231632, 480)
(231632, 480)
(231632, 480)
(231632, 480)


Exploring a little bit of the first array to have a better vision of what we're working with.

In [4]:
df = pd.DataFrame(raw_features['hf_0100'])
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,470,471,472,473,474,475,476,477,478,479
0,-2.58456,-2.632532,-1.014055,9.01744,-1.718816,6.906417,-2.970533,-1.071763,-2.386293,-0.703034,...,-1.185041,-2.106664,7.848785,-2.494743,-3.143651,6.798447,-3.644854,0.226932,-2.160397,-1.425291
1,-1.412087,-3.524274,-0.154388,-0.038311,-1.923892,-1.22642,0.839404,-1.3152,5.070639,-2.445714,...,0.322201,0.242484,0.45899,-2.178892,-2.870002,6.599074,-2.306636,0.192461,-1.828919,-3.663251
2,-2.422111,-2.329903,-1.268017,-1.909266,-1.972611,-1.261817,2.268379,-1.373033,0.667226,-2.418815,...,-0.627661,0.045241,0.861677,-2.268018,-2.904607,6.590966,-2.628865,0.383755,-1.922455,-4.029851
3,0.445722,-2.811319,0.904307,0.676237,-1.029232,-0.370148,-2.617837,-1.300127,5.016732,2.322985,...,-1.458181,-1.675237,1.538714,-2.200254,-2.892629,7.054496,-3.398879,0.550226,-1.851339,-3.225384
4,-1.147546,-3.646213,-0.466969,-2.897175,-2.012797,-1.412958,7.176705,-1.619964,1.425452,-1.206494,...,-0.081721,0.198071,1.172589,-2.38165,-2.207114,6.628587,-2.073622,0.563037,-2.041709,-4.154495


Now we take a look into the distributions of our data using histograms for the first array 'hf_0100' from the 'raw_features' dictionary. Also we're going to take all this histograms and standard deviations data and save it into a PDF. And lastly it will take all the computed statistics values and save them into a CSV file to have better management of these values to future tasks.

In [19]:
data = raw_features['hf_0100']    # Array used in this cell.
all_mean = {}
all_median = {}
all_std_dev = {}    # Empty dictionaries to put all statistics values.


with PdfPages('histograms_hf_0100.pdf') as pdf:
    for i, values in enumerate(data.T):
        
        # Compute basic statistics.
        mean = np.mean(values)
        median = np.median(values)
        std_dev = np.std(values)
        all_mean[i] = mean
        all_median[i] = median 
        all_std_dev[i] = std_dev    # Saving the values per iteration in its respective dictionary.

        # Plotting histogram of the array.
        plt.figure(figsize=(5, 3))
        plt.hist(values, bins=100, color='skyblue', edgecolor='black', alpha=0.7)
        plt.title('Histogram of Data Distribution')
        plt.xlabel('Values')
        plt.ylabel('Frequency')
        plt.grid(True)

        # Add standard deviation to the plot.
        plt.text(0.5, 0.95, f'Standard Deviation: {std_dev:.2f}', horizontalalignment='center', verticalalignment='center', transform=plt.gca().transAxes)

        # Save the current figure to the PDF.
        pdf.savefig()
        
        # Close the current figure to release memory.
        plt.close()

print("PDF file with histograms and standard deviations saved successfully.")

# Name of the output csv file.
csv_filename = 'statistics_hf_0100.csv'

# Open CSV file in write mode.
with open(csv_filename, mode='w', newline='') as csv_file:
    fieldnames = ['Index', 'Mean', 'Median', 'Standard Deviation']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Write the header row.
    writer.writeheader()

    # Write statistics data to CSV file.
    for i in range(len(data.T)):
        writer.writerow({'Index': i, 'Mean': all_mean[i], 'Median': all_median[i], 'Standard Deviation': all_std_dev[i]})

print("CSV file with statistics saved successfully.")

PDF file with histograms and standard deviations saved successfully.
CSV file with statistics saved successfully.


Now we take a look into the distributions of our data using histograms for the first array 'hf_0050' from the 'raw_features' dictionary. Also we're going to take all this histograms and standard deviations data and save it into a PDF. And lastly it will take all the computed statistics values and save them into a CSV file to have better management of these values to future tasks.

In [20]:
data = raw_features['hf_0050']    # Array used in this cell.
all_mean = {}
all_median = {}
all_std_dev = {}    # Empty dictionaries to put all statistics values.


with PdfPages('histograms_hf_0050.pdf') as pdf:
    for i, values in enumerate(data.T):
        
        # Compute basic statistics.
        mean = np.mean(values)
        median = np.median(values)
        std_dev = np.std(values)
        all_mean[i] = mean
        all_median[i] = median 
        all_std_dev[i] = std_dev    # Saving the values per iteration in its respective dictionary.

        # Plotting histogram of the array.
        plt.figure(figsize=(5, 3))
        plt.hist(values, bins=100, color='skyblue', edgecolor='black', alpha=0.7)
        plt.title('Histogram of Data Distribution')
        plt.xlabel('Values')
        plt.ylabel('Frequency')
        plt.grid(True)

        # Add standard deviation to the plot.
        plt.text(0.5, 0.95, f'Standard Deviation: {std_dev:.2f}', horizontalalignment='center', verticalalignment='center', transform=plt.gca().transAxes)

        # Save the current figure to the PDF.
        pdf.savefig()
        
        # Close the current figure to release memory.
        plt.close()

print("PDF file with histograms and standard deviations saved successfully.")

# Name of the output csv file.
csv_filename = 'statistics_hf_0050.csv'

# Open CSV file in write mode.
with open(csv_filename, mode='w', newline='') as csv_file:
    fieldnames = ['Index', 'Mean', 'Median', 'Standard Deviation']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Write the header row.
    writer.writeheader()

    # Write statistics data to CSV file.
    for i in range(len(data.T)):
        writer.writerow({'Index': i, 'Mean': all_mean[i], 'Median': all_median[i], 'Standard Deviation': all_std_dev[i]})

print("CSV file with statistics saved successfully.")

PDF file with histograms and standard deviations saved successfully.
CSV file with statistics saved successfully.


Now we take a look into the distributions of our data using histograms for the first array 'hf_0250' from the 'raw_features' dictionary. Also we're going to take all this histograms and standard deviations data and save it into a PDF. And lastly it will take all the computed statistics values and save them into a CSV file to have better management of these values to future tasks.

In [21]:
data = raw_features['hf_0250']    # Array used in this cell.
all_mean = {}
all_median = {}
all_std_dev = {}    # Empty dictionaries to put all statistics values.


with PdfPages('histograms_hf_0250.pdf') as pdf:
    for i, values in enumerate(data.T):
        
        # Compute basic statistics.
        mean = np.mean(values)
        median = np.median(values)
        std_dev = np.std(values)
        all_mean[i] = mean
        all_median[i] = median 
        all_std_dev[i] = std_dev    # Saving the values per iteration in its respective dictionary.

        # Plotting histogram of the array.
        plt.figure(figsize=(5, 3))
        plt.hist(values, bins=100, color='skyblue', edgecolor='black', alpha=0.7)
        plt.title('Histogram of Data Distribution')
        plt.xlabel('Values')
        plt.ylabel('Frequency')
        plt.grid(True)

        # Add standard deviation to the plot.
        plt.text(0.5, 0.95, f'Standard Deviation: {std_dev:.2f}', horizontalalignment='center', verticalalignment='center', transform=plt.gca().transAxes)

        # Save the current figure to the PDF.
        pdf.savefig()
        
        # Close the current figure to release memory.
        plt.close()

print("PDF file with histograms and standard deviations saved successfully.")

# Name of the output csv file.
csv_filename = 'statistics_hf_0250.csv'

# Open CSV file in write mode.
with open(csv_filename, mode='w', newline='') as csv_file:
    fieldnames = ['Index', 'Mean', 'Median', 'Standard Deviation']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Write the header row.
    writer.writeheader()

    # Write statistics data to CSV file.
    for i in range(len(data.T)):
        writer.writerow({'Index': i, 'Mean': all_mean[i], 'Median': all_median[i], 'Standard Deviation': all_std_dev[i]})

print("CSV file with statistics saved successfully.")

PDF file with histograms and standard deviations saved successfully.
CSV file with statistics saved successfully.


Now we take a look into the distributions of our data using histograms for the first array 'hf_macro' from the 'raw_features' dictionary. Also we're going to take all this histograms and standard deviations data and save it into a PDF. And lastly it will take all the computed statistics values and save them into a CSV file to have better management of these values to future tasks.

In [22]:
data = raw_features['hf_macro']    # Array used in this cell.
all_mean = {}
all_median = {}
all_std_dev = {}    # Empty dictionaries to put all statistics values.


with PdfPages('histograms_hf_macro.pdf') as pdf:
    for i, values in enumerate(data.T):
        
        # Compute basic statistics.
        mean = np.mean(values)
        median = np.median(values)
        std_dev = np.std(values)
        all_mean[i] = mean
        all_median[i] = median 
        all_std_dev[i] = std_dev    # Saving the values per iteration in its respective dictionary.

        # Plotting histogram of the array.
        plt.figure(figsize=(5, 3))
        plt.hist(values, bins=100, color='skyblue', edgecolor='black', alpha=0.7)
        plt.title('Histogram of Data Distribution')
        plt.xlabel('Values')
        plt.ylabel('Frequency')
        plt.grid(True)

        # Add standard deviation to the plot.
        plt.text(0.5, 0.95, f'Standard Deviation: {std_dev:.2f}', horizontalalignment='center', verticalalignment='center', transform=plt.gca().transAxes)

        # Save the current figure to the PDF.
        pdf.savefig()
        
        # Close the current figure to release memory.
        plt.close()

print("PDF file with histograms and standard deviations saved successfully.")

# Name of the output csv file.
csv_filename = 'statistics_hf_macro.csv'

# Open CSV file in write mode.
with open(csv_filename, mode='w', newline='') as csv_file:
    fieldnames = ['Index', 'Mean', 'Median', 'Standard Deviation']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Write the header row.
    writer.writeheader()

    # Write statistics data to CSV file.
    for i in range(len(data.T)):
        writer.writerow({'Index': i, 'Mean': all_mean[i], 'Median': all_median[i], 'Standard Deviation': all_std_dev[i]})

print("CSV file with statistics saved successfully.")

PDF file with histograms and standard deviations saved successfully.
CSV file with statistics saved successfully.


# Task 2 (Continuation)

After having obtained the average values and standard deviation of each trait, samples that are outside three standard deviations (outliers) will be eliminated.

To achive this first, we're going to load the CSVs with the statistical values to look at the outliers of the standard deviation.

In [5]:
csv_hf_0100 = pd.read_csv("C:/Users/marit/Documents/UPY Estancia I/statistics_hf_0100.csv") # Loading CSVs as pandas objects.
csv_hf_0050 = pd.read_csv("C:/Users/marit/Documents/UPY Estancia I/statistics_hf_0050.csv")
csv_hf_0250 = pd.read_csv("C:/Users/marit/Documents/UPY Estancia I/statistics_hf_0250.csv")
csv_hf_macro = pd.read_csv("C:/Users/marit/Documents/UPY Estancia I/statistics_hf_macro.csv")

Now that we have loaded the respective CSV files of our statistics values from the first part of this notebook, we will look into the standard deviations and with the rows that are outside three standard deviation we're going to eliminate them.

In [6]:
new_features = {}    # Empty dictionary to save the new and fixed arrays.

In [10]:
new_features_2 = {}    # Empty dictionary to save the new and fixed arrays.

In [7]:
csv_hf_0100.head()    # Taking a look into the CSV.

Unnamed: 0,Index,Mean,Median,Standard Deviation
0,0,-0.287814,-1.391897,2.925534
1,1,-0.610887,-1.525966,3.001665
2,2,-0.390859,-0.915662,1.811028
3,3,-0.980028,-2.31243,2.743423
4,4,-1.130426,-1.632416,2.144145


In [27]:
# Collecting the mean and the standard deviation from the CSV file.
mean = csv_hf_0100.iloc[:,1]
std_dev = csv_hf_0100.iloc[:,3]

all_limsup = {}    # Empty dicts to save indices and the range of the limits.
all_liminf = {}

# Getting superior and inferior limits.
for i in range(csv_hf_0100.shape[0]):
    limsup = mean[i] + (3 * std_dev[i])
    liminf = mean[i] - (3 * std_dev[i])
    all_limsup[i] = limsup
    all_liminf[i] = liminf

# Create a new array to store the values within the range.
new_features['hf_0100'] = np.zeros_like(raw_features['hf_0100'])

# Iterate over the columns of the original array.
for i in range(raw_features['hf_0100'].shape[1]):
    # Get the upper and lower limits for this column.
    limsup = all_limsup[i]
    liminf = all_liminf[i]
    
    # Filter values that are within the range.
    column_vals = raw_features['hf_0100'][:, i]
    column_vals_filtrados = column_vals[(column_vals >= liminf) & (column_vals <= limsup)]
    
    # Assign the filtered values to the corresponding column of the new array.
    new_features['hf_0100'][:len(column_vals_filtrados), i] = column_vals_filtrados

print("Raw Array:", raw_features['hf_0100'].shape)
print("Filtered Array:", new_features['hf_0100'].shape)

Array original: (231632, 480)
Array filtrado: (231632, 480)


In [28]:
# Collecting the mean and the standard deviation from the CSV file.
mean = csv_hf_0050.iloc[:,1]
std_dev = csv_hf_0050.iloc[:,3]

all_limsup = {}    # Empty dicts to save indices and the range of the limits.
all_liminf = {}

# Getting superior and inferior limits.
for i in range(csv_hf_0050.shape[0]):
    limsup = mean[i] + (3 * std_dev[i])
    liminf = mean[i] - (3 * std_dev[i])
    all_limsup[i] = limsup
    all_liminf[i] = liminf

# Create a new array to store the values within the range.
new_features['hf_0050'] = np.zeros_like(raw_features['hf_0050'])

# Iterate over the columns of the original array.
for i in range(raw_features['hf_0050'].shape[1]):
    # Get the upper and lower limits for this column.
    limsup = all_limsup[i]
    liminf = all_liminf[i]
    
    # Filter values that are within the range.
    column_vals = raw_features['hf_0050'][:, i]
    column_vals_filtrados = column_vals[(column_vals >= liminf) & (column_vals <= limsup)]
    
    # Assign the filtered values to the corresponding column of the new array.
    new_features['hf_0050'][:len(column_vals_filtrados), i] = column_vals_filtrados

print("Raw Array:", raw_features['hf_0050'].shape)
print("Filtered Array:", new_features['hf_0050'].shape)

Raw Array: (231632, 480)
Filtered Array: (231632, 480)


In [29]:
# Collecting the mean and the standard deviation from the CSV file.
mean = csv_hf_0250.iloc[:,1]
std_dev = csv_hf_0250.iloc[:,3]

all_limsup = {}    # Empty dicts to save indices and the range of the limits.
all_liminf = {}

# Getting superior and inferior limits.
for i in range(csv_hf_0250.shape[0]):
    limsup = mean[i] + (3 * std_dev[i])
    liminf = mean[i] - (3 * std_dev[i])
    all_limsup[i] = limsup
    all_liminf[i] = liminf

# Create a new array to store the values within the range.
new_features['hf_0250'] = np.zeros_like(raw_features['hf_0250'])

# Iterate over the columns of the original array.
for i in range(raw_features['hf_0250'].shape[1]):
    # Get the upper and lower limits for this column.
    limsup = all_limsup[i]
    liminf = all_liminf[i]
    
    # Filter values that are within the range.
    column_vals = raw_features['hf_0250'][:, i]
    column_vals_filtrados = column_vals[(column_vals >= liminf) & (column_vals <= limsup)]
    
    # Assign the filtered values to the corresponding column of the new array.
    new_features['hf_0250'][:len(column_vals_filtrados), i] = column_vals_filtrados

print("Raw Array:", raw_features['hf_0250'].shape)
print("Filtered Array:", new_features['hf_0250'].shape)

Raw Array: (231632, 480)
Filtered Array: (231632, 480)


In [30]:
# Collecting the mean and the standard deviation from the CSV file.
mean = csv_hf_macro.iloc[:,1]
std_dev = csv_hf_macro.iloc[:,3]

all_limsup = {}    # Empty dicts to save indices and the range of the limits.
all_liminf = {}

# Getting superior and inferior limits.
for i in range(csv_hf_macro.shape[0]):
    limsup = mean[i] + (3 * std_dev[i])
    liminf = mean[i] - (3 * std_dev[i])
    all_limsup[i] = limsup
    all_liminf[i] = liminf

# Create a new array to store the values within the range.
new_features['hf_macro'] = np.zeros_like(raw_features['hf_macro'])

# Iterate over the columns of the original array.
for i in range(raw_features['hf_macro'].shape[1]):
    # Get the upper and lower limits for this column.
    limsup = all_limsup[i]
    liminf = all_liminf[i]
    
    # Filter values that are within the range.
    column_vals = raw_features['hf_macro'][:, i]
    column_vals_filtrados = column_vals[(column_vals >= liminf) & (column_vals <= limsup)]
    
    # Assign the filtered values to the corresponding column of the new array.
    new_features['hf_macro'][:len(column_vals_filtrados), i] = column_vals_filtrados

print("Raw Array:", raw_features['hf_macro'].shape)
print("Filtered Array:", new_features['hf_macro'].shape)

Raw Array: (231632, 480)
Filtered Array: (231632, 480)


In [31]:
# Now, saving each fixed array from the new dictionary into a (.npy) file.
np.save('hf_0100_fix', new_features['hf_0100'])
np.save('hf_0050_fix', new_features['hf_0050'])
np.save('hf_0250_fix', new_features['hf_0250'])
np.save('hf_macro_fix', new_features['hf_macro'])

In [2]:
# For last, just cleaning the fixed array files taking off the rows that contains the zeros.
import numpy as np    # Just to not have to rerun all the notebook.

def clean_rows_with_zeros(array):
    # Find rows that do not contain zeros.
    non_zero_rows = np.all(array != 0, axis=1)
    # Filter the array to get only rows that do not contain zeros.
    clean_array = array[non_zero_rows]
    return clean_array

# Load .npy files.
file1 = np.load('C:/Users/marit/Documents/UPY Estancia I/hf_0100_fix.npy')
file2 = np.load('C:/Users/marit/Documents/UPY Estancia I/hf_0050_fix.npy')
file3 = np.load('C:/Users/marit/Documents/UPY Estancia I/hf_0250_fix.npy')
file4 = np.load('C:/Users/marit/Documents/UPY Estancia I/hf_macro_fix.npy')

# Clean the files.
file1_clean = clean_rows_with_zeros(file1)
file2_clean = clean_rows_with_zeros(file2)
file3_clean = clean_rows_with_zeros(file3)
file4_clean = clean_rows_with_zeros(file4)

# Check shapes.
print("Shape of file 1 before cleaning:", file1.shape)
print("Shape of file 1 after cleaning:", file1_clean.shape)
print("Shape of file 2 before cleaning:", file2.shape)
print("Shape of file 2 after cleaning:", file2_clean.shape)
print("Shape of file 3 before cleaning:", file3.shape)
print("Shape of file 3 after cleaning:", file3_clean.shape)
print("Shape of file 4 before cleaning:", file4.shape)
print("Shape of file 4 after cleaning:", file4_clean.shape)

# Now, again saving each fixed array from the new dictionary into a (.npy) file.
np.save('hf_0100_full_fix', file1_clean)
np.save('hf_0050_full_fix', file2_clean)
np.save('hf_0250_full_fix', file3_clean)
np.save('hf_macro_full_fix', file4_clean)

Shape of file 1 before cleaning: (231632, 480)
Shape of file 1 after cleaning: (221112, 480)
Shape of file 2 before cleaning: (231632, 480)
Shape of file 2 after cleaning: (221113, 480)
Shape of file 3 before cleaning: (231632, 480)
Shape of file 3 after cleaning: (221113, 480)
Shape of file 4 before cleaning: (231632, 480)
Shape of file 4 after cleaning: (221113, 480)
