## Import data

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from utils import *
import matplotlib.pyplot as plt
import matplotlib

# Relative path to the raw Excel workbook.
data_path = '../data/SNL_MSU_DOE_raw.xlsx'

# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame},
# one entry per sheet in the workbook.
dfs = pd.read_excel(
    data_path, engine='openpyxl', sheet_name=None)

# Sheet names in the order the dict yields them; later cells slice this list
# to choose which sheets are combined.
sheet_names = list(dfs.keys())
# NOTE(review): this bare expression is not the last statement of the cell,
# so it displays nothing.
sheet_names

# Maps raw workbook column headers (and the derived columns created further
# down in this notebook) to the descriptive feature names passed to
# replace_column_name. Columns that are not keys of this dict are deleted
# before the absence ratios are computed.
name_mapping = {
    'Material': 'Material',
    'Resin Type': 'Resin Type',
    # NOTE(review): 'Volumn' is presumably a misspelling of 'Volume'; left
    # unchanged because the value is a runtime label other code may match on.
    'Vf, %': 'Fibre Volumn Fraction',
    '%, 0 Deg': 'Percentage of Fibre in 0-deg Direction',
    '%, 45 Deg': 'Percentage of Fibre in 45-deg Direction',
    '%, 90 Deg': 'Percentage of Fibre in 90-deg Direction',
    'other %': 'Percentage of Fibre in Other Direction',
    'Thickness, mm': 'Thickness',
    'Max. Stress, MPa': 'Maximum Stress',
    'Min. Stress, MPa': 'Minimum Stress',
    'R-value': 'Minimum/Maximum Stress',
    'Freq., Hz': 'Frequency',
    'E, GPa': 'Initial Elastic Modulus',
    'Max. % Strain': 'Maximum Strain',
    'Min. % Strain': 'Minimum Strain',
    'Cycles': 'Cycles to Failure',
    'Moisture Gain, %': 'Moisture Gain',
    'Testing Temperature, OC': 'Temperature',
    'Width, mm': 'Width',
    # 'Static ...' columns are added to the fatigue table later in this
    # notebook from the averaged static-test values.
    'Static Max. Stress, MPa': 'Static Maximum Tensile Stress',
    'Static Min. Stress, MPa': 'Static Maximum Compressive Stress',
    'Static E, GPa': 'Static Elastic Modulus',
    'Static Max. % Strain': 'Static Maximum Tensile Strain',
    'Static Min. % Strain': 'Static Maximum Compressive Strain',
    # Derived stress columns, also created later in this notebook.
    'Absolute Maximum Stress': 'Absolute Maximum Stress',
    'Absolute Peak-to-peak Stress': 'Absolute Peak-to-peak Stress',
    'Relative Maximum Stress': 'Relative Maximum Stress',
    'Relative Peak-to-peak Stress': 'Relative Peak-to-peak Stress'
}

In [2]:
# Stack every sheet except the first two into one table with a fresh 0..n-1 index.
df_all = pd.concat([dfs[name] for name in sheet_names[2:]], axis=0, ignore_index=True)

# Raw columns that have no entry in name_mapping are reported and then dropped.
col_to_del = [x for x in df_all.columns if x not in name_mapping.keys()]
print('Deleted features', col_to_del)

# NOTE(review): replace_column_name is defined outside this notebook
# (presumably in utils); whether it returns a copy or renames df_all in place
# cannot be told from here. If it returns the same object, the `del` below
# also removes the columns from df_all.
df_tmp = replace_column_name(df_all, name_mapping)
for col in col_to_del:
    del df_tmp[col]

# calculate_absence_ratio is an external helper; the saved output of the last
# cell shows a frame with 'feature' and 'ratio' columns.
df_presence = calculate_absence_ratio(df_tmp)

# NOTE(review): `sns` is not imported explicitly in this notebook; presumably
# it arrives through `from utils import *` — confirm.
clr = sns.color_palette("deep")

plt.figure(figsize=(5,4),dpi=300)
ax = plt.subplot(111)
plot_absence_ratio(ax, df_presence, orient='h', palette=clr, linewidth=1, edgecolor=[0, 0, 0])
plt.tight_layout()

# Write the figure for the raw (pre-cleaning) data, then release it.
plt.savefig('../output/absence_ratio_initial.png')
plt.close()

Deleted features ['Lay-up', 'Resin', '0 Deg fabric', '45 Deg fabric', '90 deg fabric', 'Cure / Post Cure', 'Process', 'Test #', 'Coupon', 'Runout', 'Cycles for 50% strain increase', 'Cycles for 100% strain increase', 'Cycles for 25% reduction in Modulus', 'Cycles for 1% strain', 'Freq., Hz or mm/s', 'E, GPa (0.1-0.3%)', 'Initial cracking strain, %', 'Modulus after initial cracking, GPa']




## Preprocess

### Fix missing and useless cells

In [3]:
# Rebuild the combined table from every sheet except the first two, with a
# fresh 0..n-1 index. The positional indexing used in the cells below relies
# on that default index.
df_all = pd.concat([dfs[name] for name in sheet_names[2:]], axis=0, ignore_index=True)

def remove_s(x, s):
    """Strip the marker ``s`` from a string cell and convert what is left to float.

    Args:
        x: A single cell value of any type.
        s: Substring to remove (for example a unit suffix).

    Returns:
        ``float`` of the text with every occurrence of ``s`` removed when ``x``
        is a plain ``str`` containing ``s``; otherwise ``x`` unchanged.

    Raises:
        ValueError: If the text left after removing ``s`` is not a valid float.
    """
    # Anything that is not a plain string, or does not carry the marker,
    # passes through untouched.
    if type(x) is not str or s not in x:
        return x
    return float(x.replace(s, ''))

def cal_fraction(x, s):
    """Replace a string of the form ``"a<s>b"`` by the mean of ``a`` and ``b``.

    Args:
        x: A single cell value of any type.
        s: Separator between the two numbers.

    Returns:
        ``(a + b) / 2`` as a float when ``x`` is a plain ``str`` containing
        ``s``; otherwise ``x`` unchanged. Only the first two pieces of the
        split are used; any further pieces are ignored.

    Raises:
        ValueError: If either of the first two pieces is not a valid float.
    """
    if type(x) is str and s in x:
        pieces = x.split(s)
        lower = float(pieces[0])
        upper = float(pieces[1])
        return (lower + upper) / 2
    return x

def conditional_remove(x, s):
    """Blank out a string cell that contains the marker ``s``.

    Args:
        x: A single cell value of any type.
        s: Substring whose presence invalidates the cell.

    Returns:
        ``np.nan`` when ``x`` is a plain ``str`` containing ``s``; otherwise
        ``x`` unchanged.
    """
    contains_marker = type(x) is str and s in x
    return np.nan if contains_marker else x

def conditional_replace(x, s1, s2):
    """Substitute ``s2`` for every occurrence of ``s1`` inside a string cell.

    Args:
        x: A single cell value of any type.
        s1: Substring to look for.
        s2: Replacement text.

    Returns:
        The rewritten string when ``x`` is a plain ``str`` containing ``s1``;
        otherwise ``x`` unchanged.
    """
    if type(x) is not str:
        return x
    return x.replace(s1, s2) if s1 in x else x

def str2num(x, s, n):
    """Map one exact string value to a replacement value.

    Args:
        x: A single cell value of any type.
        s: The string that must match ``x`` exactly (case-sensitive).
        n: Value returned in place of a matching cell.

    Returns:
        ``n`` when ``x`` is a plain ``str`` equal to ``s``; otherwise ``x``
        unchanged.
    """
    is_exact_match = type(x) is str and s == x
    return n if is_exact_match else x

def remove_strs(x):
    """Blank out any string cell.

    Args:
        x: A single cell value of any type.

    Returns:
        ``np.nan`` when ``x`` is a plain ``str``; otherwise ``x`` unchanged.
    """
    return np.nan if type(x) is str else x

def fill_na(x, n):
    """Return ``n`` when the scalar cell ``x`` is missing, otherwise ``x``.

    ``pd.isna`` is used rather than ``np.isnan`` because ``np.isnan`` raises
    ``TypeError`` on non-numeric cells (strings, ``None``), which occur in the
    mixed-type columns this notebook cleans.

    Args:
        x: A single scalar cell value of any type.
        n: Replacement for a missing cell.

    Returns:
        ``n`` if ``x`` is NaN / ``None`` / another pandas missing marker;
        otherwise ``x`` unchanged (strings included).
    """
    return n if pd.isna(x) else x
        

def modify_col(df, column_name, func, **kargs):
    """Apply ``func`` to every cell of one column and store the results in ``df``.

    Args:
        df: DataFrame to modify; it is changed in place.
        column_name: Label of the column to transform.
        func: Callable invoked as ``func(cell, **kargs)`` for each cell.
        **kargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        None.
    """
    transformed = []
    for cell in df[column_name]:
        transformed.append(func(cell, **kargs))
    df.loc[:, column_name] = transformed


# Strip textual suffixes embedded in otherwise numeric cells and convert the
# remainder to float.
modify_col(df_all, 'Testing Temperature, OC', remove_s, s=' ̊C')
modify_col(df_all, '%, 45 Deg', remove_s, s=' G')
modify_col(df_all, '%, 0 Deg', remove_s, s=' C')
modify_col(df_all, 'other %', remove_s, s=' G')
# Cells written as two numbers around a separator are replaced by the mean of
# the two numbers.
modify_col(df_all, 'Vf, %', cal_fraction, s='/')
modify_col(df_all, '%, 0 Deg', cal_fraction, s='-')
modify_col(df_all, '%, 45 Deg', cal_fraction, s='-')
# Thickness cells containing these markers are set to NaN.
modify_col(df_all, 'Thickness, mm', conditional_remove, s='mm dia') # presumably a diameter, not a thickness — confirm
modify_col(df_all, 'Thickness, mm', conditional_remove, s='/') # two values in one cell; no single thickness is kept
# Stress cells: '*' and '+' annotations are stripped, while cells mentioning
# 'Newtons' (or 'v' for the minimum stress) are set to NaN.
modify_col(df_all, 'Max. Stress, MPa', remove_s, s='*')
modify_col(df_all, 'Max. Stress, MPa', remove_s, s='+')
modify_col(df_all, 'Max. Stress, MPa', conditional_remove, s='Newtons')
modify_col(df_all, 'Min. Stress, MPa', conditional_remove, s='v')
# R-value cells flagged with '*' are set to NaN; 'static compression' is
# normalised to 'static', the label later cells test for.
modify_col(df_all, 'R-value', conditional_remove, s='*')
modify_col(df_all, 'R-value', conditional_replace, s1='static compression', s2='static')
# Placeholder and annotation text in the strain and modulus columns.
modify_col(df_all, 'Max. % Strain', conditional_remove, s='----')
modify_col(df_all, 'Max. % Strain', remove_s, s='+')
modify_col(df_all, 'Min. % Strain', conditional_remove, s='Runout')
modify_col(df_all, 'E, GPa (0.1-0.3%)', conditional_remove, s='----')
# Every string cell in this column is set to NaN.
modify_col(df_all, 'Initial cracking strain, %', remove_strs)
# Both spellings of the run-out flag become 1.
modify_col(df_all, 'Runout', str2num, s='Runout',n=1)
modify_col(df_all, 'Runout', str2num, s='runout',n=1)
# Disabled: filling missing test temperatures with 20.
# modify_col(df_all, 'Testing Temperature, OC', fill_na, n=20)

# In the fibre-percentage and fabric columns NaN represents the absence of
# fibre in that direction, so NaN is replaced by 0. 'Runout' is filled the
# same way (presumably NaN there means the test was not a run-out — confirm).
fill_na_col = ['%, 0 Deg', '%, 45 Deg','%, 90 Deg','other %','45 Deg fabric','90 deg fabric','Runout']
tmp = df_all[fill_na_col].fillna(0)
df_all.loc[:,fill_na_col] = tmp

def merge_col(df, from_col, to_col):
    """Copy the non-missing cells of ``from_col`` into ``to_col``, then drop ``from_col``.

    Rows are selected with a boolean mask instead of positions from
    ``np.where``. ``.loc`` is label-based, so positional integers only select
    the right rows when the index is the default 0..n-1 range; the mask works
    for any index.

    Args:
        df: DataFrame to modify; it is changed in place.
        from_col: Column whose non-missing values are moved; deleted afterwards.
        to_col: Column that receives the values. Existing values in the
            selected rows are overwritten.

    Returns:
        None.
    """
    has_value = df[from_col].notna()
    df.loc[has_value, to_col] = df.loc[has_value, from_col]
    del df[from_col]

# The same quantity is stored under different headers in different sheets;
# fold each alias into its canonical column.
for alias_col, canonical_col in (
    ('E, GPa (0.1-0.3%)', 'E, GPa'),
    ('Freq., Hz or mm/s', 'Freq., Hz'),
):
    merge_col(df_all, alias_col, canonical_col)

# Some static experiments carry no R-value but report exactly one cycle to
# failure; label those rows as static.
single_cycle_rows = np.where(df_all['Cycles'] == 1)[0]
df_all.loc[single_cycle_rows, 'R-value'] = 'static'

### Calculate missing Max/Min stress data using R-value

In [4]:
# Row positions of static tests (R-value equals the string 'static').
static_indexes = np.where(df_all['R-value']=='static')[0]
# Everything else is treated as fatigue data. Index labels are compared with
# row positions here, which relies on df_all having the default 0..n-1 index
# (it was built with ignore_index=True).
non_static_indexes = np.setdiff1d(df_all.index, static_indexes)
# Rows with a missing value, computed once before any filling takes place.
# NOTE(review): np.isnan needs a numeric dtype; assumes the stress columns
# are numeric after cleaning — confirm.
miss_max_stress_indexes = np.where(np.isnan(df_all['Max. Stress, MPa']))[0]
miss_min_stress_indexes = np.where(np.isnan(df_all['Min. Stress, MPa']))[0]
miss_R_value_indexes = np.where(pd.isna(df_all['R-value']))[0]

# NOTE(review): `idx in non_static_indexes` scans a NumPy array on every
# iteration, so each loop below is quadratic in the number of rows.

# Missing maximum stress: max = min / R. Rows whose R-value is a string are skipped.
for idx in miss_max_stress_indexes:
    if idx in non_static_indexes and type(df_all.loc[idx,'R-value'])!=str:
        df_all.loc[idx,'Max. Stress, MPa'] = df_all.loc[idx,'Min. Stress, MPa']/df_all.loc[idx,'R-value']

# Missing minimum stress: min = max * R. Rows whose R-value is a string are skipped.
for idx in miss_min_stress_indexes:
    if idx in non_static_indexes and type(df_all.loc[idx,'R-value'])!=str:
        df_all.loc[idx,'Min. Stress, MPa'] = df_all.loc[idx,'Max. Stress, MPa']*df_all.loc[idx,'R-value']

# Missing R-value: R = min / max.
for idx in miss_R_value_indexes:
    if idx in non_static_indexes:
        df_all.loc[idx,'R-value'] = df_all.loc[idx,'Min. Stress, MPa']/df_all.loc[idx,'Max. Stress, MPa']

In [5]:
# Persist the cleaned, combined table (static and fatigue rows together), without the index column.
df_all.to_excel('../data/SNL_MSU_DOE_combine.xlsx', engine='openpyxl', index=False)

### Extract and save static and fatigue data respectively

In [6]:

# Work on a copy so the combined table is not touched by the split below.
df_tmp = df_all.copy()

# Separate static rows from fatigue rows and renumber each subset from 0.
df_static = df_tmp.loc[static_indexes].reset_index(drop=True)
df_fatigue = df_tmp.loc[non_static_indexes].reset_index(drop=True)

# Frequency cells that mention 'mm/s' are set to NaN in the fatigue table.
modify_col(df_fatigue, 'Freq., Hz', conditional_remove, s='mm/s')

# Only the static table is written here; the fatigue table is saved further
# down, once its derived columns have been added.
df_static.to_excel('../data/SNL_MSU_DOE_static.xlsx', engine='openpyxl', index=False)

### Extract material properties from static experiments

In [7]:
# Each static row is keyed by the concatenation of its Material and Lay-up.
# NOTE(review): `x+y` assumes both cells are strings; a missing Lay-up would
# raise a TypeError — confirm against the data.
static_material_names = df_static['Material'].copy()
static_lay_up = df_static['Lay-up'].copy()
static_mat_lay = np.array([x+y for x,y in zip(static_material_names, static_lay_up)], dtype=str)
# key -> one-row DataFrame holding the mean of every static feature.
static_properties = {}

static_features = ['Max. Stress, MPa', 'Min. Stress, MPa', 'E, GPa', 'Max. % Strain', 'Min. % Strain']

for material in list(set(static_mat_lay)):
    # Row positions of this key; used as labels below, which works because
    # df_static was re-indexed from 0.
    where_material = np.where(static_mat_lay == material)[0]
    # print(material, len(where_material))
    material_data = df_static.loc[where_material, static_features].copy()
    material_data.reset_index(drop=True, inplace=True)
    material_df = {}
    for feature in static_features:
        # Any remaining string cell is treated as missing.
        for idx in range(len(material_data[feature])):
            if type(material_data.loc[idx, feature]) == str:
                material_data.loc[idx, feature] = np.nan

        # NOTE(review): np.where on the values selects by truthiness, so cells
        # equal to 0 are excluded while NaN cells (truthy) are kept. The name
        # suggests `.notna()` may have been intended — confirm before changing,
        # since it alters the averages. NaN cells are presumably skipped by
        # the mean itself.
        presence_indexes = np.where(material_data[feature])[0]
        mean_value = np.mean(material_data.loc[presence_indexes, feature])
        material_df[feature] = mean_value

    material_df = pd.DataFrame(material_df, index=[0])
    static_properties[material]=material_df

# Column names under which the static means are stored in the fatigue table.
fatigue_static_features = ['Static '+x for x in static_features]
# Same Material + Lay-up key, built for the fatigue rows.
fatigue_material_names = df_fatigue['Material'].copy()
fatigue_lay_up = df_fatigue['Lay-up'].copy()
fatigue_mat_lay = np.array([x+y for x,y in zip(fatigue_material_names, fatigue_lay_up)], dtype=str)

# Start with every static property missing; fatigue rows whose key has no
# static counterpart stay NaN.
df_fatigue[fatigue_static_features] = np.nan

for material in list(set(static_mat_lay)):
    where_material = np.where(fatigue_mat_lay == material)[0]
    if len(where_material) > 0:
        static_property = static_properties[material]
        for feature in static_features:
            df_fatigue.loc[where_material,'Static '+feature] = static_property[feature].values[0]


## Calculate absolute and relative maximum and peak-to-peak stresses

In [8]:
# Derived columns, initialised as missing and filled row by row below.
df_fatigue['Absolute Maximum Stress'] = np.nan
df_fatigue['Absolute Peak-to-peak Stress'] = np.nan
df_fatigue['Relative Maximum Stress'] = np.nan
df_fatigue['Relative Peak-to-peak Stress'] = np.nan

# Discard static reference values with the wrong sign: the 'Max.' (tensile)
# references must not be negative and the 'Min.' (compressive) references
# must not be positive.
df_fatigue.loc[np.where(df_fatigue['Static Max. Stress, MPa']<0)[0],'Static Max. Stress, MPa'] = np.nan
df_fatigue.loc[np.where(df_fatigue['Static Min. Stress, MPa']>0)[0],'Static Min. Stress, MPa'] = np.nan
df_fatigue.loc[np.where(df_fatigue['Static Max. % Strain']<0)[0],'Static Max. % Strain'] = np.nan
df_fatigue.loc[np.where(df_fatigue['Static Min. % Strain']>0)[0],'Static Min. % Strain'] = np.nan

# idx is a row position used as a label with .loc; this relies on df_fatigue
# having the default 0..n-1 index.
for idx in range(df_fatigue.values.shape[0]):
    # s[0] is the maximum stress of the cycle, s[1] the minimum.
    s = np.array([df_fatigue.loc[idx, 'Max. Stress, MPa'],df_fatigue.loc[idx, 'Min. Stress, MPa']])
    # Index of the entry with the largest magnitude. If either entry is NaN
    # the comparison matches nothing and the result is empty.
    which_max_stress = np.where(np.abs(s) == np.max(np.abs(s)))[0]
    if len(which_max_stress) == 0:
        # Fall back to entry 0 when s[1] is NaN, otherwise to entry 1.
        which_max_stress = 1 - int(np.isnan(s[1])) # when nan appears in s
    else:
        which_max_stress = which_max_stress[0]

    # Normalise by the static tensile reference when the dominant stress is
    # positive, otherwise by the static compressive reference.
    relative_to = np.abs(df_fatigue.loc[idx,'Static Max. Stress, MPa']) if s[which_max_stress] > 0 else np.abs(df_fatigue.loc[idx,'Static Min. Stress, MPa'])
    # If that reference is missing, the dominant stress is positive and
    # max + min is below 1e-5, use the compressive reference instead
    # (presumably aimed at fully reversed loading — confirm).
    if np.isnan(relative_to) and s[0] + s[1] < 1e-5 and s[which_max_stress] > 0:
        relative_to = np.abs(df_fatigue.loc[idx,'Static Min. Stress, MPa'])
    # The absolute maximum keeps its sign; the relative value is divided by a
    # magnitude, so it keeps that sign too.
    df_fatigue.loc[idx,'Absolute Maximum Stress'] = s[which_max_stress]
    df_fatigue.loc[idx,'Relative Maximum Stress'] = s[which_max_stress]/relative_to

    # Peak-to-peak range; when one of the two stresses is missing, the
    # magnitude of the available one is used instead.
    p2p = np.abs(s[0]-s[1])
    if np.isnan(p2p):
        p2p = np.abs(s[1 - int(np.isnan(s[1]))])

    df_fatigue.loc[idx,'Absolute Peak-to-peak Stress'] = p2p
    df_fatigue.loc[idx,'Relative Peak-to-peak Stress'] = p2p/relative_to

# Persist the fatigue table including the static and derived columns.
df_fatigue.to_excel('../data/SNL_MSU_DOE_fatigue.xlsx', engine='openpyxl', index=False)

### Calculate and plot absence ratios

In [9]:
# Fatigue-table columns that have no entry in name_mapping are reported and then dropped.
col_to_del = [x for x in df_fatigue.columns if x not in name_mapping.keys()]
print('Deleted features', col_to_del)

# NOTE(review): replace_column_name is defined outside this notebook
# (presumably in utils); whether it returns a copy or renames df_fatigue in
# place cannot be told from here. If it returns the same object, the `del`
# below also removes the columns from df_fatigue.
df_tmp = replace_column_name(df_fatigue, name_mapping)
for col in col_to_del:
    del df_tmp[col]

# calculate_absence_ratio is an external helper; the saved output of the next
# cell shows a frame with 'feature' and 'ratio' columns.
df_presence = calculate_absence_ratio(df_tmp)

# NOTE(review): `sns` is not imported explicitly in this notebook; presumably
# it arrives through `from utils import *` — confirm.
clr = sns.color_palette("deep")

plt.figure(figsize=(5,4),dpi=300)
ax = plt.subplot(111)
plot_absence_ratio(ax, df_presence, orient='h', palette=clr, linewidth=1, edgecolor=[0, 0, 0])
plt.tight_layout()

# Write the figure for the processed fatigue table, then release it.
plt.savefig('../output/absence_ratio.png')
plt.close()

Deleted features ['Lay-up', 'Resin', '0 Deg fabric', '45 Deg fabric', '90 deg fabric', 'Cure / Post Cure', 'Process', 'Test #', 'Coupon', 'Runout', 'Cycles for 50% strain increase', 'Cycles for 100% strain increase', 'Cycles for 25% reduction in Modulus', 'Cycles for 1% strain', 'Initial cracking strain, %', 'Modulus after initial cracking, GPa']




In [10]:
# Display the absence-ratio table computed for the processed fatigue data.
df_presence

Unnamed: 0,feature,ratio
0,Moisture Gain,0.949038
1,Temperature,0.924873
2,Static Maximum Compressive Strain,0.767549
3,Initial Elastic Modulus,0.571429
4,Width,0.544797
5,Minimum Strain,0.534111
6,Static Maximum Compressive Stress,0.333553
7,Maximum Strain,0.28144
8,Static Elastic Modulus,0.14746
9,Static Maximum Tensile Strain,0.145487
