# Project

## Data Preparation 

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Load the materials table from the CSV file in the local data folder.
data = pd.read_csv(filepath_or_buffer='./data/Data.csv')

In [13]:
# Display the loaded frame; pandas abbreviates it to the first and last rows.
data

Unnamed: 0,Std,ID,Material,Heat treatment,Su,Sy,A5,Bhn,E,G,mu,Ro,pH,Desc,HV
0,ANSI,D8894772B88F495093C43AF905AB6373,Steel SAE 1015,as-rolled,421,314,39.0,126.0,207000,79000,0.30,7860,,,
1,ANSI,05982AC66F064F9EBC709E7A4164613A,Steel SAE 1015,normalized,424,324,37.0,121.0,207000,79000,0.30,7860,,,
2,ANSI,356D6E63FF9A49A3AB23BF66BAC85DC3,Steel SAE 1015,annealed,386,284,37.0,111.0,207000,79000,0.30,7860,,,
3,ANSI,1C758F8714AC4E0D9BD8D8AE1625AECD,Steel SAE 1020,as-rolled,448,331,36.0,143.0,207000,79000,0.30,7860,,,
4,ANSI,DCE10036FC1946FC8C9108D598D116AD,Steel SAE 1020,normalized,441,346,35.8,131.0,207000,79000,0.30,7860,550.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1547,JIS,512A80EC21EA416BA2725B38BA8096EF,Nodular cast iron,,600,370,,,169000,70000,0.20,7160,480.0,Nodular cast iron,210.0
1548,JIS,38526441BA8741CA979DBF870D0B8A9B,Nodular cast iron,,700,420,,,169000,70000,0.20,7160,560.0,Nodular cast iron,230.0
1549,JIS,CAC03D7EB1AA45E68EFF92A2EF4C3D9B,Nodular cast iron,,800,480,,,169000,70000,0.20,7160,600.0,Nodular cast iron,240.0
1550,JIS,45C82A36EC644F8BB6170A99ED819B62,Malleable cast iron,,400,180,4.0,,160000,64000,0.27,7160,300.0,Malleable cast iron,220.0


### About Data
The material dataset contains the following columns (identifiers, descriptive fields, and mechanical/physical properties):

1. Standard (Std)
2. Unique Identification code for the Material (ID)
3. Material Name
4. Heat Treatment Method
5. Ultimate Tensile Strength (Su) in MPa
6. Yield Strength (Sy) in MPa
7. Elongation at Break or Strain (A5) as a Percentage
8. Brinell Hardness Number (Bhn) in HB
9. Elastic Modulus (E) in MPa
10. Shear Modulus (G) in MPa
11. Poisson's Ratio (mu), dimensionless
12. Density (Ro) in kg/m³
13. Pressure at Yield (pH) in MPa
14. Description of the Material (Desc)
15. Vickers Hardness Number (HV)

In [14]:
# Summarise dtypes and non-null counts per column to spot missing data
# and columns that were read with an unexpected dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1552 entries, 0 to 1551
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Std             1552 non-null   object 
 1   ID              1552 non-null   object 
 2   Material        1552 non-null   object 
 3   Heat treatment  802 non-null    object 
 4   Su              1552 non-null   int64  
 5   Sy              1552 non-null   object 
 6   A5              1346 non-null   float64
 7   Bhn             463 non-null    float64
 8   E               1552 non-null   int64  
 9   G               1552 non-null   int64  
 10  mu              1552 non-null   float64
 11  Ro              1552 non-null   int64  
 12  pH              193 non-null    float64
 13  Desc            981 non-null    object 
 14  HV              165 non-null    float64
dtypes: float64(5), int64(4), object(6)
memory usage: 182.0+ KB


In [15]:
# Number of missing entries in each column
data.isnull().sum()

Std                  0
ID                   0
Material             0
Heat treatment     750
Su                   0
Sy                   0
A5                 206
Bhn               1089
E                    0
G                    0
mu                   0
Ro                   0
pH                1359
Desc               571
HV                1387
dtype: int64

In [18]:
# Drop the record identifier and the three sparsest columns
# (HV, pH and Bhn are each missing in more than 1000 of the 1552 rows).
# Rebinding `data` instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution
# is a no-op rather than a KeyError on the already-removed columns.
data = data.drop(columns=['ID', 'HV', 'pH', 'Bhn'], errors='ignore')

In [19]:
# Show the first five rows of the remaining columns.
data.head(n=5)

Unnamed: 0,Std,Material,Heat treatment,Su,Sy,A5,E,G,mu,Ro,Desc
0,ANSI,Steel SAE 1015,as-rolled,421,314,39.0,207000,79000,0.3,7860,
1,ANSI,Steel SAE 1015,normalized,424,324,37.0,207000,79000,0.3,7860,
2,ANSI,Steel SAE 1015,annealed,386,284,37.0,207000,79000,0.3,7860,
3,ANSI,Steel SAE 1020,as-rolled,448,331,36.0,207000,79000,0.3,7860,
4,ANSI,Steel SAE 1020,normalized,441,346,35.8,207000,79000,0.3,7860,


In [20]:
# Distinct non-missing values per column (NaN is not counted here).
data.nunique()

Std                  8
Material          1225
Heat treatment      44
Su                 309
Sy                 291
A5                  98
E                   43
G                   33
mu                  11
Ro                  35
Desc                83
dtype: int64

In [21]:
# Split the columns by dtype: object ('O') columns are treated as categorical,
# every other dtype as numerical.
# NOTE: 'Sy' is stored with object dtype, so it lands in the categorical group.
numeric_features, categorical_features = [], []
for feature in data.columns:
    if data[feature].dtype == 'O':
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)

# Report the size and members of each group
feature_reports = (
    ('We have {} numerical features : {}', numeric_features),
    ('\nWe have {} categorical features : {}', categorical_features),
)
for template, members in feature_reports:
    print(template.format(len(members), members))

We have 6 numerical features : ['Su', 'A5', 'E', 'G', 'mu', 'Ro']

We have 5 categorical features : ['Std', 'Material', 'Heat treatment', 'Sy', 'Desc']


In [24]:
# Print the distinct values of each object-typed column.
# The prefixes are kept exactly as they were, including their spacing,
# so the printed text is unchanged.
category_prefixes = (
    ("Categories in 'Standard (Std)' variable:     ", 'Std'),
    ("Categories in 'Material' variable:  ", 'Material'),
    ("Categories in'Heat Treatment' variable:", 'Heat treatment'),
    ("Categories in 'Yield Strength (Sy)' variable:     ", 'Sy'),
    ("Categories in 'Description of material' variable:     ", 'Desc'),
)
for prefix, column in category_prefixes:
    # print() joins its arguments with a single space, matching end=" "
    print(prefix, data[column].unique())

Categories in 'Standard (Std)' variable:      ['ANSI' 'ISO' 'DIN' 'BS' 'CSN' 'GOST' 'NF' 'JIS']
Categories in 'Material' variable:   ['Steel SAE 1015' 'Steel SAE 1020' 'Steel SAE 1022' ... 'JIS SUH330'
 'JIS SUH310' 'JIS SUH35']
Categories in'Heat Treatment' variable: ['as-rolled' 'normalized' 'annealed' 'tempered at 400 F'
 'tempered at 600 F' 'tempered at 800 F' '1/4-hard' '1/2-hard' '3/4-hard'
 'Full-hard' nan 'As hot rolled' 'As extruded' 'Cast (T7)' 'Cast (T4)'
 'Cast (F)' 'Cast (T61)' 'Cast (T77)' 'Cast (T6)' 'Cast (T62)'
 'Cast (T51)' 'Cast (T71)' 'Cast (F or T5)' 'Cast (T5)' 'Cast (T52)'
 'Cast (T53)' 'Wrought' 'sand casting' 'pressure die casting'
 'heat treated' 'case-hardened' 'face hardened' 'nitro-carburized'
 'nitro-case-hard.' 'nitrided' 'Heat-treatment nitrided steel'
 'Cold working' 'Annealing or high tempering' 'Improved'
 'Quenched and tempered' 'Quenching and cooling in the water'
 'Quenching and cooling in the oil' 'Quenching and heating'
 'Case hardening,quenching

In [25]:
# Print how many distinct entries each object-typed column holds.
# dropna=False counts NaN as its own entry (same as len(series.unique())),
# so columns with missing values come out one higher than in data.nunique().
total_prefixes = (
    ("Total Categories in 'Standard (Std)' variable:     ", 'Std'),
    ("Total Categories in 'Material' variable:  ", 'Material'),
    ("Total Categories in'Heat Treatment' variable:", 'Heat treatment'),
    ("Total Categories in 'Yield Strength (Sy)' variable:     ", 'Sy'),
    ("Total Categories in 'Description of material' variable:     ", 'Desc'),
)
for prefix, column in total_prefixes:
    # print() joins its arguments with a single space, matching end=" "
    print(prefix, data[column].nunique(dropna=False))

Total Categories in 'Standard (Std)' variable:      8
Total Categories in 'Material' variable:   1225
Total Categories in'Heat Treatment' variable: 45
Total Categories in 'Yield Strength (Sy)' variable:      291
Total Categories in 'Description of material' variable:      84


## Data Cleaning 

## EDA - exploratory data analysis

### Univariate Analysis

#### Statistical Feature

#### Distribution

### Bivariate Analysis

### Multivariate Analysis