In [74]:
# Pandas library를 import 함
import pandas as pd

In [75]:
rock_samples = pd.read_csv("../data/rocksamples.csv")
rock_samples.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2229 entries, 0 to 2228
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            2229 non-null   int64  
 1   Mission       2229 non-null   object 
 2   Type          2229 non-null   object 
 3   Subtype       2226 non-null   object 
 4   Weight (g)    2229 non-null   float64
 5   Pristine (%)  2229 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 104.6+ KB


In [76]:
# rock_samples 데이터프레임의 모양을 알아본다
rock_samples.shape

(2229, 6)

In [77]:
# rock_samples 데이터프레임 행수를 알아봄
rock_samples.shape[0]
# rock_samples 데이터프레임 컬럼수를 알아봄
rock_samples.shape[1]

6

In [78]:
# rock_samples 데이터 프레임 인덱스
rock_samples.index

RangeIndex(start=0, stop=2229, step=1)

In [79]:
# rock_samples 데이터프레임의 컬럼명을 알아본다.
rock_samples.columns

Index(['ID', 'Mission', 'Type', 'Subtype', 'Weight (g)', 'Pristine (%)'], dtype='object')

In [80]:
# rock_samples 데이터 타입 확인
rock_samples.dtypes

ID                int64
Mission          object
Type             object
Subtype          object
Weight (g)      float64
Pristine (%)    float64
dtype: object

In [81]:
# rock_samples 데이터프레임 수치 데이터의 요약 통계를 알아봄
rock_samples.describe()

Unnamed: 0,ID,Weight (g),Pristine (%)
count,2229.0,2229.0,2229.0
mean,52058.432032,168.253024,84.512764
std,26207.651471,637.286458,22.057299
min,10001.0,0.0,0.0
25%,15437.0,3.0,80.01
50%,65527.0,10.2,92.3
75%,72142.0,93.49,98.14
max,79537.0,11729.0,180.0


## 누락값 검사

In [82]:
# 1단계 -> rock_samples 데이터프레임에 있는 모든 값이 null인지 알아봄
rock_samples.isnull()

Unnamed: 0,ID,Mission,Type,Subtype,Weight (g),Pristine (%)
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
2224,False,False,False,False,False,False
2225,False,False,False,False,False,False
2226,False,False,False,False,False,False
2227,False,False,False,False,False,False


In [83]:
# 2단계 -> 1단계 결과인 데이터 프레임의 각 컬럼값들을 더한다. False: 0, True: 1
rock_samples.isnull().sum()

ID              0
Mission         0
Type            0
Subtype         3
Weight (g)      0
Pristine (%)    0
dtype: int64

In [84]:
rock_samples['Weight (g)'].apply(lambda x: x * 0.001)

0       0.12580
1       5.62900
2       0.21300
3       0.04480
4       0.05340
         ...   
2224    0.00238
2225    0.00184
2226    0.00169
2227    0.00166
2228    0.00105
Name: Weight (g), Length: 2229, dtype: float64

In [85]:
rock_samples.head(2)

Unnamed: 0,ID,Mission,Type,Subtype,Weight (g),Pristine (%)
0,10001,Apollo11,Soil,Unsieved,125.8,88.36
1,10002,Apollo11,Soil,Unsieved,5629.0,93.73


In [86]:
rock_samples['Weight (g)']

0        125.80
1       5629.00
2        213.00
3         44.80
4         53.40
         ...   
2224       2.38
2225       1.84
2226       1.69
2227       1.66
2228       1.05
Name: Weight (g), Length: 2229, dtype: float64

In [87]:
# 1단계 컬럼값 변환: rock_samples['Weight (g)'] -> rock_samples['Weight (kg)']
rock_samples['Weight (g)'] = rock_samples['Weight (g)'].apply(lambda x : x * 0.001)
rock_samples['Weight (g)']

0       0.12580
1       5.62900
2       0.21300
3       0.04480
4       0.05340
         ...   
2224    0.00238
2225    0.00184
2226    0.00169
2227    0.00166
2228    0.00105
Name: Weight (g), Length: 2229, dtype: float64

In [88]:
rock_samples['Weight (g)']

0       0.12580
1       5.62900
2       0.21300
3       0.04480
4       0.05340
         ...   
2224    0.00238
2225    0.00184
2226    0.00169
2227    0.00166
2228    0.00105
Name: Weight (g), Length: 2229, dtype: float64

In [89]:
rock_samples.head()

Unnamed: 0,ID,Mission,Type,Subtype,Weight (g),Pristine (%)
0,10001,Apollo11,Soil,Unsieved,0.1258,88.36
1,10002,Apollo11,Soil,Unsieved,5.629,93.73
2,10003,Apollo11,Basalt,Ilmenite,0.213,65.56
3,10004,Apollo11,Core,Unsieved,0.0448,71.76
4,10005,Apollo11,Core,Unsieved,0.0534,40.31


In [90]:
rock_samples.rename(columns={"Weight (g)": "Weight (kg)"}, inplace=True)

 ## 아폴로 임무별 데이터 프레임 만들기: missions

In [91]:
# 빈 데이터 프레임을 만들고 변수 missions에 할당한다.
missions = pd.DataFrame()
missions

In [92]:
type(missions)

pandas.core.frame.DataFrame

In [93]:
# rock_samples['Mission']의 중복되지 않은 고유값들을 알아봄
rock_samples['Mission'].unique()

array(['Apollo11', 'Apollo12', 'Apollo14', 'Apollo15', 'Apollo16',
       'Apollo17'], dtype=object)

In [94]:
# 위의 값들을 mission 데이터프레임의 새로운 데이터 컬럼인 missions['Mission]에 할당한다
missions['Mission'] = rock_samples['Mission'].unique()
missions

Unnamed: 0,Mission
0,Apollo11
1,Apollo12
2,Apollo14
3,Apollo15
4,Apollo16
5,Apollo17


In [95]:
# rock_samples 데이터프레임의 'mission' 컬럼을 기준으로 그룹을 나눈다 -> groupby()
# 나누어진 그룹에서 'Weight (kg)' 컬럼의 총합을 구한다 -> groupby('Mission')['Weight (kg)'].sum()
# 결국 아폴로 달탐사 임무별로 수집해온 암석 샘플 총 중량을 구해서 새로운 변수에 할당한다 -> 이 변수는 시리즈이다.
sample_total_weight = rock_samples.groupby('Mission')['Weight (kg)'].sum()
sample_total_weight

Mission
Apollo11     21.55424
Apollo12     34.34238
Apollo14     41.83363
Apollo15     75.39910
Apollo16     92.46262
Apollo17    109.44402
Name: Weight (kg), dtype: float64

In [96]:
# missions 데이터프레임과 sample_total_weight 시리즈를 병합함.
missions = pd.merge(missions, sample_total_weight, on='Mission')
missions

Unnamed: 0,Mission,Weight (kg)
0,Apollo11,21.55424
1,Apollo12,34.34238
2,Apollo14,41.83363
3,Apollo15,75.3991
4,Apollo16,92.46262
5,Apollo17,109.44402


In [97]:
missions.rename(columns={'Weight (kg)': 'Sample weight (kg)'}, inplace=True)
missions

Unnamed: 0,Mission,Sample weight (kg)
0,Apollo11,21.55424
1,Apollo12,34.34238
2,Apollo14,41.83363
3,Apollo15,75.3991
4,Apollo16,92.46262
5,Apollo17,109.44402


In [98]:
missions['Weight idff'] = missions['Sample weight (kg)'].diff()
missions

Unnamed: 0,Mission,Sample weight (kg),Weight idff
0,Apollo11,21.55424,
1,Apollo12,34.34238,12.78814
2,Apollo14,41.83363,7.49125
3,Apollo15,75.3991,33.56547
4,Apollo16,92.46262,17.06352
5,Apollo17,109.44402,16.9814


In [99]:
missions.fillna(value=0, inplace=True)
missions

Unnamed: 0,Mission,Sample weight (kg),Weight idff
0,Apollo11,21.55424,0.0
1,Apollo12,34.34238,12.78814
2,Apollo14,41.83363,7.49125
3,Apollo15,75.3991,33.56547
4,Apollo16,92.46262,17.06352
5,Apollo17,109.44402,16.9814


In [104]:
# 달탐사선 = 달모듈 + 명령모듈 -> 이 둘을 합쳐서 승무원 모듈
# 달 모듈의 이름, 중량, 아폴로 임무간 중량차를 나타내는 3개의 컬럼을 missions 데이터 프레임에 추가함.
달모듈이름 = ['Eagle (LM-5)', 'Interpid (LM-6)', \
         'Antares (LM-8)', 'Falcon (LM-10)', 'Orion (LM-11)', 'Challenger (LM-12)']
달모듈중량 = [15103, 15235, 15264, 16430, 16445, 16456]
missions['Lunar module (LM)'] = 달모듈이름
missions['LM mass (kg)'] = 달모듈중량
missions

Unnamed: 0,Mission,Sample weight (kg),Weight idff,Lunar module (LM),LM mass (kg)
0,Apollo11,21.55424,0.0,Eagle (LM-5),15103
1,Apollo12,34.34238,12.78814,Interpid (LM-6),15235
2,Apollo14,41.83363,7.49125,Antares (LM-8),15264
3,Apollo15,75.3991,33.56547,Falcon (LM-10),16430
4,Apollo16,92.46262,17.06352,Orion (LM-11),16445
5,Apollo17,109.44402,16.9814,Challenger (LM-12),16456


In [106]:
missions['LM mass diff'] = missions['LM mass (kg)'].diff()
missions

Unnamed: 0,Mission,Sample weight (kg),Weight idff,Lunar module (LM),LM mass (kg),LM mass diff
0,Apollo11,21.55424,0.0,Eagle (LM-5),15103,
1,Apollo12,34.34238,12.78814,Interpid (LM-6),15235,132.0
2,Apollo14,41.83363,7.49125,Antares (LM-8),15264,29.0
3,Apollo15,75.3991,33.56547,Falcon (LM-10),16430,1166.0
4,Apollo16,92.46262,17.06352,Orion (LM-11),16445,15.0
5,Apollo17,109.44402,16.9814,Challenger (LM-12),16456,11.0


In [107]:
missions.fillna(value=0, inplace=True)
missions

Unnamed: 0,Mission,Sample weight (kg),Weight idff,Lunar module (LM),LM mass (kg),LM mass diff
0,Apollo11,21.55424,0.0,Eagle (LM-5),15103,0.0
1,Apollo12,34.34238,12.78814,Interpid (LM-6),15235,132.0
2,Apollo14,41.83363,7.49125,Antares (LM-8),15264,29.0
3,Apollo15,75.3991,33.56547,Falcon (LM-10),16430,1166.0
4,Apollo16,92.46262,17.06352,Orion (LM-11),16445,15.0
5,Apollo17,109.44402,16.9814,Challenger (LM-12),16456,11.0
