# [R을 이용한 데이터 wrangling](https://hyunyulhenry.github.io/data_wrangling/numbers.html)

## 데이터 구조 다루기

> [library(stringr)](https://stringr.tidyverse.org/)

> [library(lubridate)](https://lubridate.tidyverse.org/)

> [library(forcats)](https://forcats.tidyverse.org/)

# Data Structure Basics

|Dimension	|Homogeneous|	Heterogeneous|
|:-:|:-:|:-:|
|1D	|Atomic Vector	|List|
|2D	|Matrix	|Data frame|
|nD	|Array	||

In [14]:
# One example of each basic structure: atomic vector, list, and matrix.
vectors <- 1:10
# NOTE(review): "itme2" looks like a typo for "item2"; the name is kept as-is
# because the recorded str() output in this notebook shows it.
lists <- list(item1 = 1:10, itme2 = LETTERS[1:18])
matrixs <- matrix(1:12, nrow = 4)

# Data frame: two equal-length columns of different types.
df <- data.frame(item1 = 1:10, item2 = LETTERS[1:10])

In [15]:
vectors
lists
matrixs

0,1,2
1,5,9
2,6,10
3,7,11
4,8,12


In [16]:
df

item1,item2
<int>,<chr>
1,A
2,B
3,C
4,D
5,E
6,F
7,G
8,H
9,I
10,J


In [6]:
1:18

In [10]:
length(LETTERS)

In [11]:
# Select every element by position; seq_along() is the safe form of
# 1:length(x) (it yields an empty index for a zero-length vector).
LETTERS[seq_along(LETTERS)]

ERROR: ignored

In [23]:
# Compactly display the internal structure of the list, the matrix, and the data frame.
str(lists)
str(matrixs)
str(df)

List of 2
 $ item1: int [1:10] 1 2 3 4 5 6 7 8 9 10
 $ itme2: chr [1:18] "A" "B" "C" "D" ...
 int [1:4, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
'data.frame':	10 obs. of  2 variables:
 $ item1: int  1 2 3 4 5 6 7 8 9 10
 $ item2: chr  "A" "B" "C" "D" ...


# Vector
- integer
- double
- logical
- character

In [24]:
# The colon operator on whole numbers produces an integer vector.
vec_integer <- 8:17
str(vec_integer)

 int [1:10] 8 9 10 11 12 13 14 15 16 17


In [26]:
# Inspect the integer vector: storage type, class, and number of elements.
typeof(vec_integer)
class(vec_integer)
length(vec_integer)
# nchar() returns the character count of each element's text form.
nchar(vec_integer)

In [27]:
# Decimal literals are stored as doubles.
vec_double <- c(0.5, 0.6, 0.2)
str(vec_double)

 num [1:3] 0.5 0.6 0.2


In [28]:
# Inspect the double vector: storage type, class, and number of elements.
typeof(vec_double)
class(vec_double)
length(vec_double)
# nchar() returns the character count of each element's text form.
nchar(vec_double)

In [29]:
# A vector of TRUE/FALSE values.
vec_logical <- c(TRUE, FALSE, TRUE)
str(vec_logical)

 logi [1:3] TRUE FALSE TRUE


In [30]:
# Inspect the logical vector: storage type, class, and number of elements.
typeof(vec_logical)
class(vec_logical)
length(vec_logical)
# nchar() returns the character count of each element's text form.
nchar(vec_logical)

In [33]:
# Coerce logical to numeric: TRUE becomes 1 and FALSE becomes 0.
as.double(vec_logical)
as.integer(vec_logical)

In [35]:
# Coerce numeric to logical: non-zero values become TRUE (every element here is non-zero).
as.logical(vec_double)
as.logical(vec_integer)

In [36]:
# Mixing strings and numbers in c() coerces every element to character.
vec_char <- c("a", "b", 1, 2)
str(vec_char)

 chr [1:4] "a" "b" "1" "2"


In [37]:
# Confirm the storage type and class after coercion to character.
typeof(vec_char)
class(vec_char)

In [38]:
# Mixing numbers and logicals in c() coerces the logicals to numeric (1 / 0).
vec_num <- c(1, 2, TRUE, FALSE)
str(vec_num)

 num [1:4] 1 2 1 0


In [39]:
# Confirm the storage type and class after coercion to numeric.
typeof(vec_num)
class(vec_num)

In [40]:
# Mixing strings and logicals in c() coerces the logicals to character.
vec_log <- c("a", "b", TRUE, FALSE)
str(vec_log)

 chr [1:4] "a" "b" "TRUE" "FALSE"


In [42]:
vec <- 15:24
# The concatenation is only displayed; `vec` itself is not reassigned.
c(vec, 10:14)
# `vec` still has 10 elements, so position 15 is out of range and yields NA.
vec[15]
# First five elements by position.
vec[1:5]

In [44]:
# Pick arbitrary positions with an index vector.
vec[c(1, 3, 5)]
# Every third position from 1 up to 15.
vec[seq(from = 1, to = 15, by = 3)]

In [46]:
vec[-1] # everything except the element at that position
vec[0] # index 0 selects nothing, so no elements are returned

In [58]:
# Drop the first five positions.
vec[-(1:5)]

In [54]:
# Logical masks: `&` and `|` combine the comparisons element by element.
vec[vec >= 18 & vec < 20] # and
vec[vec < 16 | vec > 22] # or

# List

element 간 타입이나 길이가 달라도 결합 가능

In [69]:
# Five uniform draws on [0, 1], scaled to [0, 10] and rounded to 2 decimals.
# Pass the count directly: runif(1:5) only works because a vector `n` is
# replaced by its length.
round(runif(5, min = 0, max = 1) * 10, digits = 2)

In [70]:
# Five random values in [0, 10], rounded to 2 decimals. The count is passed
# directly instead of the vector 1:5 (runif() would only use its length).
x <- round(runif(5) * 10, digits = 2)
# A list may hold elements of different types and lengths; the negative index
# drops the first 24 letters, leaving "Y" and "Z".
li <- list(x, c(TRUE, FALSE), LETTERS[-(1:24)])
str(li)

List of 3
 $ : num [1:5] 3.67 7.45 7.51 9.26 3.74
 $ : logi [1:2] TRUE FALSE
 $ : chr [1:2] "Y" "Z"


In [72]:
# Inspect the list: class, storage type, and number of top-level elements.
class(li)
typeof(li)
length(li)
# NOTE(review): nchar() on a list appears to measure each element's text
# representation rather than the element's length — confirm before relying on it.
nchar(li)

In [73]:
# Appending an atomic vector adds each of its values as a separate list element.
li2 <- append(li, c(1, 0.5))

In [74]:
li2

In [75]:
# Wrapping the value in list(list(...)) appends it as one nested list element.
li3 <- append(li2, list(list(c(TRUE, FALSE))))
li3

In [76]:
str(li3)

List of 6
 $ : num [1:5] 3.67 7.45 7.51 9.26 3.74
 $ : logi [1:2] TRUE FALSE
 $ : chr [1:2] "Y" "Z"
 $ : num 1
 $ : num 0.5
 $ :List of 1
  ..$ : logi [1:2] TRUE FALSE


In [78]:
# Assigning to a new name with `$` appends a named element to the list.
li3$item4 <- "new list items"
str(li3)

List of 7
 $      : num [1:5] 3.67 7.45 7.51 9.26 3.74
 $      : logi [1:2] TRUE FALSE
 $      : chr [1:2] "Y" "Z"
 $      : num 1
 $      : num 0.5
 $      :List of 1
  ..$ : logi [1:2] TRUE FALSE
 $ item4: chr "new list items"


In [79]:
li3[length(li3)]

In [80]:
li3$item4

In [89]:
# `[` keeps the container: a list of length one holding the sixth element.
li3[6]
# `[[` extracts the sixth element itself (here, the nested list).
li3[[6]]
# The nested list holds a single element, so position 2 is out of range.
li3[[6]][2]

# Matrix

In [90]:
# Matrices are filled column by column; every element shares one type.
mat1 <- matrix(1:6, nrow = 2, ncol = 3)
mat2 <- matrix(letters[1:6], nrow = 2, ncol = 3)
mat1
mat2

0,1,2
1,3,5
2,4,6


0,1,2
a,c,e
b,d,f


In [93]:
vec1 <- 1:4
vec2 <- 5:8
# Bind as columns: each vector becomes one column.
cbind(vec1, vec2)
# Bind as rows: each vector becomes one row.
rbind(vec1, vec2)

vec1,vec2
1,5
2,6
3,7
4,8


0,1,2,3,4
vec1,1,2,3,4
vec2,5,6,7,8


In [94]:
# Add a third column to the bound matrix; the unnamed vector gets no column name.
cbind(cbind(vec1, vec2), 9:12)

vec1,vec2,Unnamed: 2
1,5,9
2,6,10
3,7,11
4,8,12


In [96]:
# Add a third row to the bound matrix; the unnamed vector gets no row name.
rbind(rbind(vec1, vec2), 9:12)

0,1,2,3,4
vec1,1,2,3,4
vec2,5,6,7,8
,9,10,11,12


In [98]:
mat1

0,1,2
1,3,5
2,4,6


In [99]:
# Label the rows and columns of the matrix.
rownames(mat1) <- c("r1", "r2")
colnames(mat1) <- c("c1", "c2", "c3")
mat1

Unnamed: 0,c1,c2,c3
r1,1,3,5
r2,2,4,6


In [100]:
# length() of a matrix is its total number of cells.
length(mat1)
# nchar() keeps the matrix shape and gives the character count of each cell.
nchar(mat1)

Unnamed: 0,c1,c2,c3
r1,1,1,1
r2,1,1,1


In [105]:
# Matrix subsetting is [rows, columns]; an empty position selects everything.
mat1[1, 2:3]
mat1[2, c(1,3)]
mat1[, 2:3]
mat1[,]

Unnamed: 0,c2,c3
r1,3,5
r2,4,6


Unnamed: 0,c1,c2,c3
r1,1,3,5
r2,2,4,6


# Data Frame

In [109]:
# Data frame with one column per basic type: integer, character, logical, double.
df <- data.frame(
  col1 = 1:3,
  col5 = c("This", "is", "Text"),
  col8 = c(TRUE, FALSE, TRUE),
  col10 = c(2.4, 42, pi)
  # stringsAsFactors = FALSE is not passed here: older R versions converted
  # character columns to factors unless it was set, but FALSE has been the
  # default since R 4.0 (col5 stays chr in the str() output).
)
str(df)

'data.frame':	3 obs. of  4 variables:
 $ col1 : int  1 2 3
 $ col5 : chr  "This" "is" "Text"
 $ col8 : logi  TRUE FALSE TRUE
 $ col10: num  2.4 42 3.14


In [110]:
df

col1,col5,col8,col10
<int>,<chr>,<lgl>,<dbl>
1,This,True,2.4
2,is,False,42.0
3,Text,True,3.141593


In [112]:
?data.frame

In [111]:
# The same four columns held in a plain named list.
li <- list(
  col1 = 1:3,
  col5 = c("This", "is", "Text"),
  col8 = c(TRUE, FALSE, TRUE),
  col10 = c(2.4, 42, pi)
)
str(li)

List of 4
 $ col1 : int [1:3] 1 2 3
 $ col5 : chr [1:3] "This" "is" "Text"
 $ col8 : logi [1:3] TRUE FALSE TRUE
 $ col10: num [1:3] 2.4 42 3.14


In [113]:
as.data.frame(li)

col1,col5,col8,col10
<int>,<chr>,<lgl>,<dbl>
1,This,True,2.4
2,is,False,42.0
3,Text,True,3.141593


In [114]:
# Add a column: the variable name becomes the new column's name.
letter <- letters[1:3]
cbind(df, letter)

col1,col5,col8,col10,letter
<int>,<chr>,<lgl>,<dbl>,<chr>
1,This,True,2.4,a
2,is,False,42.0,b
3,Text,True,3.141593,c


In [115]:
df

col1,col5,col8,col10
<int>,<chr>,<lgl>,<dbl>
1,This,True,2.4
2,is,False,42.0
3,Text,True,3.141593


In [116]:
# Caution: c() coerces the mixed values to character, so binding this row
# turns every column into character (see the <chr> types in the output).
rbind(df, c(4, "R", FALSE, 1.1))

col1,col5,col8,col10
<chr>,<chr>,<chr>,<chr>
1,This,True,2.4
2,is,False,42.0
3,Text,True,3.14159265358979
4,R,False,1.1


In [117]:
T

In [118]:
# Keep the result as a one-column data frame.
df[, 2, drop = FALSE]

col5
<chr>
This
is
Text


In [119]:
df

col1,col5,col8,col10
<int>,<chr>,<lgl>,<dbl>
1,This,True,2.4
2,is,False,42.0
3,Text,True,3.141593


In [122]:
# Drop the data frame wrapper and return a plain vector; this is the default.
df[, 2, drop = TRUE]
df[, 2]

In [124]:
# Compare the structure of both forms.
str(df[, 2]) # character vector
str(df[, 2, drop = FALSE]) # data frame

 chr [1:3] "This" "is" "Text"
'data.frame':	3 obs. of  1 variable:
 $ col5: chr  "This" "is" "Text"


# 결측치 테스트

- is.na()

In [128]:
# A numeric vector with a missing value at each end.
x <- c(NA, rep(seq(from = 1, to = 10, by = 5), times = 2), NA)
x
# TRUE wherever the element is missing.
is.na(x)

In [134]:
# Missing-value flags as 1 / 0.
as.integer(is.na(x))
# Summing the flags counts the missing values (TRUE counts as 1).
sum(is.na(x))

In [137]:
# Flag missing cells across the whole data frame. The unnamed column takes its
# name from the expression text, as the "c(F, T, NA)" header in the output shows.
is.na(cbind(df, c(F,T,NA)))

col1,col5,col8,col10,"c(F, T, NA)"
False,False,False,False,False
False,False,False,False,False
False,False,False,False,True


In [138]:
# Positions of the missing cells, counted down the columns of the logical matrix.
which(is.na(cbind(df, c(F,T,NA))))

In [139]:
y <- c(5, 10, NA, 20)
# A single NA makes the mean NA ...
mean(y)
# ... unless missing values are removed first.
mean(y, na.rm = TRUE)

In [146]:
# Fill missing values with the mean of the observed values.

y[is.na(y)]
mean(y, na.rm = TRUE)
y[is.na(y)] <- mean(y, na.rm = TRUE)
y

In [143]:
# Drop every row that contains a missing value (the third row here).
na.omit(cbind(df, c(F,T,NA)))

Unnamed: 0_level_0,col1,col5,col8,col10,"c(F, T, NA)"
Unnamed: 0_level_1,<int>,<chr>,<lgl>,<dbl>,<lgl>
1,1,This,True,2.4,False
2,2,is,False,42.0,True
