In [4]:
#### Data frames are the workhorse of R, so be sure to understand all of the following topics
#### #1 Creating Data Frames
#### #2 Importing and Exporting Data
#### #3 Getting Information about Data Frames
#### #4 Referencing Cells
#### #5 Referencing Rows
#### #6 Referencing Columns
#### #7 Adding Rows
#### #8 Adding Columns
#### #9 Setting Column Names
#### #10 Selecting Multiple Rows
#### #11 Selecting Multiple Columns
#### #12 Dealing with Missing Data

In [5]:
#### #1 Creating Data Frames
# Let's first create an empty data frame (data.frame() called with no arguments)
empty <- data.frame()

In [28]:
# Let's now create a numeric vector of 10 random values using runif()
c1 <- runif(n = 10)

In [29]:
# The built-in vector 'letters' holds the lowercase alphabet; take its first ten entries
c2 <- head(letters, 10)

In [30]:
# Typing a variable name on its own prints its contents
c1
c2

In [31]:
# Let's combine the two vectors into a data frame; the argument names become the column names
df <- data.frame(numbers = c1, letters = c2)
df

numbers,letters
<dbl>,<fct>
0.38338091,a
0.69682516,b
0.20176775,c
0.69023435,d
0.02096354,e
0.14099527,f
0.24402258,g
0.79013028,h
0.60132246,i
0.96245307,j


In [33]:
#### #2 Importing and Exporting Data
# Note: by default write.csv() also writes the row names as an unnamed first column,
# which is why the data frame read back below has an extra column called X.
# Pass row.names = FALSE to write.csv() if you don't want that column.
write.csv(df,file = 'saved_df.csv') # this saves the data frame as a csv file in the working directory
df2 <- read.csv('saved_df.csv') # this reads the csv file back in and assigns the resulting data frame to a new variable
df2

X,numbers,letters
<int>,<dbl>,<fct>
1,0.38338091,a
2,0.69682516,b
3,0.20176775,c
4,0.69023435,d
5,0.02096354,e
6,0.14099527,f
7,0.24402258,g
8,0.79013028,h
9,0.60132246,i
10,0.96245307,j


In [34]:
#### #3 Getting Information about Data Frames
# Let's say we want to know the number of columns and rows
# we can use the nrow() and ncol() functions to check both values

In [35]:
nrow(df) # nrow() and ncol() return only the counts, not the names of the rows or columns
ncol(df)

In [41]:
colnames(df2)
rownames(df2) # be careful: this returns one name per row, which can be a massive amount of output for a large data frame
# for data imported from a csv without explicit row names, these are just the row numbers ("1", "2", ...)

In [42]:
# str() shows the structure: the number of rows and columns, plus each column's type and first few values
str(df)

'data.frame':	10 obs. of  2 variables:
 $ numbers: num  0.383 0.697 0.202 0.69 0.021 ...
 $ letters: Factor w/ 10 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10


In [43]:
summary(df) # the numeric column gets min, quartiles, median, mean and max; the factor column just gets a count for each letter

    numbers           letters 
 Min.   :0.02096   a      :1  
 1st Qu.:0.21233   b      :1  
 Median :0.49235   c      :1  
 Mean   :0.47321   d      :1  
 3rd Qu.:0.69518   e      :1  
 Max.   :0.96245   f      :1  
                   (Other):4  

In [45]:
#### #4 Referencing Cells
df
df[5,2] # [row, column]: the same notation as indexing a matrix, but not incredibly useful since we would have to know the column's position

numbers,letters
<dbl>,<fct>
0.38338091,a
0.69682516,b
0.20176775,c
0.69023435,d
0.02096354,e
0.14099527,f
0.24402258,g
0.79013028,h
0.60132246,i
0.96245307,j


In [49]:
df[5,'numbers'] # this returns the value in the 5th row of the 'numbers' column, so we don't need to know the column's position

In [52]:
# let's change a value using an assignment (row 2 of the 'numbers' column)
df[[2,'numbers']] <- 999
df

numbers,letters
<dbl>,<fct>
0.38338091,a
999.0,b
0.20176775,c
0.69023435,d
0.02096354,e
0.14099527,f
0.24402258,g
0.79013028,h
0.60132246,i
0.96245307,j


In [53]:
#### #5 Referencing Rows
# referencing a row is similar to referencing a cell; you just don't need double-bracket notation, and you leave the column index empty

In [55]:
# row 3, all columns (the column index after the comma is left empty)
df[3,]

Unnamed: 0_level_0,numbers,letters
Unnamed: 0_level_1,<dbl>,<fct>
3,0.2017678,c


In [56]:
#### #6 Referencing Columns
# let's use a different data frame for this example: the built-in mtcars data set
head(mtcars)

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1


In [64]:
mtcars$mpg # This is how you can get a column back as a vector. This is the most common way
mtcars[,'mpg'] # this is a similar way to pull the column; it also returns a vector
mtcars[['mpg']] # double brackets with the column name return a vector as well
mtcars['mpg'] # single brackets with just the column name return a one-column data frame (the column name and row names are kept)

Unnamed: 0_level_0,mpg
Unnamed: 0_level_1,<dbl>
Mazda RX4,21.0
Mazda RX4 Wag,21.0
Datsun 710,22.8
Hornet 4 Drive,21.4
Hornet Sportabout,18.7
Valiant,18.1
Duster 360,14.3
Merc 240D,24.4
Merc 230,22.8
Merc 280,19.2


In [65]:
mtcars[c('mpg','cyl')] # pass a character vector of names if you want to pull multiple columns (the result is a data frame)

Unnamed: 0_level_0,mpg,cyl
Unnamed: 0_level_1,<dbl>,<dbl>
Mazda RX4,21.0,6
Mazda RX4 Wag,21.0,6
Datsun 710,22.8,4
Hornet 4 Drive,21.4,6
Hornet Sportabout,18.7,8
Valiant,18.1,6
Duster 360,14.3,8
Merc 240D,24.4,4
Merc 230,22.8,4
Merc 280,19.2,6


In [None]:
#### #7 Adding Rows

In [73]:
# Build a one-row data frame with the same column names as df, then show both
df3 <- data.frame(numbers = 2000, letters = "z")
df3
df

numbers,letters
<dbl>,<fct>
2000,z


numbers,letters
<dbl>,<fct>
0.38338091,a
999.0,b
0.20176775,c
0.69023435,d
0.02096354,e
0.14099527,f
0.24402258,g
0.79013028,h
0.60132246,i
0.96245307,j


In [76]:
# rbind() stacks the rows of df3 underneath the rows of df
dfnew <- rbind(df,df3)
dfnew

numbers,letters
<dbl>,<fct>
0.3833809,a
999.0,b
0.2017678,c
0.6902343,d
0.02096354,e
0.1409953,f
0.2440226,g
0.7901303,h
0.6013225,i
0.9624531,j


In [77]:
#### #8 Adding Columns
# Assigning to a column name that does not exist yet creates that column
dfnew[["newcol"]] <- round(dfnew[["numbers"]] * 200, digits = 1)

In [78]:
dfnew

numbers,letters,newcol
<dbl>,<fct>,<dbl>
0.3833809,a,76.7
999.0,b,199800.0
0.2017678,c,40.4
0.6902343,d,138.0
0.02096354,e,4.2
0.1409953,f,28.2
0.2440226,g,48.8
0.7901303,h,158.0
0.6013225,i,120.3
0.9624531,j,192.5


In [82]:
#### #9 Setting Column Names
colnames(dfnew)
colnames(dfnew) <- c('Numbers new name', 'letters new name',"200x rounded") # assign a full vector of names if we want to rename all the columns at once
dfnew

Numbers new name,letters new name,200x rounded
<dbl>,<fct>,<dbl>
0.3833809,a,76.7
999.0,b,199800.0
0.2017678,c,40.4
0.6902343,d,138.0
0.02096354,e,4.2
0.1409953,f,28.2
0.2440226,g,48.8
0.7901303,h,158.0
0.6013225,i,120.3
0.9624531,j,192.5


In [84]:
# to rename a single column, assign to its position within colnames()
colnames(dfnew)[1] <- 'small numbers'
colnames(dfnew)[3] <- 'large numbers'
dfnew

small numbers,letters new name,large numbers
<dbl>,<fct>,<dbl>
0.3833809,a,76.7
999.0,b,199800.0
0.2017678,c,40.4
0.6902343,d,138.0
0.02096354,e,4.2
0.1409953,f,28.2
0.2440226,g,48.8
0.7901303,h,158.0
0.6013225,i,120.3
0.9624531,j,192.5


In [85]:
#### #10 Selecting Multiple Rows

In [87]:
dfnew[1:3,] # selects rows 1, 2 and 3 (all columns)

Unnamed: 0_level_0,small numbers,letters new name,large numbers
Unnamed: 0_level_1,<dbl>,<fct>,<dbl>
1,0.3833809,a,76.7
2,999.0,b,199800.0
3,0.2017678,c,40.4


In [91]:
dfnew[(-2),] # a negative index selects everything BUT that row, here the second row

Unnamed: 0_level_0,small numbers,letters new name,large numbers
Unnamed: 0_level_1,<dbl>,<fct>,<dbl>
1,0.3833809,a,76.7
3,0.2017678,c,40.4
4,0.6902343,d,138.0
5,0.02096354,e,4.2
6,0.1409953,f,28.2
7,0.2440226,g,48.8
8,0.7901303,h,158.0
9,0.6013225,i,120.3
10,0.9624531,j,192.5
11,2000.0,z,400000.0


In [92]:
# head() with a second argument returns that many rows from the top of the data frame
head(mtcars,3) 

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1


In [97]:
# Let's try conditional selection: a logical vector in the row position keeps the rows where it is TRUE
mtcars[ mtcars$mpg < 15 , ]

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Duster 360,14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
Cadillac Fleetwood,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
Lincoln Continental,10.4,8,460,215,3.0,5.424,17.82,0,0,3,4
Chrysler Imperial,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
Camaro Z28,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4


In [103]:
# combine conditions with & (element-wise AND), wrapping each condition in parentheses
mtcars[ (mtcars$mpg > 20) & (mtcars$cyl == 6), ]

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1


In [None]:
mtcars[ (mtcars$mpg > 20) & (mtcars$cyl == 6), ]