In [1]:
library(tidyverse)
library(lubridate)

-- [1mAttaching packages[22m ------------------------------------------------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.2     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.4     [32mv[39m [34mdplyr  [39m 1.0.2
[32mv[39m [34mtidyr  [39m 1.1.2     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.4.0     [32mv[39m [34mforcats[39m 0.5.0

-- [1mConflicts[22m ---------------------------------------------------------------------------------- tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: 'lubridate'


The following objects are masked from 'package:base':

    date, intersect, setdiff, union




In [2]:
# Load the Kumpula June 2016 weather observations. The first 8 lines of the
# file are skipped so parsing starts at the column-header row
# (presumably those lines are station metadata -- confirm against the file).
data <- read_csv(
  file = "../datasets/Kumpula-June-2016-w-metadata.txt",
  skip = 8
)
# Display the parsed table.
data


[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
  YEARMODA = [32mcol_double()[39m,
  TEMP = [32mcol_double()[39m,
  MAX = [32mcol_double()[39m,
  MIN = [32mcol_double()[39m
)




YEARMODA,TEMP,MAX,MIN
<dbl>,<dbl>,<dbl>,<dbl>
20160601,65.5,73.6,54.7
20160602,65.8,80.8,55.0
20160603,68.4,77.9,55.6
20160604,57.5,70.9,47.3
20160605,51.4,58.3,43.2
20160606,52.2,59.7,42.8
20160607,56.9,65.1,45.9
20160608,54.2,60.4,47.5
20160609,49.4,54.1,45.7
20160610,49.5,55.9,43.0


In [3]:
# Parse the numeric YYYYMMDD values in YEARMODA into Date objects.
# `<-` is used for assignment to match the other cells in this notebook.
data$YEARMODA <- ymd(data$YEARMODA)
# Display the table to confirm the column is now a <date>.
data

YEARMODA,TEMP,MAX,MIN
<date>,<dbl>,<dbl>,<dbl>
2016-06-01,65.5,73.6,54.7
2016-06-02,65.8,80.8,55.0
2016-06-03,68.4,77.9,55.6
2016-06-04,57.5,70.9,47.3
2016-06-05,51.4,58.3,43.2
2016-06-06,52.2,59.7,42.8
2016-06-07,56.9,65.1,45.9
2016-06-08,54.2,60.4,47.5
2016-06-09,49.4,54.1,45.7
2016-06-10,49.5,55.9,43.0


## Computing
* Create a new column called "DIFF" and populate it with the difference between MAX and MIN
* Create a new column called "DIFF_Min" and populate it with the difference between TEMP and MIN
* Create a new column called "TEMP_CELCIUS" and populate it with the "TEMP" column converted from °F to °C

In [4]:
# Temperature range: MAX minus MIN.
data[["DIFF"]] <- data[["MAX"]] - data[["MIN"]]
# Difference between TEMP and MIN.
data[["DIFF_Min"]] <- data[["TEMP"]] - data[["MIN"]]
# Fahrenheit-to-Celsius conversion of TEMP: (F - 32) * 5/9.
data[["TEMP_CELCIUS"]] <- (data[["TEMP"]] - 32) * (5 / 9)
# Display the table with the three new columns.
data

YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_Min,TEMP_CELCIUS
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2016-06-01,65.5,73.6,54.7,18.9,10.8,18.611111
2016-06-02,65.8,80.8,55.0,25.8,10.8,18.777778
2016-06-03,68.4,77.9,55.6,22.3,12.8,20.222222
2016-06-04,57.5,70.9,47.3,23.6,10.2,14.166667
2016-06-05,51.4,58.3,43.2,15.1,8.2,10.777778
2016-06-06,52.2,59.7,42.8,16.9,9.4,11.222222
2016-06-07,56.9,65.1,45.9,19.2,11.0,13.833333
2016-06-08,54.2,60.4,47.5,12.9,6.7,12.333333
2016-06-09,49.4,54.1,45.7,8.4,3.7,9.666667
2016-06-10,49.5,55.9,43.0,12.9,6.5,9.722222


### Using indices to select data
Selecting the first 5 rows

In [5]:
# Rows 1 through 5, all columns.
data[seq_len(5), ]

YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_Min,TEMP_CELCIUS
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2016-06-01,65.5,73.6,54.7,18.9,10.8,18.61111
2016-06-02,65.8,80.8,55.0,25.8,10.8,18.77778
2016-06-03,68.4,77.9,55.6,22.3,12.8,20.22222
2016-06-04,57.5,70.9,47.3,23.6,10.2,14.16667
2016-06-05,51.4,58.3,43.2,15.1,8.2,10.77778


Selecting row 8

In [6]:
# Row 8 only, all columns.
data[8L, ]

YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_Min,TEMP_CELCIUS
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2016-06-08,54.2,60.4,47.5,12.9,6.7,12.33333


Selecting the values of column TEMP from rows 6 to 10

In [7]:
# Rows 6 through 10 of the TEMP column.
data[seq(from = 6, to = 10), "TEMP"]

TEMP
<dbl>
52.2
56.9
54.2
49.4
49.5


### Filtering and updating data
Select rows with a TEMP_CELCIUS value greater than 15 degrees

In [8]:
# Keep only the rows whose Celsius temperature exceeds 15 degrees.
w_temp <- filter(data, TEMP_CELCIUS > 15)
w_temp

YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_Min,TEMP_CELCIUS
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2016-06-01,65.5,73.6,54.7,18.9,10.8,18.61111
2016-06-02,65.8,80.8,55.0,25.8,10.8,18.77778
2016-06-03,68.4,77.9,55.6,22.3,12.8,20.22222
2016-06-14,59.7,67.8,47.8,20.0,11.9,15.38889
2016-06-15,63.4,70.3,49.3,21.0,14.1,17.44444
2016-06-17,60.4,70.7,55.9,14.8,4.5,15.77778
2016-06-20,59.3,69.1,52.2,16.9,7.1,15.16667
2016-06-21,62.6,71.4,50.4,21.0,12.2,17.0
2016-06-22,61.7,70.2,55.4,14.8,6.3,16.5
2016-06-23,60.9,67.1,54.9,12.2,6.0,16.05556


Select rows where TEMP_CELCIUS is greater than 15 degrees and the date falls in the second half of the month (after 2016-06-15)

In [9]:
# Keep rows that are both warmer than 15 degrees Celsius and dated after
# 2016-06-15; both conditions must hold for a row to be retained.
w_temp_2 <- filter(
  data,
  TEMP_CELCIUS > 15 & YEARMODA > "2016-06-15"
)
w_temp_2

YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_Min,TEMP_CELCIUS
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2016-06-17,60.4,70.7,55.9,14.8,4.5,15.77778
2016-06-20,59.3,69.1,52.2,16.9,7.1,15.16667
2016-06-21,62.6,71.4,50.4,21.0,12.2,17.0
2016-06-22,61.7,70.2,55.4,14.8,6.3,16.5
2016-06-23,60.9,67.1,54.9,12.2,6.0,16.05556
2016-06-24,61.1,68.9,56.7,12.2,4.4,16.16667
2016-06-25,65.7,75.4,57.9,17.5,7.8,18.72222
2016-06-26,69.6,77.7,60.3,17.4,9.3,20.88889
2016-06-27,60.7,70.0,57.6,12.4,3.1,15.94444
2016-06-28,65.4,73.0,55.8,17.2,9.6,18.55556


### Sorting Data
Sort the dataframe in ascending order by a column, for example 'TEMP'

In [10]:
# Order the rows by TEMP, lowest first.
sorted_temp_df <- arrange(data, TEMP)
sorted_temp_df

YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_Min,TEMP_CELCIUS
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2016-06-09,49.4,54.1,45.7,8.4,3.7,9.666667
2016-06-10,49.5,55.9,43.0,12.9,6.5,9.722222
2016-06-05,51.4,58.3,43.2,15.1,8.2,10.777778
2016-06-06,52.2,59.7,42.8,16.9,9.4,11.222222
2016-06-11,54.0,62.1,41.7,20.4,12.3,12.222222
2016-06-08,54.2,60.4,47.5,12.9,6.7,12.333333
2016-06-12,55.4,64.2,46.0,18.2,9.4,13.0
2016-06-19,56.3,59.2,54.1,5.1,2.2,13.5
2016-06-07,56.9,65.1,45.9,19.2,11.0,13.833333
2016-06-18,57.3,62.8,54.0,8.8,3.3,14.055556


Descending order

In [11]:
# Order the rows by TEMP, highest first.
sorted_temp_df_2 <- arrange(data, desc(TEMP))
sorted_temp_df_2

YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_Min,TEMP_CELCIUS
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2016-06-26,69.6,77.7,60.3,17.4,9.3,20.888889
2016-06-03,68.4,77.9,55.6,22.3,12.8,20.222222
2016-06-02,65.8,80.8,55.0,25.8,10.8,18.777778
2016-06-29,65.8,73.2,59.7,13.5,6.1,18.777778
2016-06-25,65.7,75.4,57.9,17.5,7.8,18.722222
2016-06-30,65.7,72.7,59.2,13.5,6.5,18.722222
2016-06-01,65.5,73.6,54.7,18.9,10.8,18.611111
2016-06-28,65.4,73.0,55.8,17.2,9.6,18.555556
2016-06-15,63.4,70.3,49.3,21.0,14.1,17.444444
2016-06-21,62.6,71.4,50.4,21.0,12.2,17.0


Sorting the dataframe using two or more columns

In [12]:
# Sort by TEMP in descending order, breaking ties by DIFF (also descending).
# desc() accepts a single vector, so each sort key needs its own desc() call;
# desc(TEMP, DIFF) either silently ignores DIFF or raises an error, depending
# on the dplyr version.
sorted_temp_df_3 <- data %>%
  arrange(desc(TEMP), desc(DIFF))
sorted_temp_df_3

YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_Min,TEMP_CELCIUS
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2016-06-26,69.6,77.7,60.3,17.4,9.3,20.888889
2016-06-03,68.4,77.9,55.6,22.3,12.8,20.222222
2016-06-02,65.8,80.8,55.0,25.8,10.8,18.777778
2016-06-29,65.8,73.2,59.7,13.5,6.1,18.777778
2016-06-25,65.7,75.4,57.9,17.5,7.8,18.722222
2016-06-30,65.7,72.7,59.2,13.5,6.5,18.722222
2016-06-01,65.5,73.6,54.7,18.9,10.8,18.611111
2016-06-28,65.4,73.0,55.8,17.2,9.6,18.555556
2016-06-15,63.4,70.3,49.3,21.0,14.1,17.444444
2016-06-21,62.6,71.4,50.4,21.0,12.2,17.0


### Rounding and finding unique values

Create a new column with the temperature in Celsius rounded to the nearest integer

In [13]:
# Add CELSIUS_ROUNDED: TEMP_CELCIUS rounded to zero decimal places and stored
# as an integer column.
data <- mutate(
  data,
  CELSIUS_ROUNDED = as.integer(round(TEMP_CELCIUS, digits = 0))
)
data

YEARMODA,TEMP,MAX,MIN,DIFF,DIFF_Min,TEMP_CELCIUS,CELSIUS_ROUNDED
<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
2016-06-01,65.5,73.6,54.7,18.9,10.8,18.611111,19
2016-06-02,65.8,80.8,55.0,25.8,10.8,18.777778,19
2016-06-03,68.4,77.9,55.6,22.3,12.8,20.222222,20
2016-06-04,57.5,70.9,47.3,23.6,10.2,14.166667,14
2016-06-05,51.4,58.3,43.2,15.1,8.2,10.777778,11
2016-06-06,52.2,59.7,42.8,16.9,9.4,11.222222,11
2016-06-07,56.9,65.1,45.9,19.2,11.0,13.833333,14
2016-06-08,54.2,60.4,47.5,12.9,6.7,12.333333,12
2016-06-09,49.4,54.1,45.7,8.4,3.7,9.666667,10
2016-06-10,49.5,55.9,43.0,12.9,6.5,9.722222,10


In [14]:
# Collect the distinct rounded Celsius values.
# NOTE(review): the result is stored under the name `unique`, the same name as
# the base function. Calls to unique() still resolve to the function, but a
# more descriptive variable name would be clearer -- confirm nothing else
# depends on this name before renaming.
unique <- unique(data$CELSIUS_ROUNDED)
unique