# task #1

In [34]:
library(httr)
library(xml2)
library(rvest)

In [2]:
get_wiki_covid19_page <- function() {
  wiki_base_url <<- "https://en.wikipedia.org/w/index.php"

  body <- list(title ="Template:COVID-19_testing_by_country")
  response <<- GET(wiki_base_url,query=body)  
  return(response)
}

In [3]:
get_wiki_covid19_page ()

Response [https://en.wikipedia.org/w/index.php?title=Template%3ACOVID-19_testing_by_country]
  Date: 2022-06-13 12:06
  Status: 200
  Content-Type: text/html; charset=UTF-8
  Size: 413 kB
<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Template:COVID-19 testing by country - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames...
"CS1 German-language sources (de)","CS1 Azerbaijani-language sources (az)","C...
"CS1 uses Japanese-language script (ja)","CS1 Japanese-language sources (ja)"...
"COVID-19 pandemic templates"],"wgPageContentLanguage":"en","wgPageContentMod...
"Q87325019","GEHomepageSuggestedEditsEnableTopics":true,"wgGETopicsMatchModeE...
...

# task #2

In [4]:
root_node <- read_html(get_wiki_covid19_page())
root_node

{xml_document}
<html class="client-nojs" lang="en" dir="ltr">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body class="mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-10 ns-subject ...

In [5]:
url <- "https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country"
download.file (url, destfile = 'covid_19.html')

In [6]:
table_node <-html_node(root_node,"table") 

In [7]:
table_node

{xml_node}
<table class="wikitable plainrowheaders sortable collapsible autocollapse">
[1] <caption>\n<style data-mw-deduplicate="TemplateStyles:r1063604349">.mw-pa ...
[2] <tbody>\n<tr>\n<th>Country or region\n</th>\n<th>Date<sup id="cite_ref-1" ...

In [8]:
dataFrame_table <- html_table(table_node)

In [9]:
head(dataFrame_table) 

Country or region,Date[a],Tested,Units[b],Confirmed(cases),"Confirmed /tested,%","Tested /population,%","Confirmed /population,%",Ref.
Afghanistan,17 Dec 2020,154767,samples,49621,32.1,0.4,0.13,[1]
Albania,18 Feb 2021,428654,samples,96838,22.6,15.0,3.4,[2]
Algeria,2 Nov 2020,230553,samples,58574,25.4,0.53,0.13,[3][4]
Andorra,23 Feb 2022,300307,samples,37958,12.6,387.0,49.0,[5]
Angola,2 Feb 2021,399228,samples,20981,5.3,1.3,0.067,[6]
Antigua and Barbuda,6 Mar 2021,15268,samples,832,5.4,15.9,0.86,[7]


# task#3

In [10]:
summary(dataFrame_table)

 Country or region    Date[a]             Tested            Units[b]        
 Length:173         Length:173         Length:173         Length:173        
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
 Confirmed(cases)   Confirmed /tested,% Tested /population,%
 Length:173         Length:173          Length:173          
 Class :character   Class :character    Class :character    
 Mode  :character   Mode  :character    Mode  :character    
 Confirmed /population,%     Ref.          
 Length:173              Length:173        
 Class :character        Class :character  
 Mode  :character        Mode  :character  

In [11]:
dim(dataFrame_table)

In [12]:
preprocess_covid_data_frame <- function(data_frame) {
    
    shape <- dim(data_frame)

    # Remove the World row
    data_frame<-data_frame[!(data_frame$`Country or region`=="World"),]
    # Remove the last row
    data_frame <- data_frame[1:172, ]
    
    # We dont need the Units and Ref columns, so can be removed
    data_frame["Ref."] <- NULL
    data_frame["Units[b]"] <- NULL
    
    # Renaming the columns
    names(data_frame) <- c("country", "date", "tested", "confirmed", "confirmed.tested.ratio", "tested.population.ratio", "confirmed.population.ratio")
    
    # Convert column data types
    data_frame$country <- as.factor(data_frame$country)
    data_frame$date <- as.factor(data_frame$date)
    data_frame$tested <- as.numeric(gsub(",","",data_frame$tested))
    data_frame$confirmed <- as.numeric(gsub(",","",data_frame$confirmed))
    data_frame$'confirmed.tested.ratio' <- as.numeric(gsub(",","",data_frame$`confirmed.tested.ratio`))
    data_frame$'tested.population.ratio' <- as.numeric(gsub(",","",data_frame$`tested.population.ratio`))
    data_frame$'confirmed.population.ratio' <- as.numeric(gsub(",","",data_frame$`confirmed.population.ratio`))
    
    return(data_frame)
}

In [13]:
cleaned_data <- preprocess_covid_data_frame(dataFrame_table)
head(cleaned_data)

country,date,tested,confirmed,confirmed.tested.ratio,tested.population.ratio,confirmed.population.ratio
Afghanistan,17 Dec 2020,154767,49621,32.1,0.4,0.13
Albania,18 Feb 2021,428654,96838,22.6,15.0,3.4
Algeria,2 Nov 2020,230553,58574,25.4,0.53,0.13
Andorra,23 Feb 2022,300307,37958,12.6,387.0,49.0
Angola,2 Feb 2021,399228,20981,5.3,1.3,0.067
Antigua and Barbuda,6 Mar 2021,15268,832,5.4,15.9,0.86


In [14]:
summary(cleaned_data)

                country             date         tested         
 Afghanistan        :  1   7 Jun 2022 : 11   Min.   :     3880  
 Albania            :  1   6 Jun 2022 :  9   1st Qu.:   512037  
 Algeria            :  1   9 Jun 2022 :  8   Median :  3029859  
 Andorra            :  1   10 Jun 2022:  4   Mean   : 26007778  
 Angola             :  1   1 Mar 2021 :  3   3rd Qu.: 11751774  
 Antigua and Barbuda:  1   12 Jun 2022:  3   Max.   :553621766  
 (Other)            :166   (Other)    :134                      
   confirmed        confirmed.tested.ratio tested.population.ratio
 Min.   :       0   Min.   : 0.00          Min.   :   0.0065      
 1st Qu.:   37000   1st Qu.: 5.00          1st Qu.:   8.5000      
 Median :  281196   Median : 9.70          Median :  44.1500      
 Mean   : 1955123   Mean   :10.95          Mean   : 163.7438      
 3rd Qu.: 1155312   3rd Qu.:15.07          3rd Qu.: 141.0000      
 Max.   :35940893   Max.   :42.80          Max.   :2871.0000      
           

In [15]:

filePath <- "/home/mohammed/Documents/cv/covid_19.csv"
dataFile <- write.csv(cleaned_data,file =filePath,row.names = FALSE)

# task #4

In [17]:
covid_data_frame_csv <- read.csv("covid_19.csv", header=TRUE, sep=",")

In [18]:
covid_data_frame_csv[5:10,c("country", "confirmed")]

Unnamed: 0,country,confirmed
5,Angola,20981
6,Antigua and Barbuda,832
7,Argentina,9060495
8,Armenia,422963
9,Australia,7568100
10,Austria,4327840


# task #5

In [19]:
# Get the total confirmed cases worldwide
total_confirmed <- sum(covid_data_frame_csv["confirmed"])
# Get the total tested cases worldwide
total_tested <- sum(covid_data_frame_csv["tested"])
# Get the positive ratio (confirmed / tested)
positive_ratio_confirmed_tested <- total_confirmed/total_tested
positive_ratio_confirmed_tested

# task #6

In [20]:
country_column <- (covid_data_frame_csv[,1])
# Check its class (should be Factor)
class(country_column)
# Conver the country column into character so that you can easily sort them
country_column <- as.character(country_column)
class(country_column)
# Sort the countries AtoZ
country_column_dec <- sort(country_column, decreasing =FALSE)
# Sort the countries ZtoA
country_column_inc <- sort(country_column, decreasing =TRUE)
# Print the sorted ZtoA list
head(country_column_inc)

# task #7

In [21]:
matches <- regexpr("United.+",country_column)
United_extention <- regmatches(country_column,matches)

In [22]:
United_extention

# task #8

In [23]:
##task #8
#find the index of two country {row}
which(covid_data_frame_csv["country"]=="United Kingdom")
which(covid_data_frame_csv["country"]=="United States")

In [24]:
covid_data_frame_csv[165:166,]

Unnamed: 0,country,date,tested,confirmed,confirmed.tested.ratio,tested.population.ratio,confirmed.population.ratio
165,United Kingdom,19 May 2022,522526476,22232377,4.3,774,32.9
166,United States,9 Aug 2021,512152348,35940893,7.0,155,10.9


# task #9

In [25]:
uni_kng_index<-which(covid_data_frame_csv["country"]=="United Kingdom")
uni_state_index<-which(covid_data_frame_csv["country"]=="United States")
col <-grep("confirmed.population.ratio", colnames(covid_data_frame_csv))

In [26]:

if (covid_data_frame_csv[uni_kng_index,col]>covid_data_frame_csv[uni_state_index,col]) 
{
   print("United Kingdom has a high confirmed.population.ratio")
} else {
   print("United states has a high confirmed.population.ratio")
}


[1] "United Kingdom has a high confirmed.population.ratio"


# task #10

In [27]:
# change the name of the covid_data_frame_csv to DataFrame
data <- cbind(covid_data_frame_csv)

In [80]:
#the count of countries confirmed to population ratio rate less than a threshold
threshold <- 1
count(data %>% filter(confirmed.population.ratio < threshold )%>% 
select(country))
data %>% filter(confirmed.population.ratio < 1 )%>% 
select(country)

n
53


country
Afghanistan
Algeria
Angola
Antigua and Barbuda
Bangladesh
Benin
Brunei
Burkina Faso
Burundi
Cambodia


In [94]:
sapply(data,typeof)

In [127]:
#to obtain one value or more
summarize(filter(data, country == c("United Kingdom","United States"))
          ,tested.population.ratio,tested)


ERROR: Error: Column `tested.population.ratio` must be length 1 (a summary value), not 2


In [125]:
#to obtain one raw
filter(data, country == "United Kingdom")
                  

country,date,tested,confirmed,confirmed.tested.ratio,tested.population.ratio,confirmed.population.ratio
United Kingdom,19 May 2022,522526476,22232377,4.3,774,32.9


In [132]:
filter(data, country == c("United Kingdom","United States"))

country,date,tested,confirmed,confirmed.tested.ratio,tested.population.ratio,confirmed.population.ratio
United Kingdom,19 May 2022,522526476,22232377,4.3,774,32.9
United States,9 Aug 2021,512152348,35940893,7.0,155,10.9


In [None]:
slice(data[1:2,2:3])

date,tested
17 Dec 2020,154767
18 Feb 2021,428654
