# IMDB

# **IMDB project (Web Scraping)**
Final result: The dataframe which has 3 columns which are Name, Year, and Rating

In [71]:
# load library
# scrape data from website

library(tidyverse)
library(rvest) 

In [72]:
# define the URL for scraping data to object and read html from it

url <- "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"

In [73]:
# read html
imdb <- read_html(url)

In [74]:
imdb

{html_document}
<html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
[1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
[2] <body id="styleguide-v2" class="fixed">\n            <img height="1" widt ...

In [75]:
# scraping the movie title from "h3.lister-item-header" node to text 
imdb %>%
    html_node("h3.lister-item-header") %>%
    html_text()

In [76]:
# clean data for the movie year column
# use "html_text2()" for remove special charactor
movie_titles <- imdb %>%
    html_nodes("h3.lister-item-header") %>%
    html_text2() %>%
    str_remove("[0-9]{4}") %>%                 # remove the 4 numbers or year
    str_remove_all("\\(") %>%                  # remove a left parenthesis
    str_remove_all("\\)")                      # remove a right parenthesis

In [77]:
movie_titles[1:10]

In [78]:
# scraping the year from "h3.lister-item-header" node to text
# use "html_text2()" for remove special charactor
movie_year <- imdb %>%
    html_nodes("h3.lister-item-header") %>%
    html_text2() %>%
    str_sub(-6) %>%                            # keep only 6 last character
    str_remove_all("\\(") %>%                  # remove a left parenthesis
    str_remove_all("\\)")                      # remove a right parenthesis

In [79]:
movie_year %>% head()

In [80]:
# scraping the rating from "h3.lister-item-header" node to text
# use "html_text2()" for remove special charactor
# convert the movie rating column from character to numeric

movie_ratings <- imdb %>%
    html_nodes("div.ratings-imdb-rating") %>%
    html_text2() %>%
    as.numeric()

In [81]:
movie_ratings[1:10]

In [82]:
# number of vote
num_votes <- imdb %>%
    html_nodes("p.sort-num_votes-visible") %>%
    html_text2()
    

In [83]:
num_votes %>% tail(1)

In [84]:
# build the data set
# combine 3 objects to be dataframe
df <- data.frame(
    title = movie_titles,
    year = movie_year,
    rating = movie_ratings,
    num_vote = num_votes
)

df %>% head()
df %>% tail()

Unnamed: 0_level_0,title,year,rating,num_vote
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
1,1. The Shawshank Redemption,1994,9.3,"Votes: 2,673,995 | Gross: $28.34M | Top 250: #1"
2,2. The Godfather,1972,9.2,"Votes: 1,853,314 | Gross: $134.97M | Top 250: #2"
3,3. The Dark Knight,2008,9.0,"Votes: 2,646,988 | Gross: $534.86M | Top 250: #3"
4,4. The Lord of the Rings: The Return of the King,2003,9.0,"Votes: 1,842,777 | Gross: $377.85M | Top 250: #7"
5,5. Schindler's List,1993,9.0,"Votes: 1,353,643 | Gross: $96.90M | Top 250: #6"
6,6. The Godfather Part II,1974,9.0,"Votes: 1,268,909 | Gross: $57.30M | Top 250: #4"


Unnamed: 0_level_0,title,year,rating,num_vote
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<chr>
45,45. Back to the Future,1985,8.5,"Votes: 1,203,542 | Gross: $210.61M | Top 250: #30"
46,46. Apocalypse Now,1979,8.5,"Votes: 667,985 | Gross: $83.47M | Top 250: #53"
47,47. Alien,1979,8.5,"Votes: 882,551 | Gross: $78.90M | Top 250: #51"
48,48. Once Upon a Time in the West,1968,8.5,"Votes: 330,500 | Gross: $5.32M | Top 250: #48"
49,49. Psycho,1960,8.5,"Votes: 672,471 | Gross: $32.00M | Top 250: #32"
50,50. Rear Window,1954,8.5,"Votes: 492,529 | Gross: $36.76M | Top 250: #49"


# SpecPhone

# **mini-project 02 - Specphone Database**

In [1]:
library(tidyverse)
library(rvest)

“running command 'timedatectl' had status 1”
“Failed to locate timezone database”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.5     [32m✔[39m [34mdplyr  [39m 1.0.7
[32m✔[39m [34mtidyr  [39m 1.1.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.0.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m  masks [34mstats[39m::filter()
[31m✖[39m [34mpurrr[39m::[32mflatten()[39m masks [34mjsonlite[39m::flatten()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m     masks [34mstats[39m::lag()


Attaching package: ‘rvest’


The following object is masked from ‘package:readr’:

    guess_encoding




In [2]:
url <- read_html("https://specphone.com/Samsung-Galaxy-A04.html")

In [3]:
att <- url %>%
    html_nodes("div.topic") %>%
    html_text2()

value <- url %>%
    html_nodes("div.detail") %>%
    html_text2()

In [4]:
data.frame(attribute = att, value = value)

attribute,value
<chr>,<chr>
วันเปิดตัว,ตุลาคม 2565
วันวางจำหน่าย,ยังไม่วางจำหน่าย
ขนาด,164.40 x 76.30 x 9.10 มม.
น้ำหนัก,192 กรัม
วัสดุ,"Glass front, plastic back, plastic frame"
SIM,"รองรับ 2 ซิมการ์ด (nano sim, nano sim)"
Technology,"HSPA 42.2/5.76 Mbps, LTE-A"
2G,850/900/1800/1900
3G,850/900/1900/2100
4G,850/900/1900/2100/2600


In [5]:
# All Sumsung amartphone
samsung_url <- read_html("https://specphone.com/brand/Samsung")

In [6]:
 #Links to all Samsung smartphone
links <- samsung_url %>%
    html_nodes("li.mobile-brand-item a") %>%
    html_attr("href")

In [7]:
links

In [8]:
full_links <- paste0("https://specphone.com", links)

In [9]:
full_links[1:10]

In [13]:
result <- data.frame()

for (link in full_links[1:10]) {
    ss_topic <- link %>%
        read_html() %>%
        html_nodes("div.topic") %>%
        html_text2()
    ss_detail <- link %>%
        read_html() %>%
        html_nodes("div.detail") %>%
        html_text2()

    tmp <- data.frame(attribute = ss_topic,
                        value = ss_detail)

    result <- bind_rows(result, tmp)                    
    print("progress...")
}

print(result)

[1] "progress..."
[1] "progress..."
[1] "progress..."
[1] "progress..."
[1] "progress..."
[1] "progress..."
[1] "progress..."
[1] "progress..."
[1] "progress..."
[1] "progress..."
         attribute
1          วันเปิดตัว
2      วันวางจำหน่าย
3             ขนาด
4            น้ำหนัก
5              วัสดุ
6              SIM
7       Technology
8               2G
9               3G
10              4G
11              5G
12         ความเร็ว
13          ประเภท
14       ขนาดหน้าจอ
15      ความละเอียด
16     ระบบปฏิบัติการ
17      ชิปประมวลผล
18         ชิปกราฟิก
19      หน่วยความจำ
20           ความจุ
21     Memory Card
22         กล้องหลัก
23  ความละเอียดวีดีโอ
24         กล้องหน้า
25       Bluetooth
26           Wi-Fi
27             USB
28             GPS
29             NFC
30           ความจุ
31          ประเภท
32   Fast Charging
33         วันเปิดตัว
34     วันวางจำหน่าย
35            ขนาด
36           น้ำหนัก
37             วัสดุ
38             SIM
39      Technology
40              2G
41  

In [14]:
print(head(result),3)

    attribute                                    value
1     วันเปิดตัว                              มิถุนายน 2565
2 วันวางจำหน่าย                            ยังไม่วางจำหน่าย
3        ขนาด                165.40 x 76.90 x 8.40 มม.
4       น้ำหนัก                                  192 กรัม
5         วัสดุ Glass front, plastic back, plastic frame
6         SIM      รองรับ 2 ซิมการ์ด (nano sim, nano sim)


In [11]:
# write csv
write_csv(result, "result_samsung_phone.csv")