In [None]:
from bs4 import BeautifulSoup  ## the BeautifulSoup library for scraping from the bs4 package
import requests ## Establish website connection using the requests library
import pandas as pd
import numpy as np
import re ## RegEx for pattern matching
from selenium import webdriver
!pip install webdriver_manager
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Setting up chrome web driver for collab

In [49]:
%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300

Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF



Executing: /tmp/apt-key-gpghome.HP9RY8jjuB/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" not changed
gpg: Total number processed: 1
gpg:              unchanged: 1
Executing: /tmp/apt-key-gpghome.GIVNAKFnKE/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" not changed
gpg: Total number processed: 1
gpg:              unchanged: 1
Executing: /tmp/apt-key-gpghome.VjqnBvPZfx/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" not changed
gpg: Total number processed: 1
gpg:              unchanged: 1
gpg: cannot open '/dev/tty': No such device or address
gpg: [stdout]: write error: Broken pipe
gpg: filter_flush failed on c



# Functions for scraping

In [29]:
def name_getter(master_soup):

  df_name = pd.DataFrame(columns=['player_name','link'], dtype=str)

  for y in master_soup.findAll('span', class_="name")[:10000]:
    name = y.a.text
    link = 'https://fminside.net' + y.a.get('href')
    row = {'player_name':name,'link':link}
    df_name = df_name.append(row,ignore_index= True)
  return df_name


def player_info(soup_temp):
  info = []
  info_names = ['club','country','age','position','foot','height','weight','player_id','wages','contract_end']

  for y in soup_temp.findAll('span',class_= "value"):
    info.append(y.text)
  row = {info_names[0]:info[1], info_names[1]:info[2], info_names[2]:info[4], info_names[3]:info[5], info_names[4]:info[6], info_names[5]:info[7],
         info_names[6]:info[8], info_names[7]:info[10], info_names[8]:info[12], info_names[9]:info[13]}
  return row


def player_stats(soup_temp):
  numbers = []
  columns_names = ['Corners', 'Crossing', 'Dribbling', 'Finishing', 'First Touch', 'Free Kick Taking', 'Heading', 'Long Shots', 'Long Throws', 'Marking', 'Passing', 'Penalty Taking', 'Tackling', 'Technique',
                  'Aggression', 'Anticipation', 'Bravery', 'Composure', 'Concentration', 'Decisions', 'Determination', 'Flair', 'Leadership', 'Off the Ball', 'Positioning', 'Teamwork', 'Vision', 'Work Rate', 'Acceleration',
                  'Agility', 'Balance', 'Jumping Reach', 'Natural Fitness', 'Pace', 'Stamina', 'Strength']

  for y in soup_temp.findAll('td'):
    text = y.text
    number = re.findall(r'\d+', text)
    if number:
      numbers.append(number[0])
      if len(numbers) == 36:
        row = {columns_names[i]: int(numbers[i]) for i in range(36)}
  return row

# Loop to run button presser and scraper

In [47]:
def master():

  #
  chrome_options = webdriver.ChromeOptions()
  chrome_options.add_argument('--headless')
  chrome_options.add_argument('--no-sandbox')
  chrome_options.headless = True
  wd = webdriver.Chrome('chromedriver',options=chrome_options)

  site = 'https://fminside.net/players'

  wd.get(site)
  counter = 0

  for counter in range(201):

    wait = WebDriverWait(wd, 30).until(EC.element_to_be_clickable(('xpath', "//a[contains(@class,'loadmore')]")))

    button = wd.find_element('xpath', "//a[contains(@class,'loadmore')]")

    wd.execute_script("arguments[0].click();", button)

    counter += 1
    print(counter)

  master_soup = BeautifulSoup(wd.page_source, 'html.parser')

  df_name = name_getter(master_soup)

  info_names = ['club','country','age','position','foot','height','weight','player_id','wages','contract_end']
  df_info = pd.DataFrame(columns=info_names, dtype=str)

  columns_names = ['Corners', 'Crossing', 'Dribbling', 'Finishing', 'First Touch', 'Free Kick Taking', 'Heading', 'Long Shots', 'Long Throws', 'Marking', 'Passing', 'Penalty Taking', 'Tackling', 'Technique',
                  'Aggression', 'Anticipation', 'Bravery', 'Composure', 'Concentration', 'Decisions', 'Determination', 'Flair', 'Leadership', 'Off the Ball', 'Positioning', 'Teamwork', 'Vision', 'Work Rate', 'Acceleration',
                  'Agility', 'Balance', 'Jumping Reach', 'Natural Fitness', 'Pace', 'Stamina', 'Strength']
  df_stats = pd.DataFrame(columns=columns_names, dtype=int)


  for x in df_name['link'][:10000]:

    site_temp = x
    request_temp = requests.get(site_temp)
    soup_temp = BeautifulSoup(request_temp.text, 'html.parser')

    row = player_info(soup_temp)
    df_info = df_info.append(row, ignore_index=True)

    row = player_stats(soup_temp)
    df_stats = df_stats.append(row, ignore_index=True)

  df = pd.concat([df_name, df_info, df_stats], axis=1)

  return df

In [50]:
#run and output to csv
df = master()
df.to_csv('fm_player_data_3.csv')

  chrome_options.headless = True


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  df_info = df_info.append(row, ignore_index=True)
  df_stats = df_stats.append(row, ignore_index=True)
  df_info = df_info.append(row, ignore_index=True)
  df_stats = df_stats.append(row, ignore_index=True)
  df_info = df_info.append(row, ignore_index=True)
  df_stats = df_stats.append(row, ignore_index=True)
  df_info = df_info.append(row, ignore_index=True)
  df_stats = df_stats.append(row, ignore_index=True)
  df_info = df_info.append(row, ignore_index=True)
  df_stats = df_stats.append(row, ignore_index=True)
  df_info = df_info.append(row, ignore_index=True)
  df_stats = df_stats.append(row, ignore_index=True)
  df_info = df_info.append(row, ignore_index=True)
  df_stats = df_stats.append(row, ignore_index=True)
  df_info = df_info.append(row, ignore_index=True)
  df_stats = df_stats.append(row, ignore_index=True)
  df_info = df_info.append(row, ignore_index=True)
  df_stats = df_stats.append(row, ignore_index=True)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 48 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   player_name       10000 non-null  object
 1   link              10000 non-null  object
 2   club              10000 non-null  object
 3   country           10000 non-null  object
 4   age               10000 non-null  object
 5   position          10000 non-null  object
 6   foot              10000 non-null  object
 7   height            10000 non-null  object
 8   weight            10000 non-null  object
 9   player_id         10000 non-null  object
 10  wages             10000 non-null  object
 11  contract_end      10000 non-null  object
 12  Corners           10000 non-null  int64 
 13  Crossing          10000 non-null  int64 
 14  Dribbling         10000 non-null  int64 
 15  Finishing         10000 non-null  int64 
 16  First Touch       10000 non-null  int64 
 17  Free Kick Tak