# FIFA DataSet Analysis


* Importing Libraries

In [430]:

#Numpy
import numpy as np

# Pandas
import pandas as pd

# BeautifulSoup
from bs4 import BeautifulSoup

# Requests
import requests

# Reges
import regex as re 

# sqlite
import sqlite3 as sql

* Web Scrapping


In [394]:
req=requests.get("https://sofifa.com/players?offset=1")
req

<Response [200]>

* BeautifulSoup

In [395]:
soup=BeautifulSoup(req.content)

In [396]:
# Adding the attributes to the array
arr=[]
for i in soup.findAll('td'):
    arr.append(str(i))

* Cleaning Image URL

In [397]:
clean_img=re.sub('<td.*\n.*data-src=.|".*',"",arr[0])
clean_img

'https://cdn.sofifa.net/players/253/537/22_60.png'

In [398]:
# The row is from 0-8
arr[0]

'<td class="col-avatar" data-balloon="Click here!" data-balloon-pos="up" data-balloon-visible=""><figure class="avatar">\n<img alt="" class="player-check" data-root="https://cdn.sofifa.net/players/" data-src="https://cdn.sofifa.net/players/253/537/22_60.png" data-srcset="https://cdn.sofifa.net/players/253/537/22_120.png 2x, https://cdn.sofifa.net/players/253/537/22_180.png 3x" data-type="player" id="253537" src="https://cdn.sofifa.net/players/notfound_0_60.png"/></figure></td>'

* Cleaning Name

In [399]:
clean_name=re.sub('<td.*\n.*s">|<\/div.*>',"",arr[1])
clean_name

'B. Castillo'

* Cleaning Age

In [400]:
clean_age=re.sub('<td.*e">|<.*>',"",arr[11])
clean_age

'33'

* Cleaning OVA

In [401]:
clean_ova=re.sub('<td.*p.*">|<.*>',"",arr[3])
clean_ova

'76'

* Cleaning POT

In [402]:
clean_pot=re.sub('<td.*p.*">|<.*>',"",arr[4])
clean_pot

'81'

* Cleaning Team & Contract

In [403]:
clean_team=re.sub('<td.*\n<d.*\n<i.*\n</f.*\n.*/">|<.*\n.*>\n.*>\n.*>',"",arr[5])
clean_team

'Barcelona Sporting Club'

* Cleaning Value

In [404]:
clean_value=re.sub('<td.*">|<.*>',"",arr[6])
clean_value

'€11M'

* Cleaning Wage

In [405]:
clean_wage=re.sub('<td.*">|<.*>',"",arr[7])
clean_wage

'€1K'

* Cleaning Total

In [406]:
clean_total=re.sub('<td.*">|<.*>',"",arr[8])
clean_total

'1898'

* Looping through the data

In [424]:
#0-8
img=[]
name=[]
age=[]
ova=[]
pot=[]
team=[]
value=[]
wage=[]
total=[]

count=0
for j in range(1,20):
    url=f"https://sofifa.com/players?offset={j}"
    req=requests.get(url)
    soup=BeautifulSoup(req.content)
    
    for i in soup.findAll('td'):
        clean_img=re.sub('<td.*\n.*data-src=.|".*',"",str(i))
        clean_name=re.sub('<td.*\n.*s">|<\/div.*>',"",str(i))
        clean_age=re.sub('<td.*e">|<.*>',"",str(i))
        clean_ova=re.sub('<td.*p.*">|<.*>',"",str(i))
        clean_pot=re.sub('<td.*p.*">|<.*>',"",str(i))
        clean_team=re.sub('<td.*\n<d.*\n<i.*\n</f.*\n.*/">|<.*\n.*>\n.*>\n.*>',"",str(i))
        clean_value=re.sub('<td.*">.|.<.*>',"",str(i))
        clean_wage=re.sub('<td.*">.|.<.*>',"",str(i))
        clean_total=re.sub('<td.*">|<.*>',"",str(i))
        
        if count==0:
            img.append(clean_img)
            count+=1
        elif count==1:
            name.append(clean_name)
            count+=1
        elif count==2:
            age.append(clean_age)
            count+=1
        elif count==3:
            ova.append(clean_ova)
            count+=1
        elif count==4:
            pot.append(clean_pot)
            count+=1
        elif count==5:
            team.append(clean_team)
            count+=1
        elif count==6:
            value.append(clean_value)
            count+=1
        elif count==7:
            wage.append(clean_wage)
            count+=1
        else:
            count=0
            total.append(clean_total)
        


* Type casting

In [408]:
fifa=pd.DataFrame({'Image':img,'Name':name,'Age':age,'OVA':ova,'POT':pot,'Team':team,'Value (M$)':value,'Wage (K$)':wage,'Total':total})
fifa['Age']=pd.to_numeric(fifa['Age'])
fifa['OVA']=pd.to_numeric(fifa['OVA'])
fifa['POT']=pd.to_numeric(fifa['POT'])
fifa['Value (M$)']=pd.to_numeric(fifa['Value (M$)'])
fifa['Wage (K$)']=pd.to_numeric(fifa['Wage (K$)'])
fifa['Total']=pd.to_numeric(fifa['Total'])

In [409]:
# Getting the basic info for the DF
fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1140 entries, 0 to 1139
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Image       1140 non-null   object 
 1   Name        1140 non-null   object 
 2   Age         1140 non-null   int64  
 3   OVA         1140 non-null   int64  
 4   POT         1140 non-null   int64  
 5   Team        1140 non-null   object 
 6   Value (M$)  1140 non-null   float64
 7   Wage (K$)   1140 non-null   int64  
 8   Total       1140 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 80.3+ KB


In [410]:
# Shape of the DF
fifa.shape

(1140, 9)

*Droping the Image url column from the DF

In [411]:
# Remmoving the Image column
fifa.drop(columns=["Image"],inplace=True,axis=1)

* 1st 20 datapoints

In [412]:
fifa.head(20)

Unnamed: 0,Name,Age,OVA,POT,Team,Value (M$),Wage (K$),Total
0,B. Castillo,22,76,81,Barcelona Sporting Club,11.0,1,1898
1,K. Benzema,33,91,91,Real Madrid CF,84.0,450,2145
2,Deulofeu,27,79,79,Udinese Calcio,19.0,24,1938
3,N. Molina,23,76,82,Udinese Calcio,11.0,14,1943
4,D. Núñez,22,78,86,SL Benfica,32.0,15,1987
5,Vinícius Jr.,20,84,91,Real Madrid CF,83.5,165,1970
6,D. Vlahović,21,83,90,Juventus,74.5,85,1874
7,Gavi,16,77,89,FC Barcelona,24.0,14,1970
8,J. Timber,20,79,88,Ajax,36.0,13,1935
9,A. Tchouaméni,21,81,88,AS Monaco,52.0,46,2141


* Last 20 Datapoints

In [413]:
fifa.tail(20)

Unnamed: 0,Name,Age,OVA,POT,Team,Value (M$),Wage (K$),Total
1120,J. Musiala,18,79,89,FC Bayern München,37.0,26,1911
1121,D. Rice,22,83,88,West Ham United,51.0,76,2072
1122,J. Foyth,23,77,82,Villarreal CF,15.0,25,1913
1123,S. Tonali,21,82,90,AC Milan,64.5,35,2205
1124,E. Camavinga,18,79,89,Real Madrid CF,37.0,48,2109
1125,Diogo Jota,24,85,89,Liverpool,78.0,155,2188
1126,E. Smith Rowe,20,80,87,Arsenal,42.0,65,1835
1127,J. Koundé,22,83,89,Sevilla FC,53.0,33,1916
1128,Vitinha,21,77,87,FC Porto,23.5,11,1992
1129,Alberto Moreno,28,77,77,Villarreal CF,9.5,30,2062


* Statistical Data

In [414]:
fifa.describe()

Unnamed: 0,Age,OVA,POT,Value (M$),Wage (K$),Total
count,1140.0,1140.0,1140.0,1140.0,1140.0,1140.0
mean,22.026316,80.268421,87.641228,47.891842,77.570175,1981.627193
std,4.298479,6.097097,2.973731,37.416702,80.047612,168.745114
min,16.0,64.0,77.0,1.7,1.0,1546.0
25%,19.0,77.0,86.0,23.0,22.0,1865.0
50%,21.0,80.0,88.0,40.5,46.0,2000.0
75%,23.0,84.0,89.0,64.5,110.0,2111.25
max,36.0,92.0,95.0,194.0,450.0,2257.0


# Analysis

* Display and drop all the duplicated datas

In [448]:
fifa.duplicated().sum()
fifa.drop_duplicates(inplace=True)

* Player who is having the highest wage?

In [451]:
fifa[fifa['Wage (K$)']==max(fifa['Wage (K$)'])]

# K. Benzema is the player with highest wage


Unnamed: 0,Name,Age,OVA,POT,Team,Value (M$),Wage (K$),Total
1,K. Benzema,33,91,91,Real Madrid CF,84.0,450,2145


* Find the youngest player(s)

In [456]:
# Youngest player
fifa[fifa['Age']==min(fifa['Age'])]

Unnamed: 0,Name,Age,OVA,POT,Team,Value (M$),Wage (K$),Total
7,Gavi,16,77,89,FC Barcelona,24.0,14,1970
14,Y. Moukoko,16,69,89,Borussia Dortmund,3.8,5,1725
40,N. Mbamba,16,64,88,Club Brugge KV,1.7,50,1546


* Find the oldest player(s)

In [464]:
# Oldest player
fifa[fifa['Age']==max(fifa['Age'])][['Name',"Age"]]

Unnamed: 0,Name,Age
49,Cristiano Ronaldo,36


* List out the Top 10 players according to their POT

In [459]:
fifa.sort_values(by="POT", ascending=False)['Name'].head(10)

41           K. Mbappé
39          E. Haaland
51          F. de Jong
20            L. Messi
19               Pedri
44      R. Lewandowski
16            P. Foden
1079        Rúben Dias
958           M. Salah
5         Vinícius Jr.
Name: Name, dtype: object

* Create a new DataFrame with the values of columns - Name & POT

In [None]:
new_df=fifa[['Name','POT']]
new_df