## **World Cup 2022 Prediction** 

Predicting the winners of the next FIFA World Cup

In [61]:
# Importing the dependencies 
import pandas as pd
import numpy as np
from datetime import datetime

### **----------------------------------  1. Data Collection  -----------------------------------**

In [8]:
# Loading the data from csv to dataframe
df = pd.read_csv('../Data/results.csv')

#### Filtering out relevant historic match data

The highest level of football before Senior football is U21 football. For national teams the AVG squad age is between 24 - 29.6 years. Players under the age of 21 rarely get recruited to the senior team so to accomodated for those that may 21 years old will be our lower bound. To accomodate for older players our upper bound will be 30 years. 

* Because we want a clear view of the performance of the current squad and not the England team of 1966 for example because that isnt the team playing now. A limit of 9 years would help imrpove this bias.  
* Currently doing more research to validate any lower or higher number. I will simply update the function input when I get this infor. 
* Past performance of previous squads does not guarantee future performance 

In [71]:
# Viewing last 2 rows in data set
df.tail(2)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
43043,2021-11-16,Chile,Ecuador,0.0,2.0,FIFA World Cup qualification,Santiago,Chile,False
43044,2021-11-19,Sri Lanka,Seychelles,,,Mahinda Rajapaksa Cup,Columbo,Sri Lanka,False


In [67]:
def yrRange(yr):
    # Difference in years - Example if you want 10 years of data from todays date you would enter 10

    # Using today() function to get todays date so that data can be most up to date based on date
    a = datetime.today().strftime('%Y-%m-%d')
    # print("Todays date")
    # print(a)

    # Subtracting 9 from year value
    ye = datetime.today().year - yr

    # Converting the yr value to string to allow for concatenation
    test = str(ye)

    # Concatting calculated calue to month and day values 
    b = datetime.today().strftime(test + '-%m-%d')
    # print("9 years ago")
    return b

# yrRange(9)

In [78]:
# Conditions to look at dates that are within 9 years from today
data = df[df.date >= yrRange(9)]
data.drop(['neutral'],inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [86]:
#Adding goal difference and establishing who is the winner 
winner = []
for i in range (11):
    if data['home_score'][i] > data['away_score'][i]:
        winner.append(data['home_team'][i])
    elif data['home_score'][i] < data['away_score'][i]:
        winner.append(data['away_team'][i])
    else:
        winner.append('Draw')
    data['winning_team'] = winner

#adding goal difference column
# data['goal_difference'] = np.absolute(data['home_score'] - data['away_score'])

# data.head()

KeyError: 0

In [69]:
# Viewing first 5 rows of new data
data.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
34861,2012-12-06,Uganda,Tanzania,3.0,0.0,CECAFA Cup,Kampala,Uganda,False
34862,2012-12-06,Zanzibar,Kenya,2.0,2.0,CECAFA Cup,Kampala,Uganda,True
34863,2012-12-07,Antigua and Barbuda,Dominican Republic,1.0,2.0,CFU Caribbean Cup,St. John's,Antigua and Barbuda,False
34864,2012-12-07,Guam,Australia,0.0,9.0,EAFF Championship,So Kon Po,Hong Kong,True
34865,2012-12-07,Haiti,Trinidad and Tobago,0.0,0.0,CFU Caribbean Cup,St. John's,Antigua and Barbuda,True


--------------------------   **Data 1**  --------------------------

<font  color=white>

 #### **1. Web scraping**

In [1]:
# read_html preferred as it has the ability to read tables efficiently 
import pandas as pd

url = "https://fbref.com/en/comps/1/FIFA-World-Cup-Stats"
dfs = pd.read_html(url)

# For loop to locate desired table with len(df) > 5 | Renaming table to data 
for df in dfs:
    if len(df) > 5:
        table = df
        break
data = table 

<font  color=white>

#### **2. Data Cleaning**

In [2]:
# Initial view of data 
data.head()

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,xG,xGA,xGD,xGD/90,Top Team Scorer,Goalkeeper,Notes
0,1.0,fr France,7.0,6.0,1.0,0.0,14.0,6.0,8.0,19.0,8.1,4.8,3.3,0.47,"Kylian Mbappé, Antoine Griezmann - 4",Hugo Lloris,
1,,,,,,,,,,,,,,,,,
2,2.0,hr Croatia,7.0,4.0,2.0,1.0,14.0,9.0,5.0,14.0,10.2,7.0,3.3,0.41,"Mario Mandžukić, Ivan Perišić - 3",Danijel Subašić,
3,,,,,,,,,,,,,,,,,
4,3.0,be Belgium,7.0,6.0,0.0,1.0,16.0,6.0,10.0,18.0,11.8,8.8,3.1,0.44,Romelu Lukaku - 4,Thibaut Courtois,


In [43]:
# Initial look at the data shape 
data.shape

(38, 17)

In [44]:
# Check for NULL values
data.isnull().sum()

Rk                  6
Squad               6
MP                  6
W                   6
D                   6
L                   6
GF                  6
GA                  6
GD                  6
Pts                 6
xG                  6
xGA                 6
xGD                 6
xGD/90              6
Top Team Scorer     6
Goalkeeper          6
Notes              38
dtype: int64

In [65]:
# Dropping notes column first and then null rows from data, removing abbrevation for squad in Squad column 
data.drop('Notes', axis=1, inplace=True)
data.dropna(inplace=True)
data[['NULL','Squad']] = data["Squad"].str.split(" ", 1, expand=True)
data.drop('NULL', axis=1, inplace=True)

In [71]:
data.rename(columns={"xGD/90": "xGD per 90"},inplace=True)

In [72]:
# Ensuring column headers comply with SQL naming rules 
data.columns = data.columns.str.replace(' ','_')

In [73]:
data.head()

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,xG,xGA,xGD,xGD_per_90,Top_Team_Scorer,Goalkeeper
0,1,France,7.0,6.0,1.0,0.0,14.0,6.0,8.0,19.0,8.1,4.8,3.3,0.47,"Kylian Mbappé, Antoine Griezmann - 4",Hugo Lloris
2,2,Croatia,7.0,4.0,2.0,1.0,14.0,9.0,5.0,14.0,10.2,7.0,3.3,0.41,"Mario Mandžukić, Ivan Perišić - 3",Danijel Subašić
4,3,Belgium,7.0,6.0,0.0,1.0,16.0,6.0,10.0,18.0,11.8,8.8,3.1,0.44,Romelu Lukaku - 4,Thibaut Courtois
6,4,England,7.0,3.0,1.0,3.0,12.0,8.0,4.0,10.0,10.0,6.8,3.2,0.42,Harry Kane - 6,Jordan Pickford
8,QF,Uruguay,5.0,4.0,0.0,1.0,7.0,3.0,4.0,12.0,5.9,2.1,3.7,0.75,Edinson Cavani - 3,Fernando Muslera


In [49]:
# Check for NULL values after cleaning
data.isnull().sum()

Rk                 0
Squad              0
MP                 0
W                  0
D                  0
L                  0
GF                 0
GA                 0
GD                 0
Pts                0
xG                 0
xGA                0
xGD                0
xGD/90             0
Top Team Scorer    0
Goalkeeper         0
dtype: int64

In [54]:
data.dtypes

Rk                  object
Squad               object
MP                 float64
W                  float64
D                  float64
L                  float64
GF                 float64
GA                 float64
GD                 float64
Pts                float64
xG                 float64
xGA                float64
xGD                float64
xGD/90             float64
Top Team Scorer     object
Goalkeeper          object
dtype: object

In [55]:
data.columns

Index(['Rk', 'Squad', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts', 'xG',
       'xGA', 'xGD', 'xGD/90', 'Top Team Scorer', 'Goalkeeper'],
      dtype='object')

<font  color=white>

#### **3. Data Warehousing**

In [50]:
# saving scraped df to csv 
data.to_csv('WorldCup2018_Ranking.csv', index=False)

In [78]:
# Storing data in SQL Server Database
import pyodbc

conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=DESKTOP-I9HIN85;'
                      'Database=Projects;'
                      'Trusted_Connection=yes;')
cursor = conn.cursor()

# Create Table 
cursor.execute('''
		CREATE TABLE P1WorldCup2018_Ranking (
			Rk varchar(50),
            Squad varchar(50),
            MP float,
            W float,
            D float,
            L float,
            GF float,
            GA float,
            GD float,
            Pts float,
            xG float,
            xGA float,
            xGD float,
            xGD_per_90 float,
            Top_Team_Scorer varchar(50),
            Goalkeeper varchar(50)
			)
               ''')

# Insert DataFrame to Table
for row in data.itertuples():
    cursor.execute('''
                INSERT INTO P1WorldCup2018_Ranking (Rk, Squad, MP, W, D, L, GF, GA, GD, Pts, xG,xGA, xGD, xGD_per_90, Top_Team_Scorer, Goalkeeper)
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
                ''',
            row.Rk,
            row.Squad,
            row.MP,
            row.W,
            row.D,
            row.L,
            row.GF,
            row.GA,
            row.GD,
            row.Pts,
            row.xG,
            row.xGA,
            row.xGD,
            row.xGD_per_90,
            row.Top_Team_Scorer,
            row.Goalkeeper
			    )
conn.commit()


--------------------------  **Data 2**  --------------------------

<font  color=white>

 #### **1. Web scraping**

In [84]:
url = "https://fbref.com/en/comps/1/stats/FIFA-World-Cup-Stats"
dfs = pd.read_html(url)

# For loop to locate desired table with len(df) > 5 | Renaming table to data 
for df in dfs:
    if len(df) < 35:
        table = df
        break

table

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Playing Time,Playing Time,Playing Time,Playing Time,Performance,Performance,...,Per 90 Minutes,Expected,Expected,Expected,Expected,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes
Unnamed: 0_level_1,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,G+A-PK,xG,npxG,xA,npxG+xA,xG,xA,xG+xA,npxG,npxG+xA
0,ar Argentina,20,29.6,65.3,4,44,360,4.0,6,6,...,3.0,4.9,4.1,3.0,7.1,1.23,0.75,1.97,1.04,1.78
1,au Australia,15,27.7,51.7,3,33,270,3.0,2,0,...,0.0,3.5,2.0,1.6,3.6,1.16,0.55,1.71,0.66,1.21
2,be Belgium,21,27.7,52.9,7,77,630,7.0,15,12,...,3.71,11.8,11.1,9.1,20.2,1.69,1.3,2.99,1.58,2.88
3,br Brazil,18,28.2,58.6,5,55,450,5.0,8,7,...,3.0,11.8,11.8,9.0,20.8,2.36,1.8,4.16,2.36,4.16
4,co Colombia,20,26.8,50.5,4,44,390,4.3,6,5,...,2.54,2.7,2.7,2.3,5.1,0.63,0.54,1.17,0.63,1.17
5,cr Costa Rica,20,29.9,39.3,3,33,270,3.0,1,1,...,0.67,2.1,1.3,1.1,2.5,0.7,0.38,1.07,0.44,0.82
6,hr Croatia,21,28.8,56.6,7,77,720,8.0,13,8,...,2.5,10.2,8.7,5.8,14.5,1.28,0.72,2.0,1.09,1.81
7,dk Denmark,20,26.9,43.8,4,44,390,4.3,3,3,...,1.38,2.8,2.8,2.3,5.1,0.65,0.52,1.17,0.65,1.17
8,eg Egypt,16,28.9,43.3,3,33,270,3.0,2,1,...,0.67,2.6,1.8,1.5,3.3,0.87,0.49,1.36,0.61,1.1
9,eng England,21,25.5,54.4,7,77,690,7.7,12,6,...,1.96,10.0,7.7,5.3,13.1,1.31,0.7,2.0,1.01,1.7
