# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns


## Accessment of Data

In [2]:
# Making the copy of the dataset 

df_fifa = pd.read_csv('Fifa_world_cup.csv')
fifa = df_fifa.copy()

### Data Summery 

- The dataset represents historical information on FIFA World Cup matches. It includes details about home and away teams, match scores, penalties, goals, and other match-specific data like the stadium, city, attendance, and referee information. There are 865 matches in the dataset, and certain columns contain incomplete data, especially for goals scored by the first and second teams, and referee cities. Some columns, such as attendance, contain non-numeric values, and date formatting is inconsistent.

### Info About Each Column

- __Home_Team:__ The name of the home team (object type).
- __Score:__ The match score (object type).
- __Away_Team:__ The name of the away team (object type).
- __Penalties:__ Indicates whether penalties were involved (object type).
- __First_Team_Goals:__ Descriptions of goals scored by the first team (object type).
- __Second_Team_Goals:__ Descriptions of goals scored by the second team (object type).
- __Date:__ Date of the match (object type).
- __Time:__ Time of the match (object type).
- __Stadium:__ Stadium where the match was played (object type).
- __City:__ City where the match was played (object type).
- __Attendance:__ The number of spectators (object type).
- __Referee:__ The referee of the match (object type).
- __Referee_City:__ The city where the referee is from (object type).

In [3]:
fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Home_Team          929 non-null    object
 1   Score              929 non-null    object
 2   Away_Team          929 non-null    object
 3   Penalties          929 non-null    object
 4   First_Team_Goals   722 non-null    object
 5   Second_Team_Goals  602 non-null    object
 6   Date               929 non-null    object
 7   Time               928 non-null    object
 8   Stadium            929 non-null    object
 9   City               907 non-null    object
 10  Attendance         928 non-null    object
 11  Referee            928 non-null    object
 12  Referee_City       886 non-null    object
dtypes: object(13)
memory usage: 94.5+ KB


In [4]:
fifa.describe()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
count,929,929,929,929,722,602,929,928,929,907,928,928,886
unique,81,69,87,14,721,602,365,125,205,169,720,410,87
top,Brazil,1–0,Mexico,NO,Houseman 20',Mbappé 80' (pen.) 81' 118' (pen.),27 May 1934,16:00,Estadio Azteca,Mexico City,"Attendance: 45,000",Ravshan Irmatov,Italy
freq,82,100,41,894,2,1,8,80,19,23,13,11,51


In [5]:
fifa[fifa['Time'].isnull()]

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
37,Sweden,w/o,Austria,NO,,,5 June 1938,,Stade Gerland,Lyon,,,


In [6]:
fifa['First_Team_Goals'].isnull().sum()

np.int64(207)

In [7]:
fifa['Second_Team_Goals'].isnull().sum()

np.int64(327)

In [8]:
fifa.head()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
0,France,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',13 July 1930,15:00 UYT (UTC−03:30),Estadio Pocitos,Montevideo,"Attendance: 4,444",Uruguay,
1,Argentina,1–0,France,NO,Monti 81',,15 July 1930,16:00 UYT (UTC−03:30),Estadio Parque Central,Montevideo,"Attendance: 23,409",Almeida Rêgo,Brazil
2,Chile,3–0,Mexico,NO,"Vidal 3', 65'; M. Rosas 52' (o.g.); o.g.",,16 July 1930,14:45 UYT (UTC−03:30),Estadio Parque Central,Montevideo,"Attendance: 9,249",Henri Christophe,Belgium
3,Chile,1–0,France,NO,Subiabre 67',,19 July 1930,12:50 UYT (UTC−03:30),Estadio Centenario,Montevideo,"Attendance: 2,000",Uruguay,
4,Argentina,6–3,Mexico,NO,"Stábile 8', 17', 80'; Zumelzú 12', 55'; Vara...","M. Rosas 42' (pen.), 65'; pen. 75'; Gayón",19 July 1930,15:00 UYT (UTC−03:30),Estadio Centenario,Montevideo,"Attendance: 42,100",Ulises Saucedo,Bolivia


In [9]:
fifa.tail()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
924,England,1–2,France,NO,Kane 54' (pen.),Tchouaméni 17'; Giroud 78',10 December 2022 (2022-12-10),22:00,Al Bayt Stadium,Al Khor,"Attendance: 68,895",Wilton Sampaio,Brazil
925,Argentina,3–0,Croatia,NO,Messi 34' (pen.); Álvarez 39' 69',,13 December 2022 (2022-12-13),22:00,Lusail Stadium,Lusail,"Attendance: 88,966",Daniele Orsato,Italy
926,France,2–0,Morocco,NO,T. Hernandez 5'; Kolo Muani 79',,14 December 2022 (2022-12-14),22:00,Al Bayt Stadium,Al Khor,"Attendance: 68,294",César Arturo Ramos,Mexico
927,Croatia,2–1,Morocco,NO,Gvardiol 7'; Oršić 42',Dari 9',17 December 2022 (2022-12-17),18:00,Khalifa International Stadium,Al Rayyan,"Attendance: 44,137",Abdulrahman Al-Jassim,Qatar
928,Argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),18 December 2022 (2022-12-18),18:00,Lusail Stadium,Lusail,"Attendance: 88,966",Szymon Marciniak,Poland


In [10]:
fifa.sample(5)

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
883,Denmark,0–0,Tunisia,NO,,,22 November 2022 (2022-11-22),16:00,Education City Stadium,Al Rayyan,"Attendance: 42,925",César Arturo Ramos,Mexico
134,Sweden,3–1,West Germany,NO,Skoglund 32'; Gren 81'; Hamrin 88',Schäfer 24',24 June 1958,19:00 (CET),Ullevi,Gothenburg,"Attendance: 49,471",István Zsolt,Hungary
658,Argentina,2–1 (a.e.t.),Mexico,NO,Crespo 10'; Rodríguez 98',Márquez 6',24 June 2006,21:00,Zentralstadion,Leipzig,"Attendance: 43,000",Massimo Busacca,Switzerland
924,England,1–2,France,NO,Kane 54' (pen.),Tchouaméni 17'; Giroud 78',10 December 2022 (2022-12-10),22:00,Al Bayt Stadium,Al Khor,"Attendance: 68,895",Wilton Sampaio,Brazil
564,United States,3–2,Portugal,NO,O'Brien 4'; J. Costa 29' (o.g.); o.g. 36'; ...,Beto 39'; Agoos 71' (o.g.); o.g.,5 June 2002,18:00 KST (UTC+9),Suwon World Cup Stadium,Suwon,"Attendance: 37,306",Byron Moreno,Ecuador


### Dirty Data vs Messy Data

#### Dirty Data

- Home_Team
    1. Need to strip the spaces in the name __Consistency__
    2. Need to change all the letter into smaller cases __Consistency__

- Attendance
    1. The values contain non-numeric characters (e.g., "Attendance: 4,444").
        __Validity__

- First_Team_Goals & Second_Team_Goals
    1. The format includes textual representations of goals, such as "L. Laurent 19'" and mixed data types. __Accuracy__

- Date
    1. Inconsistent formats like "8 June 1958" and "15 July 1930". Need to remove one of the date.__Consistency__
    2. Also need to change the type to datetime. __Accuracy__

- Referee_City
    1. Missing data in several rows.
       __Completeness__

- Penalties
    1. There are only two values: "YES" and "NO". It can be simplified to binary (1 or 0). __Consistency__

- Score
    1. The scores are represented in a non-standardized format, e.g., "4–1". __Consistency__
    2. Need to change the type to int __Validity__

- Time
    1. Need to remove extra written text. __Validity__
    2. Need to change the column type datetime __Consistency__

#### Messy Data

- First_Team_Goal
    1. Need to make one more column in which it will show the player name who goal and at what time for Home team

- Second_Team_Goal 
    1. Need to make one more column in which it will show the player name who goal and at what time for away team

- Score 
    1. Need to make the two different row. One for Home teams score and away teams score team. Then one more for the winner team.


### Define, code, test

In [11]:
fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Home_Team          929 non-null    object
 1   Score              929 non-null    object
 2   Away_Team          929 non-null    object
 3   Penalties          929 non-null    object
 4   First_Team_Goals   722 non-null    object
 5   Second_Team_Goals  602 non-null    object
 6   Date               929 non-null    object
 7   Time               928 non-null    object
 8   Stadium            929 non-null    object
 9   City               907 non-null    object
 10  Attendance         928 non-null    object
 11  Referee            928 non-null    object
 12  Referee_City       886 non-null    object
dtypes: object(13)
memory usage: 94.5+ KB


In [12]:
# Home_Team: Strip spaces and convert to lowercase
fifa['Home_Team'] = fifa['Home_Team'].str.strip().str.lower()

In [13]:
fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Home_Team          929 non-null    object
 1   Score              929 non-null    object
 2   Away_Team          929 non-null    object
 3   Penalties          929 non-null    object
 4   First_Team_Goals   722 non-null    object
 5   Second_Team_Goals  602 non-null    object
 6   Date               929 non-null    object
 7   Time               928 non-null    object
 8   Stadium            929 non-null    object
 9   City               907 non-null    object
 10  Attendance         928 non-null    object
 11  Referee            928 non-null    object
 12  Referee_City       886 non-null    object
dtypes: object(13)
memory usage: 94.5+ KB


In [14]:
fifa['First_Team_Goals'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fifa['First_Team_Goals'].fillna(0, inplace=True)


In [15]:
fifa['Second_Team_Goals'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fifa['Second_Team_Goals'].fillna(0, inplace=True)


In [16]:
fifa['City'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fifa['City'].fillna('Unknown', inplace=True)


In [17]:
fifa['Referee_City'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fifa['Referee_City'].fillna('Unknown', inplace=True)


In [18]:
fifa.head()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',13 July 1930,15:00 UYT (UTC−03:30),Estadio Pocitos,Montevideo,"Attendance: 4,444",Uruguay,Unknown
1,argentina,1–0,France,NO,Monti 81',0,15 July 1930,16:00 UYT (UTC−03:30),Estadio Parque Central,Montevideo,"Attendance: 23,409",Almeida Rêgo,Brazil
2,chile,3–0,Mexico,NO,"Vidal 3', 65'; M. Rosas 52' (o.g.); o.g.",0,16 July 1930,14:45 UYT (UTC−03:30),Estadio Parque Central,Montevideo,"Attendance: 9,249",Henri Christophe,Belgium
3,chile,1–0,France,NO,Subiabre 67',0,19 July 1930,12:50 UYT (UTC−03:30),Estadio Centenario,Montevideo,"Attendance: 2,000",Uruguay,Unknown
4,argentina,6–3,Mexico,NO,"Stábile 8', 17', 80'; Zumelzú 12', 55'; Vara...","M. Rosas 42' (pen.), 65'; pen. 75'; Gayón",19 July 1930,15:00 UYT (UTC−03:30),Estadio Centenario,Montevideo,"Attendance: 42,100",Ulises Saucedo,Bolivia


In [19]:
import re
def clean_attendance(value):
    if pd.isnull(value):  
        return None
    numeric_value = re.sub(r'[^0-9]', '', str(value))
    return int(numeric_value) if numeric_value else None

fifa['Attendance'] = fifa['Attendance'].apply(clean_attendance)


In [20]:
fifa.head()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',13 July 1930,15:00 UYT (UTC−03:30),Estadio Pocitos,Montevideo,4444.0,Uruguay,Unknown
1,argentina,1–0,France,NO,Monti 81',0,15 July 1930,16:00 UYT (UTC−03:30),Estadio Parque Central,Montevideo,23409.0,Almeida Rêgo,Brazil
2,chile,3–0,Mexico,NO,"Vidal 3', 65'; M. Rosas 52' (o.g.); o.g.",0,16 July 1930,14:45 UYT (UTC−03:30),Estadio Parque Central,Montevideo,9249.0,Henri Christophe,Belgium
3,chile,1–0,France,NO,Subiabre 67',0,19 July 1930,12:50 UYT (UTC−03:30),Estadio Centenario,Montevideo,2000.0,Uruguay,Unknown
4,argentina,6–3,Mexico,NO,"Stábile 8', 17', 80'; Zumelzú 12', 55'; Vara...","M. Rosas 42' (pen.), 65'; pen. 75'; Gayón",19 July 1930,15:00 UYT (UTC−03:30),Estadio Centenario,Montevideo,42100.0,Ulises Saucedo,Bolivia


In [21]:
fifa.tail()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
924,england,1–2,France,NO,Kane 54' (pen.),Tchouaméni 17'; Giroud 78',10 December 2022 (2022-12-10),22:00,Al Bayt Stadium,Al Khor,68895.0,Wilton Sampaio,Brazil
925,argentina,3–0,Croatia,NO,Messi 34' (pen.); Álvarez 39' 69',0,13 December 2022 (2022-12-13),22:00,Lusail Stadium,Lusail,88966.0,Daniele Orsato,Italy
926,france,2–0,Morocco,NO,T. Hernandez 5'; Kolo Muani 79',0,14 December 2022 (2022-12-14),22:00,Al Bayt Stadium,Al Khor,68294.0,César Arturo Ramos,Mexico
927,croatia,2–1,Morocco,NO,Gvardiol 7'; Oršić 42',Dari 9',17 December 2022 (2022-12-17),18:00,Khalifa International Stadium,Al Rayyan,44137.0,Abdulrahman Al-Jassim,Qatar
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),18 December 2022 (2022-12-18),18:00,Lusail Stadium,Lusail,88966.0,Szymon Marciniak,Poland


In [22]:
fifa['First_Team_Goals'].tail()

924                        Kane 54' (pen.)
925      Messi 34' (pen.); Álvarez 39' 69'
926        T. Hernandez 5'; Kolo Muani 79'
927                 Gvardiol 7'; Oršić 42'
928    Messi 23' (pen.) 108'; Di María 36'
Name: First_Team_Goals, dtype: object

In [23]:
import re

# Function to clean attendance values by removing non-numeric characters
def clean_attendance(value):
    numeric_value = re.sub(r'[^0-9]', '', str(value))
    return int(numeric_value) if numeric_value else None

fifa['Attendance'] = fifa['Attendance'].apply(clean_attendance)

In [24]:
fifa.head()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',13 July 1930,15:00 UYT (UTC−03:30),Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown
1,argentina,1–0,France,NO,Monti 81',0,15 July 1930,16:00 UYT (UTC−03:30),Estadio Parque Central,Montevideo,234090.0,Almeida Rêgo,Brazil
2,chile,3–0,Mexico,NO,"Vidal 3', 65'; M. Rosas 52' (o.g.); o.g.",0,16 July 1930,14:45 UYT (UTC−03:30),Estadio Parque Central,Montevideo,92490.0,Henri Christophe,Belgium
3,chile,1–0,France,NO,Subiabre 67',0,19 July 1930,12:50 UYT (UTC−03:30),Estadio Centenario,Montevideo,20000.0,Uruguay,Unknown
4,argentina,6–3,Mexico,NO,"Stábile 8', 17', 80'; Zumelzú 12', 55'; Vara...","M. Rosas 42' (pen.), 65'; pen. 75'; Gayón",19 July 1930,15:00 UYT (UTC−03:30),Estadio Centenario,Montevideo,421000.0,Ulises Saucedo,Bolivia


In [25]:
fifa['Date'] = fifa['Date'].str.strip().str.split("(" ).str.get(0)

In [26]:
fifa['Date'] = pd.DatetimeIndex(fifa['Date'])

In [27]:
fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Home_Team          929 non-null    object        
 1   Score              929 non-null    object        
 2   Away_Team          929 non-null    object        
 3   Penalties          929 non-null    object        
 4   First_Team_Goals   929 non-null    object        
 5   Second_Team_Goals  929 non-null    object        
 6   Date               929 non-null    datetime64[ns]
 7   Time               928 non-null    object        
 8   Stadium            929 non-null    object        
 9   City               929 non-null    object        
 10  Attendance         928 non-null    float64       
 11  Referee            928 non-null    object        
 12  Referee_City       929 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(11)
memory usage: 94.5+ 

In [28]:
fifa.tail()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
924,england,1–2,France,NO,Kane 54' (pen.),Tchouaméni 17'; Giroud 78',2022-12-10,22:00,Al Bayt Stadium,Al Khor,688950.0,Wilton Sampaio,Brazil
925,argentina,3–0,Croatia,NO,Messi 34' (pen.); Álvarez 39' 69',0,2022-12-13,22:00,Lusail Stadium,Lusail,889660.0,Daniele Orsato,Italy
926,france,2–0,Morocco,NO,T. Hernandez 5'; Kolo Muani 79',0,2022-12-14,22:00,Al Bayt Stadium,Al Khor,682940.0,César Arturo Ramos,Mexico
927,croatia,2–1,Morocco,NO,Gvardiol 7'; Oršić 42',Dari 9',2022-12-17,18:00,Khalifa International Stadium,Al Rayyan,441370.0,Abdulrahman Al-Jassim,Qatar
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland


In [29]:
fifa.head()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00 UYT (UTC−03:30),Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown
1,argentina,1–0,France,NO,Monti 81',0,1930-07-15,16:00 UYT (UTC−03:30),Estadio Parque Central,Montevideo,234090.0,Almeida Rêgo,Brazil
2,chile,3–0,Mexico,NO,"Vidal 3', 65'; M. Rosas 52' (o.g.); o.g.",0,1930-07-16,14:45 UYT (UTC−03:30),Estadio Parque Central,Montevideo,92490.0,Henri Christophe,Belgium
3,chile,1–0,France,NO,Subiabre 67',0,1930-07-19,12:50 UYT (UTC−03:30),Estadio Centenario,Montevideo,20000.0,Uruguay,Unknown
4,argentina,6–3,Mexico,NO,"Stábile 8', 17', 80'; Zumelzú 12', 55'; Vara...","M. Rosas 42' (pen.), 65'; pen. 75'; Gayón",1930-07-19,15:00 UYT (UTC−03:30),Estadio Centenario,Montevideo,421000.0,Ulises Saucedo,Bolivia


In [30]:
fifa['Time'] = fifa['Time'].astype('string')

In [31]:
fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Home_Team          929 non-null    object        
 1   Score              929 non-null    object        
 2   Away_Team          929 non-null    object        
 3   Penalties          929 non-null    object        
 4   First_Team_Goals   929 non-null    object        
 5   Second_Team_Goals  929 non-null    object        
 6   Date               929 non-null    datetime64[ns]
 7   Time               928 non-null    string        
 8   Stadium            929 non-null    object        
 9   City               929 non-null    object        
 10  Attendance         928 non-null    float64       
 11  Referee            928 non-null    object        
 12  Referee_City       929 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(10), string(1)
memory us

In [32]:
fifa['Time'].astype('string')

0      15:00 UYT (UTC−03:30)
1      16:00 UYT (UTC−03:30)
2      14:45 UYT (UTC−03:30)
3      12:50 UYT (UTC−03:30)
4      15:00 UYT (UTC−03:30)
               ...          
924                    22:00
925                    22:00
926                    22:00
927                    18:00
928                    18:00
Name: Time, Length: 929, dtype: string

In [33]:
fifa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929 entries, 0 to 928
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Home_Team          929 non-null    object        
 1   Score              929 non-null    object        
 2   Away_Team          929 non-null    object        
 3   Penalties          929 non-null    object        
 4   First_Team_Goals   929 non-null    object        
 5   Second_Team_Goals  929 non-null    object        
 6   Date               929 non-null    datetime64[ns]
 7   Time               928 non-null    string        
 8   Stadium            929 non-null    object        
 9   City               929 non-null    object        
 10  Attendance         928 non-null    float64       
 11  Referee            928 non-null    object        
 12  Referee_City       929 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(10), string(1)
memory us

In [34]:
fifa['Time'] = fifa['Time'].str.split(" ").str.get(0)

In [35]:
def clean_time(value):
    if pd.isnull(value):  
        return np.nan
   
    time_part = value.split(' ')[0]
    return time_part 

fifa['Start_Time'] = fifa['Time'].apply(clean_time)

fifa['Start_Time'] = pd.to_datetime(fifa['Start_Time'], format='%H:%M', errors='coerce').dt.time


In [36]:
fifa['Time'] = fifa['Start_Time']

In [37]:
fifa.drop(columns=['Start_Time' ], inplace=True)

In [38]:
fifa.tail()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City
924,england,1–2,France,NO,Kane 54' (pen.),Tchouaméni 17'; Giroud 78',2022-12-10,22:00:00,Al Bayt Stadium,Al Khor,688950.0,Wilton Sampaio,Brazil
925,argentina,3–0,Croatia,NO,Messi 34' (pen.); Álvarez 39' 69',0,2022-12-13,22:00:00,Lusail Stadium,Lusail,889660.0,Daniele Orsato,Italy
926,france,2–0,Morocco,NO,T. Hernandez 5'; Kolo Muani 79',0,2022-12-14,22:00:00,Al Bayt Stadium,Al Khor,682940.0,César Arturo Ramos,Mexico
927,croatia,2–1,Morocco,NO,Gvardiol 7'; Oršić 42',Dari 9',2022-12-17,18:00:00,Khalifa International Stadium,Al Rayyan,441370.0,Abdulrahman Al-Jassim,Qatar
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland


In [39]:
import pandas as pd
import re

# Function to extract player names and goal times, handling multiple goals and separating entries
def extract_goals(goal_data):
    players = []
    goal_times = []
    
    # Skip processing if goal data is missing or not a string
    if pd.isna(goal_data) or not isinstance(goal_data, str):
        return players, goal_times
    
    # Clean goal data, remove '(pen.)' and split by ';'
    cleaned_goals = goal_data.replace('(pen.)', '').strip()
    goal_entries = cleaned_goals.split(';')
    
    # Process each goal entry to separate player names and times
    for entry in goal_entries:
        entry = entry.strip()
        match = re.match(r'(.+?)\s([\d\s\']+)', entry)
        if match:
            player_name = match.group(1).strip()
            times = match.group(2).strip()
            # Separate each goal time and repeat the player name for each goal
            for time in times.split():
                goal_time = time.replace("'", "").strip()
                players.append(player_name)
                goal_times.append(goal_time)
                
    return players, goal_times

# List to store expanded rows
expanded_rows = []

# Iterate over each row in the original dataframe
for index, row in fifa.iterrows():
    # Extract goals for both teams
    first_team_players, first_team_times = extract_goals(row.get('First_Team_Goals', ''))
    second_team_players, second_team_times = extract_goals(row.get('Second_Team_Goals', ''))
    
    # Case 1: No goals for both teams, keep the row as is
    if not first_team_players and not second_team_players:
        expanded_rows.append(row)
        continue
    
    # Combine players and goal times for each team separately
    if first_team_players:
        for player, goal_time in zip(first_team_players, first_team_times):
            new_row = row.copy()
            new_row['Team'] = 'Home'  # Assume first team is home team
            new_row['Player_Name'] = player
            new_row['Goal_Time'] = goal_time
            expanded_rows.append(new_row)
    
    if second_team_players:
        for player, goal_time in zip(second_team_players, second_team_times):
            new_row = row.copy()
            new_row['Team'] = 'Away'  # Assume second team is away team
            new_row['Player_Name'] = player
            new_row['Goal_Time'] = goal_time
            expanded_rows.append(new_row)

# Convert the expanded rows to a new DataFrame
expanded_fifa = pd.DataFrame(expanded_rows)

# Display the expanded dataframe
expanded_fifa.head()


Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City,Team,Player_Name,Goal_Time
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Home,L. Laurent,19
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Home,Langiller,40
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Home,Maschinot,43
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Away,Carreño,70
1,argentina,1–0,France,NO,Monti 81',0,1930-07-15,16:00:00,Estadio Parque Central,Montevideo,234090.0,Almeida Rêgo,Brazil,Home,Monti,81


In [40]:
expanded_fifa.tail()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City,Team,Player_Name,Goal_Time
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Home,Messi,108
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Home,Di María,36
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Away,Mbappé,80
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Away,Mbappé,81
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Away,Mbappé,118


In [41]:
expanded_fifa.head()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City,Team,Player_Name,Goal_Time
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Home,L. Laurent,19
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Home,Langiller,40
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Home,Maschinot,43
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Away,Carreño,70
1,argentina,1–0,France,NO,Monti 81',0,1930-07-15,16:00:00,Estadio Parque Central,Montevideo,234090.0,Almeida Rêgo,Brazil,Home,Monti,81


In [42]:
expanded_fifa.tail(10)

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City,Team,Player_Name,Goal_Time
926,france,2–0,Morocco,NO,T. Hernandez 5'; Kolo Muani 79',0,2022-12-14,22:00:00,Al Bayt Stadium,Al Khor,682940.0,César Arturo Ramos,Mexico,Home,Kolo Muani,79
927,croatia,2–1,Morocco,NO,Gvardiol 7'; Oršić 42',Dari 9',2022-12-17,18:00:00,Khalifa International Stadium,Al Rayyan,441370.0,Abdulrahman Al-Jassim,Qatar,Home,Gvardiol,7
927,croatia,2–1,Morocco,NO,Gvardiol 7'; Oršić 42',Dari 9',2022-12-17,18:00:00,Khalifa International Stadium,Al Rayyan,441370.0,Abdulrahman Al-Jassim,Qatar,Home,Oršić,42
927,croatia,2–1,Morocco,NO,Gvardiol 7'; Oršić 42',Dari 9',2022-12-17,18:00:00,Khalifa International Stadium,Al Rayyan,441370.0,Abdulrahman Al-Jassim,Qatar,Away,Dari,9
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Home,Messi,23
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Home,Messi,108
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Home,Di María,36
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Away,Mbappé,80
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Away,Mbappé,81
928,argentina,3–3 (a.e.t.),France,4–2,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.),2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Away,Mbappé,118


In [43]:
df = expanded_fifa.copy()

In [44]:
df.reset_index(drop=True, inplace=True)


In [45]:
temp_df = df[df['Referee_City'] == 'Unknown']
temp_df

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City,Team,Player_Name,Goal_Time
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Home,L. Laurent,19
1,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Home,Langiller,40
2,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Home,Maschinot,43
3,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,Uruguay,Unknown,Away,Carreño,70
7,chile,1–0,France,NO,Subiabre 67',0,1930-07-19,12:50:00,Estadio Centenario,Montevideo,20000.0,Uruguay,Unknown,Home,Subiabre,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,france,4–0,Northern Ireland,NO,"Wisnieski 44'; Fontaine 55', 63'; Piantoni 68'",0,1958-06-19,19:00:00,Idrottsparken,Norrköping,118000.0,Spain,Unknown,Home,Piantoni,68
432,west germany,1–0,Yugoslavia,NO,Rahn 12',0,1958-06-19,19:00:00,Malmö Stadion,Malmö,200550.0,Switzerland,Unknown,Home,Rahn,12
1474,brazil,2–1,Turkey,NO,Ronaldo 50'; Rivaldo 87' (pen.); pen.,Hasan Şaş 45+2',2002-06-03,18:00:00,Munsu Football Stadium,Ulsan,338420.0,South Korea,Unknown,Home,Ronaldo,50
1475,brazil,2–1,Turkey,NO,Ronaldo 50'; Rivaldo 87' (pen.); pen.,Hasan Şaş 45+2',2002-06-03,18:00:00,Munsu Football Stadium,Ulsan,338420.0,South Korea,Unknown,Home,Rivaldo,87


In [46]:
df.loc[temp_df.index, "Referee_City"] = temp_df['Referee']


In [47]:
df.loc[temp_df.index, "Referee"] = 'unknown'


In [48]:
df.head()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City,Team,Player_Name,Goal_Time
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,Home,L. Laurent,19
1,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,Home,Langiller,40
2,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,Home,Maschinot,43
3,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,Away,Carreño,70
4,argentina,1–0,France,NO,Monti 81',0,1930-07-15,16:00:00,Estadio Parque Central,Montevideo,234090.0,Almeida Rêgo,Brazil,Home,Monti,81


In [49]:
df['Home_Score'] = df['Score'].str.extract(r'(\d+)').astype(float)
df['Away_Score'] = df['Score'].str.extract(r'–(\d+)').astype(float)

# Create a column for match outcome: Home Win, Away Win, or Draw
df['Outcome'] = df.apply(lambda row: 'Home Win' if row['Home_Score'] > row['Away_Score'] 
                                   else ('Away Win' if row['Away_Score'] > row['Home_Score'] 
                                         else 'Draw'), axis=1)

In [50]:
df.head()

Unnamed: 0,Home_Team,Score,Away_Team,Penalties,First_Team_Goals,Second_Team_Goals,Date,Time,Stadium,City,Attendance,Referee,Referee_City,Team,Player_Name,Goal_Time,Home_Score,Away_Score,Outcome
0,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,Home,L. Laurent,19,4.0,1.0,Home Win
1,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,Home,Langiller,40,4.0,1.0,Home Win
2,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,Home,Maschinot,43,4.0,1.0,Home Win
3,france,4–1,Mexico,NO,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70',1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,Away,Carreño,70,4.0,1.0,Home Win
4,argentina,1–0,France,NO,Monti 81',0,1930-07-15,16:00:00,Estadio Parque Central,Montevideo,234090.0,Almeida Rêgo,Brazil,Home,Monti,81,1.0,0.0,Home Win


In [51]:
new_order = ['Home_Team', 'Away_Team' , 'Home_Score' , 'Away_Score', 'Penalties', 'Player_Name', 'Goal_Time', 'Outcome' , 'Date', 'Time', 'Stadium', 'City', 'Attendance', 'Referee', 'Referee_City', 'First_Team_Goals', 'Second_Team_Goals']

fifa_data_reordered = df[new_order]

fifa_data_reordered.head()

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Penalties,Player_Name,Goal_Time,Outcome,Date,Time,Stadium,City,Attendance,Referee,Referee_City,First_Team_Goals,Second_Team_Goals
0,france,Mexico,4.0,1.0,NO,L. Laurent,19,Home Win,1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70'
1,france,Mexico,4.0,1.0,NO,Langiller,40,Home Win,1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70'
2,france,Mexico,4.0,1.0,NO,Maschinot,43,Home Win,1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70'
3,france,Mexico,4.0,1.0,NO,Carreño,70,Home Win,1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70'
4,argentina,France,1.0,0.0,NO,Monti,81,Home Win,1930-07-15,16:00:00,Estadio Parque Central,Montevideo,234090.0,Almeida Rêgo,Brazil,Monti 81',0


In [52]:
fifa_data_reordered.tail()

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Penalties,Player_Name,Goal_Time,Outcome,Date,Time,Stadium,City,Attendance,Referee,Referee_City,First_Team_Goals,Second_Team_Goals
2415,argentina,France,3.0,3.0,4–2,Messi,108,Draw,2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.)
2416,argentina,France,3.0,3.0,4–2,Di María,36,Draw,2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.)
2417,argentina,France,3.0,3.0,4–2,Mbappé,80,Draw,2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.)
2418,argentina,France,3.0,3.0,4–2,Mbappé,81,Draw,2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.)
2419,argentina,France,3.0,3.0,4–2,Mbappé,118,Draw,2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.)


In [53]:
fifa_data_reordered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420 entries, 0 to 2419
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Home_Team          2420 non-null   object        
 1   Away_Team          2420 non-null   object        
 2   Home_Score         2419 non-null   float64       
 3   Away_Score         2419 non-null   float64       
 4   Penalties          2420 non-null   object        
 5   Player_Name        2342 non-null   object        
 6   Goal_Time          2342 non-null   object        
 7   Outcome            2420 non-null   object        
 8   Date               2420 non-null   datetime64[ns]
 9   Time               2294 non-null   object        
 10  Stadium            2420 non-null   object        
 11  City               2420 non-null   object        
 12  Attendance         2419 non-null   float64       
 13  Referee            2420 non-null   object        
 14  Referee_

In [54]:
fifa_data_reordered[fifa_data_reordered['Away_Score'].isnull()]

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Penalties,Player_Name,Goal_Time,Outcome,Date,Time,Stadium,City,Attendance,Referee,Referee_City,First_Team_Goals,Second_Team_Goals
114,sweden,Austria,,,NO,,,Draw,1938-06-05,NaT,Stade Gerland,Lyon,,unknown,,0,0


In [55]:
fifa_data_reordered.drop(index=114, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fifa_data_reordered.drop(index=114, inplace=True)


In [56]:
fifa_data_reordered[(fifa_data_reordered['Home_Team'] == 'sweden') & (fifa_data_reordered['Away_Team'] == 'Austria')]

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Penalties,Player_Name,Goal_Time,Outcome,Date,Time,Stadium,City,Attendance,Referee,Referee_City,First_Team_Goals,Second_Team_Goals


In [57]:
temp_df = fifa_data_reordered[fifa_data_reordered['Player_Name'].isnull()]

In [58]:
fifa_data_reordered.loc[temp_df.index , 'Player_Name'] = 'No_Goal_Scorer'

In [59]:
temp_df = fifa_data_reordered[fifa_data_reordered['Goal_Time'].isnull()]

In [60]:
fifa_data_reordered.loc[temp_df.index , 'Goal_Time'] = 'No_Goal_Scorer'

In [61]:
temp_df

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Penalties,Player_Name,Goal_Time,Outcome,Date,Time,Stadium,City,Attendance,Referee,Referee_City,First_Team_Goals,Second_Team_Goals
404,sweden,Wales,0.0,0.0,NO,No_Goal_Scorer,,Draw,1958-06-15,14:00:00,Råsunda Stadium,Unknown,302870.0,unknown,Belgium,0,0
417,brazil,England,0.0,0.0,NO,No_Goal_Scorer,,Draw,1958-06-11,19:00:00,Ullevi,Gothenburg,408950.0,unknown,West Germany,0,0
478,west germany,Italy,0.0,0.0,NO,No_Goal_Scorer,,Draw,1962-05-31,15:00:00,Estadio Nacional,Santiago,654400.0,Robert Holley Davidson,Scotland,0,0
491,brazil,Czechoslovakia,0.0,0.0,NO,No_Goal_Scorer,,Draw,1962-06-02,15:00:00,Estadio Sausalito,Viña del Mar,149030.0,Pierre Schwinte,France,0,0
511,hungary,Argentina,0.0,0.0,NO,No_Goal_Scorer,,Draw,1962-06-06,15:00:00,Estadio El Teniente,Rancagua,79450.0,Arturo Yamasaki Maldonado,Peru,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2286,denmark,Tunisia,0.0,0.0,NO,No_Goal_Scorer,,Draw,2022-11-22,16:00:00,Education City Stadium,Al Rayyan,429250.0,César Arturo Ramos,Mexico,0,0
2320,morocco,Croatia,0.0,0.0,NO,No_Goal_Scorer,,Draw,2022-11-23,13:00:00,Al Bayt Stadium,Al Khor,594070.0,Fernando Rapallini,Argentina,0,0
2329,croatia,Belgium,0.0,0.0,NO,No_Goal_Scorer,,Draw,2022-12-01,18:00:00,Ahmad bin Ali Stadium,Al Rayyan,439840.0,Anthony Taylor,England,0,0
2349,uruguay,South Korea,0.0,0.0,NO,No_Goal_Scorer,,Draw,2022-11-24,16:00:00,Education City Stadium,Al Rayyan,416630.0,Clément Turpin,France,0,0


In [62]:
temp_df = fifa_data_reordered[fifa_data_reordered['Time'].isnull()]

In [63]:
fifa_data_reordered.loc[temp_df.index , 'Time'] = 'No_Time_Available'

In [76]:
temp_df = fifa_data_reordered[fifa_data_reordered['Player_Name'] == 'pen.']

In [77]:
new_df = pd.read_csv('pen_mistake.csv')

In [86]:
fifa_data_reordered.loc[temp_df.index, 'Player_Name'] = new_df['Player_Name'].values


In [83]:
new_df_aligned = new_df.set_index(temp_df.index)

In [84]:
print("temp_df indices:", temp_df.index)
print("new_df indices:", new_df.index)

temp_df indices: Index([  12,   54,  130,  301,  329,  333,  340,  355,  357,  376,  444,  459,
        485,  504,  545,  594,  604,  658,  729,  730,  771,  786,  798,  841,
        845,  903,  921,  941, 1001, 1044, 1079, 1097, 1108, 1149, 1185, 1188,
       1202, 1208, 1215, 1216, 1235, 1254, 1325, 1329, 1343, 1354, 1419, 1421,
       1451, 1482],
      dtype='int64')
new_df indices: RangeIndex(start=0, stop=50, step=1)


In [None]:
# Overwrite Player_Name in fifa_data_reordered at temp_df.index with values from new_df
fifa_data_reordered.loc[temp_df.index, 'Player_Name'] = new_df['Player_Name'].values

In [97]:
fifa_data_reordered[fifa_data_reordered['Player_Name'] == 'pen.']

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Penalties,Player_Name,Goal_Time,Outcome,Date,Time,Stadium,City,Attendance,Referee,Referee_City,First_Team_Goals,Second_Team_Goals
54,spain,Brazil,3.0,1.0,NO,pen.,29,Home Win,1934-05-27,16:00:00,Stadio Luigi Ferraris,Genoa,210000.0,Alfred Birlem,Germany,"Iraragorri 18' (pen.), 25'; pen. 29'; Lángara",Leônidas 55'


In [95]:
fifa_data_reordered.reset_index()

Unnamed: 0,index,Home_Team,Away_Team,Home_Score,Away_Score,Penalties,Player_Name,Goal_Time,Outcome,Date,Time,Stadium,City,Attendance,Referee,Referee_City,First_Team_Goals,Second_Team_Goals
0,0,france,Mexico,4.0,1.0,NO,L. Laurent,19,Home Win,1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70'
1,1,france,Mexico,4.0,1.0,NO,Langiller,40,Home Win,1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70'
2,2,france,Mexico,4.0,1.0,NO,Maschinot,43,Home Win,1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70'
3,3,france,Mexico,4.0,1.0,NO,Carreño,70,Home Win,1930-07-13,15:00:00,Estadio Pocitos,Montevideo,44440.0,unknown,Uruguay,L. Laurent 19'; Langiller 40'; Maschinot 43...,Carreño 70'
4,4,argentina,France,1.0,0.0,NO,Monti,81,Home Win,1930-07-15,16:00:00,Estadio Parque Central,Montevideo,234090.0,Almeida Rêgo,Brazil,Monti 81',0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2414,2415,argentina,France,3.0,3.0,4–2,Messi,108,Draw,2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.)
2415,2416,argentina,France,3.0,3.0,4–2,Di María,36,Draw,2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.)
2416,2417,argentina,France,3.0,3.0,4–2,Mbappé,80,Draw,2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.)
2417,2418,argentina,France,3.0,3.0,4–2,Mbappé,81,Draw,2022-12-18,18:00:00,Lusail Stadium,Lusail,889660.0,Szymon Marciniak,Poland,Messi 23' (pen.) 108'; Di María 36',Mbappé 80' (pen.) 81' 118' (pen.)


In [99]:
fifa_data_reordered.to_csv('Fifa_world_cup2.csv', index=False)