# Import data

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
club_df = pd.read_csv('club.csv')

club_df.head()

Unnamed: 0.1,Unnamed: 0,Club Name,Competition Name,Squad Size,Average Age Of Players,Market Value Of Club In Millions(£),Average Market Value Of Players In Millions(£),Market Value Of Top 18 Players In Millions(£)
0,0,Manchester City,Premier League,24,27.1,970.02,40.42,920.7
1,1,Paris Saint-Germain,Ligue 1,36,26.1,891.18,24.76,801.0
2,2,Manchester United,Premier League,29,27.9,820.13,28.28,742.5
3,3,Chelsea FC,Premier League,27,26.9,802.35,29.72,737.1
4,4,Liverpool FC,Premier League,27,27.2,779.85,28.88,715.95


In [3]:
club_df.info

<bound method DataFrame.info of     Unnamed: 0               Club Name Competition Name  Squad Size  \
0            0         Manchester City   Premier League          24   
1            1     Paris Saint-Germain          Ligue 1          36   
2            2       Manchester United   Premier League          29   
3            3              Chelsea FC   Premier League          27   
4            4            Liverpool FC   Premier League          27   
..         ...                     ...              ...         ...   
95          95              Levante UD           LaLiga          27   
96          96                 FC Metz          Ligue 1          29   
97          97  Clube Atlético Mineiro          Série A          29   
98          98        Lokomotiv Moscow     Premier Liga          29   
99          99               Genoa CFC          Serie A          34   

    Average Age Of Players  Market Value Of Club In Millions(£)  \
0                     27.1                      

In [4]:
#check null values

vars_with_na = [
    var for var in club_df.columns
    if club_df[var].isnull().sum() > 0
]

vars_with_na

[]

no null values

In [5]:
#'Unnamed: 0' does not add anything
club_df = club_df.drop(columns=['Unnamed: 0'])

club_df.head()

Unnamed: 0,Club Name,Competition Name,Squad Size,Average Age Of Players,Market Value Of Club In Millions(£),Average Market Value Of Players In Millions(£),Market Value Of Top 18 Players In Millions(£)
0,Manchester City,Premier League,24,27.1,970.02,40.42,920.7
1,Paris Saint-Germain,Ligue 1,36,26.1,891.18,24.76,801.0
2,Manchester United,Premier League,29,27.9,820.13,28.28,742.5
3,Chelsea FC,Premier League,27,26.9,802.35,29.72,737.1
4,Liverpool FC,Premier League,27,27.2,779.85,28.88,715.95


In [6]:
#check unique values - is the amount normal for the data?
for i in club_df.columns:
    print(f'{i}: has {len(club_df[i].unique())} values')

Club Name: has 100 values
Competition Name: has 16 values
Squad Size: has 13 values
Average Age Of Players: has 46 values
Market Value Of Club In Millions(£): has 98 values
Average Market Value Of Players In Millions(£): has 96 values
Market Value Of Top 18 Players In Millions(£): has 99 values


As the data shows the top 100 richest clubs, most clubs here will belong to the top 5 leagues, and some leagues may only feature one club.

# Save dataset

In [7]:
club_df.to_csv('new_club.csv', index=False)

In [8]:
df2 = pd.read_csv('new_club.csv')
df2.head()

Unnamed: 0,Club Name,Competition Name,Squad Size,Average Age Of Players,Market Value Of Club In Millions(£),Average Market Value Of Players In Millions(£),Market Value Of Top 18 Players In Millions(£)
0,Manchester City,Premier League,24,27.1,970.02,40.42,920.7
1,Paris Saint-Germain,Ligue 1,36,26.1,891.18,24.76,801.0
2,Manchester United,Premier League,29,27.9,820.13,28.28,742.5
3,Chelsea FC,Premier League,27,26.9,802.35,29.72,737.1
4,Liverpool FC,Premier League,27,27.2,779.85,28.88,715.95


now ready for sql.

# Tableau prep

When saved as a csv some club and competition names were corrupted leaving athletico madrid to be displayed as: AtlÃ©tico de Madrid.
This can be easily rectified by searching for all none alphanumerics and replacing them with the correct strings.

STEPS:
1. Use club_1.csv to manipulate the dataset 
2. Apply the changes and save as new_club_1.0.csv ready for SQL again.

### 1. Use club_1.csv to manipulate the dataset

In [9]:
club1_df = pd.read_csv('club_1.csv')
club1_df.head()

Unnamed: 0,club_name,competition_name,squad_size,avg_player_age,markt_val_of_club_in_mil_£,avg_markt_val_of_player_in_mil_£,markt_val_top_18_players_in_mil_£
0,1.FSV Mainz 05,Bundesliga,25,25.7,96,4,90
1,AC Milan,Serie A,29,26.8,429,15,390
2,ACF Fiorentina,Serie A,26,27.0,228,9,214
3,AFC Bournemouth,Championship,26,25.7,104,4,100
4,Ajax Amsterdam,Eredivisie,25,25.7,305,12,283


In [10]:
club1_df.shape

(100, 7)

club_name and competition_name are the columns of interest

In [11]:
#CREATE the FUNCTION
#we want to find the index of the corrupted values so we can replace them

def find_non_alphanum(df, column):
    non_alnum_index = []
    for c in column:
        for i in range(len(df[c])):
            if df[c][i] == 'SÃ¼per Lig':#function does not spot this string otherwise
                non_alnum_index.append(i)
            for z in range(len(df[c][i].split(' '))):
                split_df = df[c][i].split(' ')
                if split_df[z].isalnum() == False:
                    non_alnum_index.append(i)
                    continue
    print(f'length: {len(non_alnum_index)}')
    return non_alnum_index

In [12]:
#create a list of the indexes for the club and competition columns
club_idx = find_non_alphanum(club1_df,['club_name'])
competition_idx = find_non_alphanum(club1_df,['competition_name'])

length: 6
length: 0


In [13]:
#print club names that need correcting
for i in club_idx:
    print(club1_df['club_name'][i])

1.FSV Mainz 05
AS Saint-Etienne
Brighton & Hove Albion
Galatasaray A.S.
Paris Saint-Germain
Zenit St. Petersburg


not all of the above are corrupted.

In [14]:
#show competition names that need correcting
#had to print the whole row so I know what country the competition takes place in.
club1_df.iloc[competition_idx]

Unnamed: 0,club_name,competition_name,squad_size,avg_player_age,markt_val_of_club_in_mil_£,avg_markt_val_of_player_in_mil_£,markt_val_top_18_players_in_mil_£


In [15]:
#create a list of the errors and the correct values
club_error = ['AS Saint-Ã‰tienne', 'AtlÃ©tico de Madrid', 'Borussia MÃ¶nchengladbach',
                   'Club AtlÃ©tico River Plate', 'Clube AtlÃ©tico Mineiro', 'Real Betis BalompiÃ©']
club_correct = ['AS Saint-Etienne', 'Atletico de Madrid', 'Borussia Monchengladbach',
                   'Club Atletico River Plate', 'Clube Atletico Mineiro', 'Real Betis Balompie']

comp_error = ['SÃ©rie A', '1.HNL', 'SÃ¼per Lig']
comp_correct = ['Brasileiro Serie A', 'Prva HNL', 'Turkish Super Lig']

def create_dict(key_list, val_list):
    zip_lists = zip(key_list, val_list)
    new_dict = dict(zip_lists)
    
    return new_dict


In [16]:
import warnings
warnings.filterwarnings('ignore')
#import warning here to hide my path file name

#correct the errors
club_dict = create_dict(club_error, club_correct)
comp_dict = create_dict(comp_error, comp_correct)

for i in club_idx:
    if club1_df['club_name'][i] in club_dict:
        club1_df['club_name'][i] = club_dict[club1_df['club_name'][i]]

for i in competition_idx:
    if club1_df['competition_name'][i] in comp_dict:
        club1_df['competition_name'][i] = comp_dict[club1_df['competition_name'][i]]

In [17]:
pd.set_option('display.max_rows', None) #displays all rows to check before exporting
club1_df

Unnamed: 0,club_name,competition_name,squad_size,avg_player_age,markt_val_of_club_in_mil_£,avg_markt_val_of_player_in_mil_£,markt_val_top_18_players_in_mil_£
0,1.FSV Mainz 05,Bundesliga,25,25.7,96,4,90
1,AC Milan,Serie A,29,26.8,429,15,390
2,ACF Fiorentina,Serie A,26,27.0,228,9,214
3,AFC Bournemouth,Championship,26,25.7,104,4,100
4,Ajax Amsterdam,Eredivisie,25,25.7,305,12,283
5,Arsenal FC,Premier League,26,25.4,507,20,454
6,AS Monaco,Ligue 1,29,24.4,331,11,292
7,AS Roma,Serie A,30,25.2,386,13,349
8,AS Saint-Etienne,Ligue 1,27,24.3,90,3,81
9,Aston Villa,Premier League,25,25.6,372,15,354


### 2. Apply the changes and save as new_club_1.0.csv ready for SQL again.

In [18]:
club1_df.to_csv('new_club_1.0.csv', index=False)

In [19]:
new_club_df = pd.read_csv('new_club_1.0.csv')
new_club_df.head()

Unnamed: 0,club_name,competition_name,squad_size,avg_player_age,markt_val_of_club_in_mil_£,avg_markt_val_of_player_in_mil_£,markt_val_top_18_players_in_mil_£
0,1.FSV Mainz 05,Bundesliga,25,25.7,96,4,90
1,AC Milan,Serie A,29,26.8,429,15,390
2,ACF Fiorentina,Serie A,26,27.0,228,9,214
3,AFC Bournemouth,Championship,26,25.7,104,4,100
4,Ajax Amsterdam,Eredivisie,25,25.7,305,12,283


ready for mySQL