In [3]:
import os
import sqlite3

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

## 1) Database connection using sqlite3

In [9]:
# path to the SQLite file, relative to the notebook's working directory
db_path = 'database.sqlite'

# sqlite3.connect() silently creates a new, empty database file when the path
# does not exist, which only shows up later as a confusing "no such table"
# error. Fail early with a clear message instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(
        f'SQLite database not found at {os.path.abspath(db_path)!r}; '
        'place the database file in the working directory before running this notebook.'
    )

# connection object reused by the queries below
conn = sqlite3.connect(db_path)

### 1a) Test the db connection with a simple SQL statement
**NOTE**: the query is built with string formatting so the same template can be reused later with less typing

In [21]:
# reusable SELECT template: first slot = column list, second slot = table name
sql_base_select = 'SELECT {} FROM {};'

# fill in the template -> 'SELECT * FROM league;'
sql = sql_base_select.format('*', 'league')

# load the whole league table into a DataFrame and preview its first rows
leagues = pd.read_sql(sql=sql, con=conn)
leagues.head(n=5)

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A


In [55]:
# how many distinct leagues exist in the db, and which leagues are they?
print(leagues.loc[:, 'name'].nunique())
# unpack into a plain list: it displays more readably than the array unique() returns
[*leagues.loc[:, 'name'].unique()]

11


['Belgium Jupiler League',
 'England Premier League',
 'France Ligue 1',
 'Germany 1. Bundesliga',
 'Italy Serie A',
 'Netherlands Eredivisie',
 'Poland Ekstraklasa',
 'Portugal Liga ZON Sagres',
 'Scotland Premier League',
 'Spain LIGA BBVA',
 'Switzerland Super League']

### 1b) using a **subquery** to select *only* Premier League matches
**NOTE**: this avoids having to look up the league id before running the query. You only need to know a *unique* component of the league name, which we can figure out from the list printed above.

In [52]:
# The subquery looks up the league id by name, so the id does not have to be
# known up front. The LIKE pattern is written as a single-quoted string literal:
# in standard SQL double quotes denote identifiers, and SQLite only treats them
# as strings through a legacy fallback that can be disabled.
sql = '''
    SELECT * FROM match
    WHERE league_id = (
        SELECT id FROM league
        WHERE name LIKE '%England%'
    );
'''

# run the query and preview the matching rows
df = pd.read_sql(sql, conn)
df.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1729,1729,1729,2008/2009,1,2008-08-17 00:00:00,489042,10260,10261,1,...,10.0,1.28,5.5,12.0,1.3,4.75,10.0,1.29,4.5,11.0
1,1730,1729,1729,2008/2009,1,2008-08-16 00:00:00,489043,9825,8659,1,...,12.0,1.25,6.0,13.0,1.22,5.5,13.0,1.22,5.0,13.0
2,1731,1729,1729,2008/2009,1,2008-08-16 00:00:00,489044,8472,8650,0,...,1.73,5.5,3.8,1.65,5.0,3.4,1.7,4.5,3.4,1.73
3,1732,1729,1729,2008/2009,1,2008-08-16 00:00:00,489045,8654,8528,2,...,3.75,1.9,3.5,4.35,1.91,3.25,4.0,1.91,3.25,3.8
4,1733,1729,1729,2008/2009,1,2008-08-17 00:00:00,489046,10252,8456,4,...,3.75,1.9,3.5,4.35,1.91,3.25,4.0,1.91,3.3,3.75


In [58]:
# every table in the database that gets exported to its own csv file
tables = [
    'country',
    'league',
    'match',
    'player',
    'player_attributes',
    'team',
    'team_attributes'
]

# make sure the output folder exists first; to_csv does not create missing directories
raw_data_dir = os.path.join(os.getcwd(), 'raw_data')
os.makedirs(raw_data_dir, exist_ok=True)

for table in tables:
    # table names come from the fixed list above, so interpolating them into the SQL is safe here
    df2 = pd.read_sql(f'SELECT * FROM {table};', conn)
    file_path = os.path.join(raw_data_dir, f'{table}.csv')
    df2.to_csv(file_path, index=False)

print('done')

done


In [76]:
# Split raw_data/match.csv into `chunks` smaller csv files under csv_data/.
folder_name_input = 'raw_data'
folder_name_output = 'csv_data'

file_name_input = 'match.csv'
file_path_input = os.path.join(os.getcwd(), folder_name_input, file_name_input)

df3 = pd.read_csv(file_path_input)

row_count = df3.shape[0]
chunks = 15
# integer division: any remainder rows end up in the last chunk (see below)
chunk_size = row_count // chunks

# create the output folder up front; to_csv fails if the directory is missing
folder_path_output = os.path.join(os.getcwd(), folder_name_output)
os.makedirs(folder_path_output, exist_ok=True)

for i in range(chunks):
    start = i * chunk_size
    # the last chunk is open-ended so it also picks up the remainder rows
    stop = None if i == chunks - 1 else start + chunk_size
    df_chunk = df3.iloc[start:stop]

    # output files are numbered from 1: match_1.csv ... match_15.csv
    file_name_output = f'match_{i + 1}.csv'
    file_path_output = os.path.join(folder_path_output, file_name_output)
    df_chunk.to_csv(file_path_output, index=False)