In [1]:
from sqlalchemy import create_engine, inspect, MetaData, Table, Column, Integer, String, Float, Date, insert, func, select, delete
import pandas as pd
pd.set_option('display.width',None)
pd.set_option('display.max_columns',None) 

import requests
from datetime import datetime
import csv
from io import StringIO

In [2]:
engine = create_engine('sqlite:///olympicsdata.sqlite')
metadata = MetaData()

In [11]:
url_athletes = 'https://assets.datacamp.com/production/repositories/3815/datasets/b68860c77542f92e108d5c09db510df7a3d28b03/athletes.csv'
url_wg_data = 'https://assets.datacamp.com/production/repositories/3815/datasets/c6a559e920e509e6f0bcc66f5c278aaa38c95be8/winter_games.csv'
url_countries = 'https://assets.datacamp.com/production/repositories/3815/datasets/3f2822876f807bd2a430ff506712e79ba5ae48df/countries.csv'
urls = [url_athletes,url_countries, url_wg_data]

responses = []
csv_files = []
readers = []
for index,url in enumerate(urls):
    try:
        response = requests.get(url)
        response .raise_for_status()
        csv_file = StringIO(response.text)
        reader = csv.reader(csv_file)

        responses.append(response)
        csv_files.append(csv_file)
        readers.append(reader)

        print('Data retrieved successfully for url {}.'.format(url))
    except requests.exceptions.RequestException as e:
        print('Failed to retrieve data: ', e)

Data retrieved successfully for url https://assets.datacamp.com/production/repositories/3815/datasets/b68860c77542f92e108d5c09db510df7a3d28b03/athletes.csv.
Data retrieved successfully for url https://assets.datacamp.com/production/repositories/3815/datasets/3f2822876f807bd2a430ff506712e79ba5ae48df/countries.csv.
Data retrieved successfully for url https://assets.datacamp.com/production/repositories/3815/datasets/c6a559e920e509e6f0bcc66f5c278aaa38c95be8/winter_games.csv.


In [4]:
# See columns structure for each data source
for reader,url in zip(readers,urls):
    header = next(reader)
    data = list(reader)
    df = pd.DataFrame(data,columns=header)
    print(f"Printing data for {url}")
    print(df.head(2))
    print('')
    print('')

Printing data for https://assets.datacamp.com/production/repositories/3815/datasets/b68860c77542f92e108d5c09db510df7a3d28b03/athletes.csv
   id                 name gender age height weight
0  51    Nstor Abad Sanjun      M  23    167     64
1  55  Antonio Abadia Beci      M  26    170     65


Printing data for https://assets.datacamp.com/production/repositories/3815/datasets/3f2822876f807bd2a430ff506712e79ba5ae48df/countries.csv
  id            country                               region
0  1  AFG - Afghanistan        ASIA (EX. NEAR EAST)         
1  2      ALB - Albania  EASTERN EUROPE                     


Printing data for https://assets.datacamp.com/production/repositories/3815/datasets/c6a559e920e509e6f0bcc66f5c278aaa38c95be8/winter_games.csv
           sport                          event        year athlete_id  \
0  Alpine Skiing   Alpine Skiing Women's Slalom  2014-01-01        126   
1  Alpine Skiing  Alpine Skiing Women's Super G  2014-01-01        463   

  country_id me

In [5]:
metadata = MetaData()

wintergames = Table('wintergames',
                metadata,
                Column('sport',String()),
                Column('event',String()),
                Column('year',Integer()),
                Column('athlete_id',Integer()),                
                Column('country_id',Integer()),
                Column('medal',String(),default='gold')
)

countries = Table('countries',
                metadata,
                Column('id',Integer(), primary_key=True),
                Column('country',String()),
                Column('region',String())
)

athletes = Table('athletes',
                metadata,
                Column('id',Integer(), primary_key=True),
                Column('name',String()),
                Column('gender',String()),
                Column('age',Integer()),
                Column('height',Integer()),
                Column('weigth',Integer())
)

metadata.create_all(engine)


In [6]:
#Cleaning of the tables
current_tables = inspect(engine).get_table_names()
print("Current tables in database: ", current_tables)

conn = engine.connect()
for table in current_tables:
    tableObject = Table(table,metadata,autoload=engine)
    result = engine.execute(delete(tableObject))
    print(f"Number of deleted rows for table {table}: {result.rowcount}")

Current tables in database:  ['athletes', 'countries', 'wintergames']
Number of deleted rows for table athletes: 4215
Number of deleted rows for table countries: 202
Number of deleted rows for table wintergames: 2177


  tableObject = Table(table,metadata,autoload=engine)


In [12]:
# Populating tables 
values_insrt_athletes = []
values_insrt_countries = []
values_insrt_wintergames = []

#Populating athletes
next(readers[0]) 
next(readers[0])
for row in readers[0]:
    val_dict = {'id':row[0],
                'name':row[1],
                'gender': row[2],
                'age':row[3],
                'height':row[4],
                'weigth':row[5]}
    values_insrt_athletes.append(val_dict)

# Populating countries table 
next(readers[1])
next(readers[1])
for row in readers[1]:
    val_dict = {'id':row[0],
                'country':row[1],
                'region': row[2].strip()}
    values_insrt_countries.append(val_dict)

# Populating wintergames table 
# Skip the first two rows
next(readers[2])  
next(readers[2])
for row in readers[2]:
    val_dict = {'sport':row[0],
                'event':row[1],
                'year': datetime.strptime(row[2],'%Y-%m-%d').year,
                'athlete_id':row[3],
                'country_id':row[4],
                'medal':row[5]}
    values_insrt_wintergames.append(val_dict)

insrt_stmts = [values_insrt_athletes, values_insrt_countries, values_insrt_wintergames]

for tablename,insrt in zip(current_tables,insrt_stmts):
    table = Table(tablename,metadata,autoload=engine)
    result = conn.execute(insert(table,insrt))
    print('Number of rows inserted for table {}: {}'.format(table,result.rowcount))

Number of rows inserted for table athletes: 4215
Number of rows inserted for table countries: 202
Number of rows inserted for table wintergames: 2177


In [13]:
conn = engine.connect()

for tablename in current_tables:
    table = Table(tablename,metadata,autoload=engine)
    stmt = select([func.count()]).select_from(table)
    countrows = conn.execute(stmt).scalar()
    print(f'Number of rows in table {table}: {countrows}')
    print('#########################################')

    resultSet = conn.execute(select([table])).fetchall()
    print(resultSet[0].keys())
    for row in resultSet[:5]:
        print(row)


Number of rows in table athletes: 4215
#########################################
RMKeyView(['id', 'name', 'gender', 'age', 'height', 'weigth'])
(55, 'Antonio Abadia Beci', 'M', 26, 170, 65)
(110, 'Abubakar Abbas Abbas', 'M', 20, 175, 66)
(126, 'Forough Abbasi', 'F', 20, 164, 58)
(251, 'Bashir Abdi', 'M', 27, 176, 56)
(273, 'Maizurah Abdul Rahim', 'F', 17, 147, 50)
Number of rows in table countries: 202
#########################################
RMKeyView(['id', 'country', 'region'])
(2, 'ALB - Albania', 'EASTERN EUROPE')
(3, 'ALG - Algeria', 'NORTHERN AFRICA')
(4, 'ASA - American Samoa', 'OCEANIA')
(5, 'AND - Andorra', 'WESTERN EUROPE')
(6, 'ANG - Angola', 'SUB-SAHARAN AFRICA')
Number of rows in table wintergames: 2177
#########################################
RMKeyView(['sport', 'event', 'year', 'athlete_id', 'country_id', 'medal'])
('Alpine Skiing', "Alpine Skiing Women's Super G", 2014, 463, 102, '')
('Alpine Skiing', "Alpine Skiing Women's Giant Slalom", 2014, 463, 102, '')
('Alpine