In [2]:
from sqlalchemy import create_engine, text, inspect, func

# ORM imports
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import Column, Integer, String, Float, Boolean
from sqlalchemy.orm import Session
from sqlalchemy.ext.declarative import declarative_base

# API
import requests
import json

# Data Science and Visualization
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
import datetime as dt

CSV Imports

In [3]:
#CSV import for circuits table
filepath = "Datasets/circuits.csv"
df_circuits = pd.read_csv(filepath)
df_circuits.head()

Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_Internatio...
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park


In [4]:
df_circuits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   circuitId   77 non-null     int64  
 1   circuitRef  77 non-null     object 
 2   name        77 non-null     object 
 3   location    77 non-null     object 
 4   country     77 non-null     object 
 5   lat         77 non-null     float64
 6   lng         77 non-null     float64
 7   alt         77 non-null     int64  
 8   url         77 non-null     object 
dtypes: float64(2), int64(2), object(5)
memory usage: 5.5+ KB


In [5]:
#CSV import for drivers table
filepath2 = "Datasets/drivers.csv"
df_drivers = pd.read_csv(filepath2)
df_drivers.head()

Unnamed: 0,driverId,driverRef,number,code,forename,surname,dob,nationality,url
0,1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
1,2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
2,3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
3,4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
4,5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen


In [6]:
df_drivers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859 entries, 0 to 858
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   driverId     859 non-null    int64 
 1   driverRef    859 non-null    object
 2   number       859 non-null    object
 3   code         859 non-null    object
 4   forename     859 non-null    object
 5   surname      859 non-null    object
 6   dob          859 non-null    object
 7   nationality  859 non-null    object
 8   url          859 non-null    object
dtypes: int64(1), object(8)
memory usage: 60.5+ KB


In [7]:
#CSV import for standings table
filepath3 = "Datasets/driver_standings.csv"
df_standings = pd.read_csv(filepath3)
df_standings.head()

Unnamed: 0,driverStandingsId,raceId,driverId,points,position,positionText,wins
0,1,18,1,10.0,1,1,1
1,2,18,2,8.0,2,2,0
2,3,18,3,6.0,3,3,0
3,4,18,4,5.0,4,4,0
4,5,18,5,4.0,5,5,0


In [8]:
df_standings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34595 entries, 0 to 34594
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   driverStandingsId  34595 non-null  int64  
 1   raceId             34595 non-null  int64  
 2   driverId           34595 non-null  int64  
 3   points             34595 non-null  float64
 4   position           34595 non-null  int64  
 5   positionText       34595 non-null  object 
 6   wins               34595 non-null  int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 1.8+ MB


In [9]:
#CSV import for constructor table
filepath4 = "Datasets/constructors.csv"
df_constructor = pd.read_csv(filepath4)
df_constructor.head()

Unnamed: 0,constructorId,constructorRef,name,nationality,url
0,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
1,2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
2,3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Pr...
3,4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formul...
4,5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso


In [10]:
df_constructor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   constructorId   212 non-null    int64 
 1   constructorRef  212 non-null    object
 2   name            212 non-null    object
 3   nationality     212 non-null    object
 4   url             212 non-null    object
dtypes: int64(1), object(4)
memory usage: 8.4+ KB


Data Cleaning

In [11]:
#Remove url Column from df_circuits -  TBD

# df_circuits.drop('url', axis=1, inplace=True)
#df_circuits.head()

In [12]:
# Remove url and code from df_drivers

columns_to_remove = ['url', 'code']

df_drivers.drop(columns_to_remove, axis=1, inplace=True)

df_drivers.head()

Unnamed: 0,driverId,driverRef,number,forename,surname,dob,nationality
0,1,hamilton,44,Lewis,Hamilton,1985-01-07,British
1,2,heidfeld,\N,Nick,Heidfeld,1977-05-10,German
2,3,rosberg,6,Nico,Rosberg,1985-06-27,German
3,4,alonso,14,Fernando,Alonso,1981-07-29,Spanish
4,5,kovalainen,\N,Heikki,Kovalainen,1981-10-19,Finnish


In [13]:
df_drivers['dob']=pd.to_datetime(df_drivers['dob'])

df_drivers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 859 entries, 0 to 858
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   driverId     859 non-null    int64         
 1   driverRef    859 non-null    object        
 2   number       859 non-null    object        
 3   forename     859 non-null    object        
 4   surname      859 non-null    object        
 5   dob          859 non-null    datetime64[ns]
 6   nationality  859 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 47.1+ KB


In [14]:
# Checking value counts for df_standings['positionText']

df_standings['positionText'].value_counts()

positionText
1      1113
2      1113
3      1113
4      1113
5      1113
       ... 
105       2
107       1
D         1
108       1
106       1
Name: count, Length: 109, dtype: int64

In [15]:
df_constructor.drop('url', axis=1, inplace=True)
df_constructor.head()

Unnamed: 0,constructorId,constructorRef,name,nationality
0,1,mclaren,McLaren,British
1,2,bmw_sauber,BMW Sauber,German
2,3,williams,Williams,British
3,4,renault,Renault,French
4,5,toro_rosso,Toro Rosso,Italian


Analysis

In [16]:
# What country has the most F1 circuits

df_circuits_country = df_circuits.groupby('country')

df_circuits_country['country'].count()

country
Argentina         1
Australia         2
Austria           2
Azerbaijan        1
Bahrain           1
Belgium           3
Brazil            2
Canada            3
China             1
France            7
Germany           3
Hungary           1
India             1
Italy             4
Japan             3
Korea             1
Malaysia          1
Mexico            1
Monaco            1
Morocco           1
Netherlands       1
Portugal          4
Qatar             1
Russia            1
Saudi Arabia      1
Singapore         1
South Africa      2
Spain             6
Sweden            1
Switzerland       1
Turkey            1
UAE               1
UK                4
USA              11
United States     1
Name: country, dtype: int64

In [17]:
#  What nationality has the most drivers?

df_drivers_country = df_drivers.groupby('nationality')

df_drivers_country['nationality'].count()

nationality
American             158
American-Italian       1
Argentine             24
Argentine-Italian      1
Australian            18
Austrian              15
Belgian               23
Brazilian             32
British              166
Canadian              14
Chilean                1
Chinese                1
Colombian              3
Czech                  1
Danish                 5
Dutch                 18
East German            3
Finnish                9
French                73
German                50
Hungarian              1
Indian                 2
Indonesian             1
Irish                  5
Italian               99
Japanese              20
Liechtensteiner        1
Malaysian              1
Mexican                6
Monegasque             4
New Zealander         10
Polish                 1
Portuguese             4
Rhodesian              4
Russian                4
South African         23
Spanish               15
Swedish               10
Swiss                 23
Thai         

SQL Lite Related

In [18]:
# Create engine using the `spacex.sqlite` database file
engine = create_engine("sqlite:///f1.sqlite")

In [20]:
# SOURCE = df_circuits
# TARGET = circuits

In [21]:
def SQL_CREATE_STATEMENT_FROM_DATAFRAME(SOURCE, TARGET):

    sql_text = pd.io.sql.get_schema(SOURCE.reset_index(), TARGET)   
    return sql_text

In [24]:
print (pd.io.sql.get_schema(df_circuits.reset_index(), 'circuits'))

CREATE TABLE "circuits" (
"index" INTEGER,
  "circuitId" INTEGER,
  "circuitRef" TEXT,
  "name" TEXT,
  "location" TEXT,
  "country" TEXT,
  "lat" REAL,
  "lng" REAL,
  "alt" INTEGER,
  "url" TEXT
)


In [22]:
print('\n\n'.join(sql_text))

NameError: name 'sql_text' is not defined