In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

In [None]:
# Use the inline matplotlib backend so figures are embedded in the notebook output.
%matplotlib inline

In [None]:
# Load the header-less circuits table and label its columns by position.
circuit_columns = [
    "circuitId",
    "circuitRef",
    "circuitName",
    "circuitLocation",
    "circuitCountry",
    "circuitLatitude",
    "circuitLongitude",
    "circuitAltitude",
    "circuitUrl",
]
circuits = pd.read_csv("../data/circuits.csv", header=None)
circuits.columns = circuit_columns

# Altitude entries that do not parse as numbers are coerced to NaN.
circuits["circuitAltitude"] = pd.to_numeric(circuits["circuitAltitude"], errors="coerce")

circuits.head()

In [None]:
# Load the header-less constructors table and label its columns by position.
constructor_columns = [
    "constructorId",
    "constructorRef",
    "constructorName",
    "constructorNationality",
    "constructorUrl",
]
constructors = pd.read_csv("../data/constructors.csv", header=None)
constructors.columns = constructor_columns

constructors.head()

In [None]:
# Load the header-less drivers table and label its columns by position.
driver_columns = [
    "driverId",
    "driverRef",
    "driverNumber",
    "driverCode",
    "driverName",
    "driverSurname",
    "driverBirthDate",
    "driverNationality",
    "driverUrl",
]
drivers = pd.read_csv("../data/driver.csv", header=None)
drivers.columns = driver_columns

# Driver numbers that do not parse as numbers are coerced to NaN.
drivers["driverNumber"] = pd.to_numeric(drivers["driverNumber"], errors="coerce")

# Rows whose code is the literal text \N get a null code instead
# (presumably \N is the export's NULL marker -- confirm against the data source).
missing_code = drivers["driverCode"] == "\\N"
drivers.loc[missing_code, "driverCode"] = None

# Birth dates that do not match YYYY-MM-DD become NaT.
drivers["driverBirthDate"] = pd.to_datetime(
    drivers["driverBirthDate"], format="%Y-%m-%d", errors="coerce"
)

# Derive the birth year by formatting the date as text and parsing it back to a number.
birth_year_text = drivers["driverBirthDate"].dt.strftime("%Y")
drivers["driverYearOfBirth"] = pd.to_numeric(birth_year_text, errors="coerce")

drivers.head()

In [None]:
# Load the header-less races table and label its columns by position.
races = pd.read_csv("../data/races.csv", header=None)
races.columns = [
    "raceId",
    "raceYear",
    "raceRound",
    "circuitId",
    "raceName",
    "raceDate",
    "raceTime",
    "raceUrl",
]

# Dates that do not match YYYY-MM-DD become NaT.
races.raceDate = pd.to_datetime(races.raceDate, errors="coerce", format="%Y-%m-%d")

# Seconds are the uppercase "%S" directive. The lowercase "%s" used previously
# is not a valid strptime directive, so the time column could never be parsed.
# Values that do not match HH:MM:SS still become NaT.
races.raceTime = pd.to_datetime(races.raceTime, errors="coerce", format="%H:%M:%S")

races.head()

In [None]:
# Load the header-less results table and label its columns by position.
results = pd.read_csv("../data/results.csv", header=None)
results.columns = [
    "resultId",
    "raceId",
    "driverId",
    "constructorId",
    "driverNumber",
    "grid",
    "position",
    "positionText",
    "positionOrder",
    "points",
    "laps",
    "time",
    "milliseconds",
    "fastestLap",
    "fastestLapRank",
    "fastestLapTime",
    "fastestLapSpeed",
    "statusId",
]

# Coerce these columns to numbers; entries that do not parse become NaN.
for column in [
    "driverNumber",
    "position",
    "milliseconds",
    "fastestLap",
    "fastestLapRank",
    "fastestLapSpeed",
]:
    results[column] = pd.to_numeric(results[column], errors="coerce")

# Seconds are the uppercase "%S" directive. The lowercase "%s" used previously
# is not a valid strptime directive, so lap times could never be parsed.
# Values that do not match M:SS.fff still become NaT.
results.fastestLapTime = pd.to_datetime(
    results.fastestLapTime, errors="coerce", format="%M:%S.%f"
)

results.head()

In [None]:
# Load the combined dataset; this file is read with its own header row.
data_path = "../data/data.csv"
df = pd.read_csv(data_path)

In [None]:
# Print a concise summary of df: columns, non-null counts and dtypes.
df.info()

In [None]:
# Show descriptive statistics for df's columns.
df.describe()

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Draw a histogram for each numeric column of df.
df.hist()

In [None]:
# Scatter of grid vs. position; the low alpha lets overlapping points
# show up as darker areas.
df.plot.scatter(x="grid", y="position", alpha=0.1)

In [None]:
# Pairwise correlation between df's numeric (and boolean) columns.
# The columns are selected explicitly because DataFrame.corr() stopped dropping
# non-numeric columns by default in pandas 2.0 and raises on them instead;
# selecting first gives the same result on old and new pandas versions.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Display the pairwise correlation matrix.
corr_matrix

In [None]:
# Correlation of every column with "position", strongest positive first.
position_corr = corr_matrix["position"]
position_corr.sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for the selected columns.
attributes = [
    "position",
    "grid",
    "constructor_id",
    "driver_id",
]
scatter_matrix(frame=df[attributes])