In [27]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

### Data Types: all objects in python have a type. You can check the type by using the _type()_ function. Here are a few standard ones

In [28]:
type(1.5)

float

In [29]:
type(3)

int

In [30]:
type('abc')

str

In [31]:
type(True)

bool

### You can convert between types

In [32]:
float(1)

1.0

In [33]:
str(1)

'1'

In [34]:
int('9')

9

In [35]:
int(9.9)

9

### DataFrames also have a type

In [36]:
accidents = pd.read_csv('../data/Traffic_Accidents_2019.csv')

In [37]:
type(accidents)

pandas.core.frame.DataFrame

### And each column has a type

In [38]:
accidents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34692 entries, 0 to 34691
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Accident Number             34692 non-null  int64  
 1   Date and Time               34692 non-null  object 
 2   Number of Motor Vehicles    34692 non-null  int64  
 3   Number of Injuries          34692 non-null  int64  
 4   Number of Fatalities        34692 non-null  int64  
 5   Property Damage             2495 non-null   object 
 6   Hit and Run                 34691 non-null  object 
 7   Reporting Officer           34684 non-null  float64
 8   Collision Type Code         34688 non-null  float64
 9   Collision Type Description  34688 non-null  object 
 10  Weather Code                34641 non-null  float64
 11  Weather Description         34641 non-null  object 
 12  Illumination Code           34665 non-null  float64
 13  Illumination Description    346

In [39]:
accidents.dtypes

Accident Number                 int64
Date and Time                  object
Number of Motor Vehicles        int64
Number of Injuries              int64
Number of Fatalities            int64
Property Damage                object
Hit and Run                    object
Reporting Officer             float64
Collision Type Code           float64
Collision Type Description     object
Weather Code                  float64
Weather Description            object
Illumination Code             float64
Illumination Description       object
Harmful Code                   object
Harmful Code Description       object
Street Address                 object
City                           object
State                          object
ZIP                           float64
RPA                           float64
Precinct                       object
Latitude                      float64
Longitude                     float64
Mapped Location                object
dtype: object

In [40]:
accidents.head()

Unnamed: 0,Accident Number,Date and Time,Number of Motor Vehicles,Number of Injuries,Number of Fatalities,Property Damage,Hit and Run,Reporting Officer,Collision Type Code,Collision Type Description,...,Harmful Code Description,Street Address,City,State,ZIP,RPA,Precinct,Latitude,Longitude,Mapped Location
0,20190038972,01/15/2019 07:40:00 PM,2,0,0,,N,256374.0,4.0,ANGLE,...,MOTOR VEHICLE IN TRANSPORT,BELL RD & CEDAR POINTE PKWY,ANTIOCH,TN,37013.0,8753.0,SOUTH,36.0449,-86.6671,POINT (-86.6671 36.0449)
1,20190045402,01/17/2019 11:09:00 PM,2,0,0,,Y,405424.0,11.0,Front to Rear,...,PARKED MOTOR VEHICLE,3248 PERCY PRIEST DR,NASHVILLE,TN,37214.0,8955.0,HERMIT,36.1531,-86.6291,POINT (-86.6291 36.1531)
2,20190051468,01/20/2019 12:57:00 PM,2,0,0,,N,834798.0,6.0,SIDESWIPE - OPPOSITE DIRECTION,...,PARKED MOTOR VEHICLE,700 THOMPSON LN,NASHVILLE,TN,37204.0,8305.0,MIDTOW,36.1122,-86.7625,POINT (-86.7625 36.1122)
3,20190088097,02/02/2019 12:38:00 AM,2,0,0,,Y,660929.0,4.0,ANGLE,...,MOTOR VEHICLE IN TRANSPORT;PARKED MOTOR VEHICLE,400 RADNO,NASHVILLE,TN,,,,36.0483,-86.4369,POINT (-86.4369 36.0483)
4,20190091289,02/03/2019 01:25:00 PM,2,0,0,,N,212369.0,4.0,ANGLE,...,MOTOR VEHICLE IN TRANSPORT,ELLINGTON AG CENTER PVTDR & EDMONDSON PK,NASHVILLE,TN,37220.0,8615.0,MIDTOW,36.0618,-86.7405,POINT (-86.7405 36.0618)


### One data type you will encounter is a `datetime`

### The `Date and Time` column in the `accidents` dataframe is treated as an `object` but we can convert it to a different type, such as a `datetime` 

In [41]:
# Let's convert the 'Date and Time' column to a datetime and assign it back to itself
accidents['Date and Time'] = pd.to_datetime(accidents['Date and Time'])


# pd.to_datetime will infer the different date and time components of the string.
# If the datetime is in a strange format or you want to be explicit you can use the 'format' argument
# You will have to use datetime symbols: 
# https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

# It will take a second to run...

  accidents['Date and Time'] = pd.to_datetime(accidents['Date and Time'])


In [42]:
# Now the column is a datetime64[ns]
accidents.dtypes

Accident Number                        int64
Date and Time                 datetime64[ns]
Number of Motor Vehicles               int64
Number of Injuries                     int64
Number of Fatalities                   int64
Property Damage                       object
Hit and Run                           object
Reporting Officer                    float64
Collision Type Code                  float64
Collision Type Description            object
Weather Code                         float64
Weather Description                   object
Illumination Code                    float64
Illumination Description              object
Harmful Code                          object
Harmful Code Description              object
Street Address                        object
City                                  object
State                                 object
ZIP                                  float64
RPA                                  float64
Precinct                              object
Latitude  

In [43]:
# The values in the Date and Time column look different now
accidents.head()

Unnamed: 0,Accident Number,Date and Time,Number of Motor Vehicles,Number of Injuries,Number of Fatalities,Property Damage,Hit and Run,Reporting Officer,Collision Type Code,Collision Type Description,...,Harmful Code Description,Street Address,City,State,ZIP,RPA,Precinct,Latitude,Longitude,Mapped Location
0,20190038972,2019-01-15 19:40:00,2,0,0,,N,256374.0,4.0,ANGLE,...,MOTOR VEHICLE IN TRANSPORT,BELL RD & CEDAR POINTE PKWY,ANTIOCH,TN,37013.0,8753.0,SOUTH,36.0449,-86.6671,POINT (-86.6671 36.0449)
1,20190045402,2019-01-17 23:09:00,2,0,0,,Y,405424.0,11.0,Front to Rear,...,PARKED MOTOR VEHICLE,3248 PERCY PRIEST DR,NASHVILLE,TN,37214.0,8955.0,HERMIT,36.1531,-86.6291,POINT (-86.6291 36.1531)
2,20190051468,2019-01-20 12:57:00,2,0,0,,N,834798.0,6.0,SIDESWIPE - OPPOSITE DIRECTION,...,PARKED MOTOR VEHICLE,700 THOMPSON LN,NASHVILLE,TN,37204.0,8305.0,MIDTOW,36.1122,-86.7625,POINT (-86.7625 36.1122)
3,20190088097,2019-02-02 00:38:00,2,0,0,,Y,660929.0,4.0,ANGLE,...,MOTOR VEHICLE IN TRANSPORT;PARKED MOTOR VEHICLE,400 RADNO,NASHVILLE,TN,,,,36.0483,-86.4369,POINT (-86.4369 36.0483)
4,20190091289,2019-02-03 13:25:00,2,0,0,,N,212369.0,4.0,ANGLE,...,MOTOR VEHICLE IN TRANSPORT,ELLINGTON AG CENTER PVTDR & EDMONDSON PK,NASHVILLE,TN,37220.0,8615.0,MIDTOW,36.0618,-86.7405,POINT (-86.7405 36.0618)


In [44]:
# And we can see each value is a timestamp
accidents.loc[0, 'Date and Time']


Timestamp('2019-01-15 19:40:00')

### Once you have a `datetime` object, you can pull out [individual parts](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.html)
- Use `.dt` to specify a datetime attribute/function and then what you want to pull out
- Pull out the month from the 'Date and Time' column and save it to a new column called 'month'

In [45]:
accidents['month'] = accidents['Date and Time'].dt.month
accidents.head()

Unnamed: 0,Accident Number,Date and Time,Number of Motor Vehicles,Number of Injuries,Number of Fatalities,Property Damage,Hit and Run,Reporting Officer,Collision Type Code,Collision Type Description,...,Street Address,City,State,ZIP,RPA,Precinct,Latitude,Longitude,Mapped Location,month
0,20190038972,2019-01-15 19:40:00,2,0,0,,N,256374.0,4.0,ANGLE,...,BELL RD & CEDAR POINTE PKWY,ANTIOCH,TN,37013.0,8753.0,SOUTH,36.0449,-86.6671,POINT (-86.6671 36.0449),1
1,20190045402,2019-01-17 23:09:00,2,0,0,,Y,405424.0,11.0,Front to Rear,...,3248 PERCY PRIEST DR,NASHVILLE,TN,37214.0,8955.0,HERMIT,36.1531,-86.6291,POINT (-86.6291 36.1531),1
2,20190051468,2019-01-20 12:57:00,2,0,0,,N,834798.0,6.0,SIDESWIPE - OPPOSITE DIRECTION,...,700 THOMPSON LN,NASHVILLE,TN,37204.0,8305.0,MIDTOW,36.1122,-86.7625,POINT (-86.7625 36.1122),1
3,20190088097,2019-02-02 00:38:00,2,0,0,,Y,660929.0,4.0,ANGLE,...,400 RADNO,NASHVILLE,TN,,,,36.0483,-86.4369,POINT (-86.4369 36.0483),2
4,20190091289,2019-02-03 13:25:00,2,0,0,,N,212369.0,4.0,ANGLE,...,ELLINGTON AG CENTER PVTDR & EDMONDSON PK,NASHVILLE,TN,37220.0,8615.0,MIDTOW,36.0618,-86.7405,POINT (-86.7405 36.0618),2


#### What is the maximum number of cars involved in a single accident in July?
- subset the `accidents` DataFrame to get the July accidents
- find the maximum `Number of Motor Vehicles` for accidents that happened in July


In [46]:
july_accidents = accidents[accidents['month']==7]
july_accidents.head()

Unnamed: 0,Accident Number,Date and Time,Number of Motor Vehicles,Number of Injuries,Number of Fatalities,Property Damage,Hit and Run,Reporting Officer,Collision Type Code,Collision Type Description,...,Street Address,City,State,ZIP,RPA,Precinct,Latitude,Longitude,Mapped Location,month
16,20190521589,2019-07-12 05:45:00,2,0,0,,N,192379.0,11.0,Front to Rear,...,MM 87 0 I 65,NASHVILLE,TN,37207.0,18020.0,EAST,36.2018,-86.7766,POINT (-86.7766 36.2018),7
17,20190537481,2019-07-18 14:51:00,4,0,0,,N,716886.0,11.0,Front to Rear,...,MM 1 6 I 440,NASHVILLE,TN,37209.0,5230.0,WEST,36.1531,-86.8228,POINT (-86.8228 36.1531),7
18,20190540298,2019-07-19 16:30:00,2,0,0,,Y,299238.0,4.0,ANGLE,...,S 6TH ST & WOODLAND ST,NASHVILLE,TN,37206.0,1117.0,EAST,36.1726,-86.7628,POINT (-86.7628 36.1726),7
19,20190549470,2019-07-23 12:40:00,4,3,0,Y,N,902543.0,4.0,ANGLE,...,MM 90 7 I 65,MADISON,TN,37115.0,20044.0,MADISO,36.2444,-86.7473,POINT (-86.7473 36.2444),7
20,20190552118,2019-07-24 11:15:00,2,2,0,,N,332215.0,4.0,ANGLE,...,LOMBARDY AV & HILLSBORO PKE,NASHVILLE,TN,,6001.0,WEST,,,,7


In [47]:
july_accidents['Number of Motor Vehicles'].max()

np.int64(8)

In [48]:
july_accidents.nlargest(4, 'Number of Motor Vehicles')

Unnamed: 0,Accident Number,Date and Time,Number of Motor Vehicles,Number of Injuries,Number of Fatalities,Property Damage,Hit and Run,Reporting Officer,Collision Type Code,Collision Type Description,...,Street Address,City,State,ZIP,RPA,Precinct,Latitude,Longitude,Mapped Location,month
1818,20190560008,2019-07-27 14:45:00,8,0,0,,Y,299267.0,11.0,Front to Rear,...,MM 1 4 I 440,NASHVILLE,TN,37209.0,52320.0,WEST,36.1496,-86.8227,POINT (-86.8227 36.1496),7
5945,20190534880,2019-07-17 15:50:00,6,2,0,,N,717341.0,4.0,ANGLE,...,MURFREESBORO PKE & EZELL PKE,NASHVILLE,TN,37217.0,8841.0,SOUTH,36.1057,-86.6719,POINT (-86.6719 36.1057),7
394,20190525762,2019-07-14 00:55:00,5,2,0,,N,299274.0,11.0,Front to Rear,...,MM 210 8 I 40,NASHVILLE,TN,37210.0,40080.0,CENTRA,36.1544,-86.7625,POINT (-86.7625 36.1544),7
1403,20190547162,2019-07-22 15:45:00,5,3,0,,N,352172.0,11.0,Front to Rear,...,OLD HICKORY BLVD & HIGHWAY 70S,NASHVILLE,TN,37221.0,4901.0,WEST,36.0742,-86.9208,POINT (-86.9208 36.0742),7


In [49]:
#top of anything in a dataframe nlargest



In [50]:
# How many accidents happened in December?
(accidents['month']==12).sum()

np.int64(2622)

### There are [many different attributes associated with datetimes](https://towardsdatascience.com/working-with-datetime-in-pandas-dataframe-663f7af6c587)

In [51]:
accidents['Date and Time'].dt.time.head()

0    19:40:00
1    23:09:00
2    12:57:00
3    00:38:00
4    13:25:00
Name: Date and Time, dtype: object

In [52]:
accidents['Date and Time'].dt.date.head()

0    2019-01-15
1    2019-01-17
2    2019-01-20
3    2019-02-02
4    2019-02-03
Name: Date and Time, dtype: object

In [53]:
accidents['Date and Time'].dt.weekday.head()

0    1
1    3
2    6
3    5
4    6
Name: Date and Time, dtype: int32

In [54]:
accidents['Date and Time'].dt.is_leap_year.head()

0    False
1    False
2    False
3    False
4    False
Name: Date and Time, dtype: bool

### You can use comparison symbols on `datetime` objects as well

In [55]:
# How many accidents happened before March 3
sum(accidents['Date and Time'] < '03/03/2019')

# Note: You have to input the comparison value as a string,
# but the format can vary and pandas will attempt to infer the format.
# Try putting in different formats and rerunning this cell.

5558

### You can also perform calculations on `datetime` objects

In [56]:
# How long between the 1st and 101th accident?
accidents = accidents.sort_values('Date and Time')
accidents.loc[100, 'Date and Time'] - accidents.loc[0, 'Date and Time']

# It appears as a Timedelta, or a change in time

Timedelta('81 days 16:05:00')

# End of Instruction

### Use PGAdmin to get the player info for all players, if they are in the Hall of fame, also pull that data.   
### Save those results as a .csv and read them into this notebook in the cell below

In [57]:
players = pd.read_csv('/Users/jonathanfarro/Documents/NSS/Python/data/bbplayers.csv')
players.columns

  players = pd.read_csv('/Users/jonathanfarro/Documents/NSS/Python/data/bbplayers.csv')


Index(['playerid', 'birthyear', 'birthmonth', 'birthday', 'birthcountry',
       'birthstate', 'birthcity', 'deathyear', 'deathmonth', 'deathday',
       'deathcountry', 'deathstate', 'deathcity', 'namefirst', 'namelast',
       'namegiven', 'weight', 'height', 'bats', 'throws', 'debut', 'finalgame',
       'retroid', 'bbrefid', 'yearid', 'votedby', 'ballots', 'needed', 'votes',
       'inducted', 'category', 'needed_note'],
      dtype='object')

### Convert the debut and final game info into Datetime

In [None]:
players = players.rename(columns= {'inducted':'hall_of_fame'})



In [None]:
players = players.drop(columns =['playerid', 'birthyear', 'birthmonth', 'birthday', 'birthcountry',
                                 'birthstate', 'birthcity', 'deathyear', 'deathmonth', 'deathday',
                                 'deathcountry', 'deathstate', 'deathcity', 'namefirst', 'namelast',
                                 'namegiven', 'bats', 'throws','retroid', 'bbrefid',  
                                 'votedby', 'ballots', 'needed', 'votes', 'needed_note'])


In [None]:
players['debut'] = pd.to_datetime(players.debut)
players['finalgame'] = pd.to_datetime(players.finalgame)


In [78]:
players



Unnamed: 0,weight,height,debut,finalgame,yearid,hall_of_fame,category,final_vs_debut
0,175.0,73.0,1905-08-30,1928-09-11,1936.0,Y,Player,8413 days
1,215.0,74.0,1914-07-11,1935-05-30,1936.0,Y,Player,7628 days
2,200.0,71.0,1897-07-19,1917-09-17,1936.0,Y,Player,7364 days
3,195.0,73.0,1900-07-17,1916-09-04,1936.0,Y,Player,5893 days
4,200.0,73.0,1907-08-02,1927-09-30,1936.0,Y,Player,7364 days
...,...,...,...,...,...,...,...,...
22003,165.0,75.0,2001-07-25,2010-04-29,,,,3200 days
22004,190.0,72.0,1987-09-16,1989-10-01,,,,746 days
22005,165.0,70.0,1931-04-27,1931-09-27,,,,153 days
22006,165.0,72.0,1917-07-05,1922-04-17,,,,1747 days


### Find the difference in debut and final game for all players

In [62]:
players['final_vs_debut'] = (players.finalgame - players.debut)



In [97]:

)

Unnamed: 0,final_vs_debut,hall_of_fame
0,-4 days,77
1,361 days,7
2,726 days,0
3,1091 days,4
4,1456 days,4
5,1821 days,4
6,2186 days,17
7,2551 days,12
8,2916 days,68
9,3281 days,115


### Next compare that difference among all players, hall of fame players, and players not in the hall of fame