In [1]:
# Import of important libs
import pandas as pd

In [2]:
# Function for converting temperature from Fahrenheit to Celsius
def temp_to_celsius(temperature):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    The conversion is plain arithmetic, so it works element-wise on anything
    that supports ``-``, ``*`` and ``/`` with numbers (a scalar, or a pandas
    Series such as a dataframe column).

    Parameters
    ----------
    temperature : number or array-like
        Temperature value(s) in °F.

    Returns
    -------
    Same kind of object as ``temperature``, holding the value(s) in °C.
    """
    return (temperature - 32) * 5 / 9


def temp_to_celcius(temperature):
    """Misspelled legacy name kept for existing callers.

    Delegates to :func:`temp_to_celsius`.
    """
    return temp_to_celsius(temperature)

In [3]:
# Path to the CSV file with the taxi data (a relative path, so it is resolved
# against the notebook's current working directory)
path_to_file = '2_taxi_nyc.csv'

In [4]:
# Load the dataset into a dataframe; the file is decoded as windows-1251
taxi = pd.read_csv(path_to_file, encoding='windows-1251')

### Data frame structure:

- `pickup_dt` – time period, accurate to the hour
- `pickup_month` – month
- `borough` – New York City borough the ride was ordered from (5 boroughs + airport)
- `pickups` – number of trips in the period (hour)
- `hday` – whether the day is a holiday/weekend; Y – yes, N – no
- `spd` – wind speed in miles per hour
- `vsb` – visibility
- `temp` – temperature in degrees Fahrenheit
- `dewp` – dew point in degrees Fahrenheit
- `slp` – pressure
- `pcp_01` – precipitation over 1 hour
- `pcp_06` – precipitation over 6 hours
- `pcp_24` – precipitation over 24 hours
- `sd` – snow depth in inches

In [5]:
# Preview the first rows of the dataset
taxi.head()

Unnamed: 0,pickup_dt,pickup_month,borough,pickups,hday,spd,vsb,temp,dewp,slp,pcp 01,pcp 06,pcp 24,sd
0,2015-01-01 01:00:00,Jan,Bronx,152,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0
1,2015-01-01 01:00:00,Jan,Brooklyn,1519,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0
2,2015-01-01 01:00:00,Jan,EWR,0,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0
3,2015-01-01 01:00:00,Jan,Manhattan,5258,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0
4,2015-01-01 01:00:00,Jan,Queens,405,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
taxi.describe()

Unnamed: 0,pickups,spd,vsb,temp,dewp,slp,pcp 01,pcp 06,pcp 24,sd
count,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0,29101.0
mean,490.215903,5.984924,8.818125,47.669042,30.823065,1017.817938,0.00383,0.026129,0.090464,2.529169
std,995.649536,3.699007,2.442897,19.814969,21.283444,7.768796,0.018933,0.093125,0.219402,4.520325
min,0.0,0.0,0.0,2.0,-16.0,991.4,0.0,0.0,0.0,0.0
25%,1.0,3.0,9.1,32.0,14.0,1012.5,0.0,0.0,0.0,0.0
50%,54.0,6.0,10.0,46.0,30.0,1018.2,0.0,0.0,0.0,0.0
75%,449.0,8.0,10.0,64.5,50.0,1022.9,0.0,0.0,0.05,2.958333
max,7883.0,21.0,10.0,89.0,73.0,1043.4,0.28,1.24,2.1,19.0


In [7]:
# Dataset size as (rows, columns)
taxi.shape

(29101, 14)

In [8]:
# Data type of each column as inferred by read_csv
taxi.dtypes

pickup_dt        object
pickup_month     object
borough          object
pickups           int64
hday             object
spd             float64
vsb             float64
temp            float64
dewp            float64
slp             float64
pcp 01          float64
pcp 06          float64
pcp 24          float64
sd              float64
dtype: object

### Task 1.
Check how many rows and columns there are in the dataset.

In [9]:
# Task 1 answer: (number of rows, number of columns)
taxi.shape

(29101, 14)

### Task 2.
Let's look at the column types. Were all of them read correctly? As the answer, select the type that prevails in the dataset.

In [10]:
# Inspect the type of every column
taxi.dtypes

pickup_dt        object
pickup_month     object
borough          object
pickups           int64
hday             object
spd             float64
vsb             float64
temp            float64
dewp            float64
slp             float64
pcp 01          float64
pcp 06          float64
pcp 24          float64
sd              float64
dtype: object

In [11]:
# Count how many columns have each dtype; the first row of the result is the
# prevailing type
taxi.dtypes.value_counts()

float64    9
object     4
int64      1
dtype: int64

### Task 3.
The column names pcp 01, pcp 06, and pcp 24 contain spaces. This is inconvenient if you later want to access the columns with dot notation instead of quotation marks and square brackets.

Replace spaces in the names with an underscore.

The data is saved to the taxi variable. The data must be changed in the same dataframe.

In [12]:
# Swap spaces for underscores in every column label, assigning the new labels
# back onto the same dataframe
taxi.columns = [label.replace(' ', '_') for label in taxi.columns]

In [14]:
# Confirm the column labels after the rename
taxi.columns

Index(['pickup_dt', 'pickup_month', 'borough', 'pickups', 'hday', 'spd', 'vsb',
       'temp', 'dewp', 'slp', 'pcp_01', 'pcp_06', 'pcp_24', 'sd'],
      dtype='object')

### Task 4.

You were asked to find out how many entries (rows) in the dataframe relate to the Manhattan area. One way to answer this question is to use the following command:

taxi.query("borough == 'Manhattan'").shape[0]

But what if we want to see how many times each of the boroughs occurs? Is it really necessary to repeat a similar construction for each level of the variable? This is where the value_counts() method comes to the rescue; you can read more about it in the course notes!

How many times does the Brooklyn neighborhood occur in the data?

In [16]:
# Number of rows whose borough is Manhattan.
# NOTE(review): the task text asks about Brooklyn; this cell counts Manhattan
# rows - confirm which borough is intended.
len(taxi.query("borough == 'Manhattan'"))

4343

### Task 5.
Next task: find out from which area the largest number of trips were made during the entire period. To begin with, count the total number of trips (pickups), without grouping.

In [15]:
# Total number of trips over the whole period, without any grouping
taxi['pickups'].sum()

14265773

### Task 6.
And now group the data by borough and indicate from which borough the most trips were made.

In [27]:
# Total pickups per borough; idxmax() returns the index label (the borough
# name) that holds the largest total
(
    taxi
    .groupby('borough')['pickups']
    .sum()
    .idxmax()
)

'Manhattan'

### Task 7.
Some more useful pandas methods! Perhaps in the previous step you just looked at the data or sorted the values. To speed this up in the future, use the idxmin() and idxmax() methods, which return the index of the minimum or maximum value.

- idxmin – index of the minimum value
- idxmax – index of the maximum value

Save the name of the area with the fewest trips to the min_pickups variable by applying the appropriate method.

In [28]:
# Sum pickups per borough, keeping the borough names as the index, so that
# idxmin() returns the borough name itself.
# The earlier version aggregated with as_index=False, took the integer label
# of the smallest total in the aggregated frame, and used it to look up a row
# of the original `taxi` frame. Those two indexes are unrelated, so the result
# was correct only when the raw row at that label happened to hold the same
# borough.
min_pickups = (
    taxi
    .groupby('borough')['pickups']
    .sum()
    .idxmin()
)

min_pickups

'EWR'

### Task 8.
Let's continue studying the data and look at the number of trips on weekends. Group the data by two criteria: the area of the city and whether the day is a weekend (borough and hday columns). Compare the average number of trips, and select areas from which more orders are received on average on holidays than on normal days.

In [32]:
# Mean pickups for every (borough, hday) pair, reshaped so that the hday
# values N and Y become columns; keep only the boroughs whose holiday mean
# exceeds the regular-day mean.
(
    taxi
    .groupby(['borough', 'hday'], as_index=False)
    .agg(avg_pickups=('pickups', 'mean'))
    .pivot(index='borough', columns='hday', values='avg_pickups')
    .query('Y > N')
)

hday,N,Y
borough,Unnamed: 1_level_1,Unnamed: 2_level_1
EWR,0.023467,0.041916
Queens,308.899904,320.730539


### Task 9.
For each borough, count the number of trips by month. Sort the resulting values in descending order and save the resulting dataframe in pickups_by_mon_bor.

Please note that the final dataset should consist of 3 columns - pickup_month, borough, pickups.

In [37]:
# Total pickups for every (borough, month) pair as a flat three-column frame,
# ordered from the busiest pair to the quietest
pickups_by_mon_bor = (
    taxi
    .groupby(['borough', 'pickup_month'], as_index=False)['pickups']
    .sum()
    .sort_values(by='pickups', ascending=False)
)

pickups_by_mon_bor.head()

Unnamed: 0,borough,pickup_month,pickups
21,Manhattan,Jun,1995388
23,Manhattan,May,1888800
19,Manhattan,Feb,1718571
22,Manhattan,Mar,1661261
18,Manhattan,Apr,1648278


### Task 10.
Now a more difficult task! Let's practice writing our own functions and applying them to a dataframe. Since the trip data comes from New York, the temperature is given in degrees Fahrenheit.

Write the temp_to_celsius function, which receives a column with a temperature in °F as input, and returns values converted to degrees Celsius.

In [33]:
# Add a Celsius version of the Fahrenheit `temp` column, then preview the frame
taxi['temp_C'] = taxi['temp'].pipe(temp_to_celcius)
taxi.head()

Unnamed: 0,pickup_dt,pickup_month,borough,pickups,hday,spd,vsb,temp,dewp,slp,pcp_01,pcp_06,pcp_24,sd,temp_C
0,2015-01-01 01:00:00,Jan,Bronx,152,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,-1.111111
1,2015-01-01 01:00:00,Jan,Brooklyn,1519,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,-1.111111
2,2015-01-01 01:00:00,Jan,EWR,0,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,-1.111111
3,2015-01-01 01:00:00,Jan,Manhattan,5258,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,-1.111111
4,2015-01-01 01:00:00,Jan,Queens,405,Y,5.0,10.0,30.0,7.0,1023.5,0.0,0.0,0.0,0.0,-1.111111
