# Proyecto: Extracción de subconjuntos de un dataframe y algunas operaciones matemáticas 

Los datos de esta práctica se pueden descargar desde: https://www.kaggle.com/hmavrodiev/london-bike-sharing-dataset

### Algunas funciones que se usan en este script son:

`.dt.hour` <- Extrae la hora (como número entero) de una variable tipo 'datetime' 

`.iloc[: , N : N+1]` <- Extrae la columna N

`.iloc[r1:rn , c1:cm]` <- Extrae un subconjunto del dataframe

`.iloc[::n]` <- Selecciona valores de una columna (1D) de 'n' en 'n'

`.iloc[::n , :]` <- Selecciona datos de un dataframe (2D); todas las columnas e índices de 'n' en 'n'

`+ - * /` <- Operaciones permitidas entre columnas, ej: 'df[column_1]*df[column_2]'

` columna1 .dot( columna2 )` <- Producto punto (o interno) entre 2 columnas


In [1]:
import pandas as pd

import numpy as np

In [2]:
# Load the London bike-sharing dataset from a path relative to the notebook:
df_lmerged = pd.read_csv('./db/Bikes/london_merged.csv')

Significado de las variables:

"timestamp" - fecha y hora del registro (los datos son horarios)

"cnt" - cantidad de bicicletas compartidas

"t1" - temperatura real en grados centígrados

"t2" - sensación térmica en grados centígrados

"hum" - porcentaje de humedad del aire

"wind_speed" - velocidad del viento en km/h

"weather_code" - categoría del clima

"is_holiday" - campo booleano: 1 día feriado / 0 día no feriado

"is_weekend" - campo booleano: 1 fin de semana (sábado o domingo) / 0 día entre semana

"season" - Temporada actual: 0-primavera ; 1-verano; 2-otoño; 3-invierno

In [3]:
# Preview the first 10 rows of the loaded dataframe:
df_lmerged.head(10)

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
5,2015-01-04 05:00:00,46,2.0,2.0,93.0,4.0,1.0,0.0,1.0,3.0
6,2015-01-04 06:00:00,51,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0
7,2015-01-04 07:00:00,75,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0
8,2015-01-04 08:00:00,131,1.5,-1.0,96.5,8.0,4.0,0.0,1.0,3.0
9,2015-01-04 09:00:00,301,2.0,-0.5,100.0,9.0,3.0,0.0,1.0,3.0


In [4]:
# Summarize the columns: non-null counts and dtypes:
df_lmerged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     17414 non-null  object 
 1   cnt           17414 non-null  int64  
 2   t1            17414 non-null  float64
 3   t2            17414 non-null  float64
 4   hum           17414 non-null  float64
 5   wind_speed    17414 non-null  float64
 6   weather_code  17414 non-null  float64
 7   is_holiday    17414 non-null  float64
 8   is_weekend    17414 non-null  float64
 9   season        17414 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.3+ MB


In [5]:
# The 'timestamp' column holds dates but was read with dtype 'object',
# so we convert it to pandas' 'datetime64' dtype:
df_lmerged['timestamp'] = pd.to_datetime( df_lmerged['timestamp'] ) 

# Re-run info() to confirm the dtype of 'timestamp' after the conversion:
df_lmerged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   timestamp     17414 non-null  datetime64[ns]
 1   cnt           17414 non-null  int64         
 2   t1            17414 non-null  float64       
 3   t2            17414 non-null  float64       
 4   hum           17414 non-null  float64       
 5   wind_speed    17414 non-null  float64       
 6   weather_code  17414 non-null  float64       
 7   is_holiday    17414 non-null  float64       
 8   is_weekend    17414 non-null  float64       
 9   season        17414 non-null  float64       
dtypes: datetime64[ns](1), float64(8), int64(1)
memory usage: 1.3 MB


In [6]:
# Now that 'timestamp' is of type 'datetime', we can extract the hour with .dt.hour
# and store it in a new column named 'hour':

df_lmerged['hour'] = df_lmerged['timestamp'].dt.hour

# Show the first 10 rows to check the new 'hour' column:
df_lmerged.head(10)

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,hour
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,1
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0,3
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0,4
5,2015-01-04 05:00:00,46,2.0,2.0,93.0,4.0,1.0,0.0,1.0,3.0,5
6,2015-01-04 06:00:00,51,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0,6
7,2015-01-04 07:00:00,75,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0,7
8,2015-01-04 08:00:00,131,1.5,-1.0,96.5,8.0,4.0,0.0,1.0,3.0,8
9,2015-01-04 09:00:00,301,2.0,-0.5,100.0,9.0,3.0,0.0,1.0,3.0,9


In [7]:
# .iloc[: , 1:] keeps "all" rows (:) and the columns from position 1 onward (1:),
# i.e. it leaves out the column at position 0 ('timestamp'):

df = df_lmerged.iloc[: , 1:]
df

Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,hour
0,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,0
1,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0,1
2,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2
3,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0,3
4,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0,4
...,...,...,...,...,...,...,...,...,...,...
17409,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0,19
17410,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0,20
17411,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0,21
17412,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0,22


In [8]:
# .iloc[: , N : N+1] extracts the column at position N as a one-column dataframe;
# here N = 3, which is the 'hum' column of df:
df.iloc[: , 3:4]

Unnamed: 0,hum
0,93.0
1,93.0
2,96.5
3,100.0
4,93.0
...,...
17409,81.0
17410,81.0
17411,78.5
17412,76.0


In [9]:
# Square the values of 'wind_speed', add 10, and then apply the sine (np.sin)
# element-wise to the result:
np.sin( df['wind_speed']**2 + 10 )

0        0.901788
1       -0.428183
2       -0.544021
3       -0.544021
4        0.915631
           ...   
17409    0.287932
17410   -0.983582
17411    0.995681
17412   -0.976578
17413   -0.696102
Name: wind_speed, Length: 17414, dtype: float64

In [10]:
# Select values of a single column with a step of 2 (positions 0, 2, 4, ...):
df['t1'].iloc[::2]

0        3.0
2        2.5
4        2.0
6        1.0
8        1.5
        ... 
17404    6.0
17406    6.0
17408    5.0
17410    5.0
17412    5.5
Name: t1, Length: 8707, dtype: float64

In [11]:
# Select values of a single column with a step of 3 (positions 0, 3, 6, ...):
df['t1'].iloc[::3]

0        3.0
3        2.0
6        1.0
9        2.0
12       2.0
        ... 
17400    3.0
17403    6.0
17406    6.0
17409    5.0
17412    5.5
Name: t1, Length: 5805, dtype: float64

In [12]:
# Select every 2nd row (::2) of the dataframe while keeping all columns (:):
df.iloc[::2, :]

Unnamed: 0,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season,hour
0,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0,0
2,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0,2
4,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0,4
6,51,1.0,-1.0,100.0,7.0,4.0,0.0,1.0,3.0,6
8,131,1.5,-1.0,96.5,8.0,4.0,0.0,1.0,3.0,8
...,...,...,...,...,...,...,...,...,...,...
17404,765,6.0,2.0,73.5,22.0,3.0,0.0,0.0,3.0,14
17406,1201,6.0,2.0,71.0,26.0,4.0,0.0,0.0,3.0,16
17408,2220,5.0,1.0,81.0,22.0,2.0,0.0,0.0,3.0,18
17410,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0,20


In [13]:
# Arithmetic between columns (+ - * /) is applied element-wise;
# here, the row-by-row product of 't1' and 't2':
df['t1']*df['t2']

0        6.00
1        7.50
2        6.25
3        4.00
4        0.00
         ... 
17409    5.00
17410    5.00
17411    8.25
17412    8.25
17413    5.00
Length: 17414, dtype: float64

In [14]:
# Dot (inner) product between two columns: the sum of the element-wise
# products, returned as a single number:
df['t1'].dot(df['t2'])

3135730.277777778

In [15]:
# Save the dataframe to a new CSV file; index=False leaves the row index
# out of the file. Note that df no longer has 'timestamp' but does have 'hour'.
df.to_csv('./db/Bikes/london_merged_new.csv',index=False)