## Working with CSV files and dictionaries as dataframes

# working with the NY City Weather dataset 

In [4]:
# import Pandas module
import pandas as pd
# Load the NYC weather CSV into a dataframe and preview its first rows.
# NOTE(review): this is an absolute path on one specific machine, so the cell
# only runs where that exact file exists.
csv_path = r'C:\Users\Gustavocolmenares\Documents\SchoolStaff\GIS_PCC\Courses_Training\Geo_python_Course\py\pandas\1_intro\nyc_weather.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,EST,Temperature,DewPoint,Humidity,Sea Level PressureIn,VisibilityMiles,WindSpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
0,1/1/2016,38,23,52,30.03,10,8.0,0,5,,281
1,1/2/2016,36,18,46,30.02,10,7.0,0,3,,275
2,1/3/2016,40,21,47,29.86,10,8.0,0,1,,277
3,1/4/2016,25,9,44,30.05,10,9.0,0,3,,345
4,1/5/2016,20,-3,41,30.57,10,5.0,0,0,,333


In [87]:
# Use max() to retrieve the highest value in the Temperature column
df.Temperature.max()

50

In [88]:
# Report the dates on which the 'Events' column is exactly 'Rain'.
# A single .loc[<row mask>, <column>] lookup replaces the chained
# df[col][mask] form and returns the same 'EST' Series.
df.loc[df['Events'] == 'Rain', 'EST']


8      1/9/2016
9     1/10/2016
15    1/16/2016
26    1/27/2016
Name: EST, dtype: object

Note:
This dataset is not clean: it contains cells with 'NaN' values. For this reason, calculations that involve those columns may not be correct, so it is necessary to clean the data first. This process is called <b>[Data Munging or Data Wrangling]</b>.



In [90]:
# Average wind speed computed on the data before any cleaning.
# mean() skips NaN cells by default, so only the days that have a reading
# are averaged here.
df.WindSpeedMPH.mean()

6.892857142857143

In [91]:
# The previous result skipped the 'NaN' cells, because mean() ignores them by default.
# One way to clean the data is to fill every 'NaN' cell with 0. Those zeros are
# then counted in the average and pull it down (compare with the previous
# output), so 0 is only a sensible fill value when a missing reading really
# means "no wind".
# fillna() returns a new dataframe; binding it to its own name leaves the
# loaded data in `df` untouched, so this cell can be re-run safely.
df_filled = df.fillna(0)
df_filled['WindSpeedMPH'].mean()

6.225806451612903

In [3]:
# A dataframe can also be built from a dictionary: each key becomes a column
# name and each list holds that column's values, one entry per row.
weather_data = dict(
    day=['1/1/2017', '1/2/2017', '1/3/2017', '1/4/2017', '1/5/2017', '1/6/2017'],
    temperature=[32, 25, 28, 24, 35, 31],
    windspeed=[6, 7, 2, 7, 4, 2],
    event=['Rain', 'Sunny', 'Snow', 'Snow', 'Rain', 'Suuny'],
)
df = pd.DataFrame(data=weather_data)
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,25,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,35,4,Rain
5,1/6/2017,31,2,Suuny


In [128]:
# df.shape holds the dimensions of the dataframe as (rows, columns).
# Only the last expression of a cell is displayed, so this line shows nothing here.
df.shape
# Keep each dimension in its own variable.
rows, columns = df.shape[0], df.shape[1]

In [94]:
# Display the two values unpacked from df.shape as a (rows, columns) tuple
(rows, columns)

(6, 4)

In [95]:
# head() returns only the first few records of the dataframe.
# With no argument it returns the first 5; the count is spelled out here.
df.head(n=5)

Unnamed: 0,day,temperature,windspeed,event
0,1/1207,32,6,Rain
1,1/2/2017,25,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,35,4,Rain


In [96]:
# tail() returns the last records of the dataframe: 5 by default, or as many
# as the number passed in, tail(<number>).
# Only the last expression of a cell is displayed, so this first call shows nothing.
df.tail()
# Return the last 2 records of the dataframe.
df.tail(n=2)

Unnamed: 0,day,temperature,windspeed,event
4,1/5/2017,35,4,Rain
5,1/06/2017,31,2,Suuny


In [97]:
"""
* to print selected records using index value.
* to print everything you can do df[:] or just df
"""

df[2:5] # [<start>:<until-but-not-include>]

Unnamed: 0,day,temperature,windspeed,event
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,35,4,Rain


In [98]:
# To list the column names of the dataframe, use the .columns attribute:
df.columns

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')

In [99]:
# To display the content of a single column, access it as an attribute
df.day # or df['day'], similar to looking up a key in a dictionary

0       1/1207
1     1/2/2017
2     1/3/2017
3     1/4/2017
4     1/5/2017
5    1/06/2017
Name: day, dtype: object

In [100]:
# Display the content of one column selected by its name (the result is a Series)
df['event']

0     Rain
1    Sunny
2     Snow
3     Snow
4     Rain
5    Suuny
Name: event, dtype: object

In [101]:
# Display several columns at once: pass a list of column names; the result
# keeps the order given in the list.
selected_columns = ['event', 'day', 'temperature']
df[selected_columns]

Unnamed: 0,event,day,temperature
0,Rain,1/1207,32
1,Sunny,1/2/2017,25
2,Snow,1/3/2017,28
3,Snow,1/4/2017,24
4,Rain,1/5/2017,35
5,Suuny,1/06/2017,31


In [102]:
# Basic statistics of the temperature column, printed one per line
temperatures = df['temperature']
print(temperatures.max())   # maximum temperature
print(temperatures.min())   # minimum temperature
print(temperatures.mean())  # average temperature
print(temperatures.std())   # standard deviation

35
24
29.166666666666668
4.262237284181474


In [103]:
# describe() returns summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the dataframe.
df.describe() # will return the statistics of the data

Unnamed: 0,temperature,windspeed
count,6.0,6.0
mean,29.166667,4.666667
std,4.262237,2.33809
min,24.0,2.0
25%,25.75,2.5
50%,29.5,5.0
75%,31.75,6.75
max,35.0,7.0


In [104]:
# Select rows based on a condition, much like a WHERE clause in SQL: the
# comparison builds a True/False mask and only the True rows are kept.
df[df['temperature'] >= 32]

Unnamed: 0,day,temperature,windspeed,event
0,1/1207,32,6,Rain
4,1/5/2017,35,4,Rain


In [105]:
# Display the rows where the temperature equals the maximum temperature
df[df.temperature.eq(df.temperature.max())]

Unnamed: 0,day,temperature,windspeed,event
4,1/5/2017,35,4,Rain


In [106]:
# If a column name contains spaces, attribute access (df.name) is not possible;
# use the bracket syntax df['<column name>'] instead, as shown here.
df[df['temperature'] == df['temperature'].max()]

Unnamed: 0,day,temperature,windspeed,event
4,1/5/2017,35,4,Rain


In [107]:
# Display only a specific column for the rows that meet a condition.
# .loc[<row mask>, <list of columns>] selects rows and columns in one step
# instead of chaining two selections; passing the column as a list keeps the
# result a dataframe.
df.loc[df['temperature'] == df['temperature'].max(), ['day']]

Unnamed: 0,day
4,1/5/2017


In [108]:
# The same selection works with several columns: list every column to keep.
# .loc[<row mask>, <list of columns>] selects rows and columns in one step
# instead of chaining two selections.
df.loc[df['temperature'] == df['temperature'].max(), ['day', 'temperature']]

Unnamed: 0,day,temperature
4,1/5/2017,35


In [109]:
""" To find more operation, just google 'Pandas operations' and you will find more operations 

- https://pandas.pydata.org/pandas-docs/stable/10min.html 
- https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.Series.html
"""

" To find more operation, just google 'Pandas operations' and you will find more operations \n\n- https://pandas.pydata.org/pandas-docs/stable/10min.html \n- https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.Series.html\n"

In [111]:
# How to set up a column as your index column.
# First, use .index to inspect the current index of the dataframe.
df.index # displays RangeIndex(start=<first value>, stop=<last value, not included>, step=<increment>)

RangeIndex(start=0, stop=6, step=1)

In [117]:
# Change the index column.
# set_index() does not change the dataframe it is called on; it returns a new
# dataframe, kept here under its own name.
df_2 = df.set_index(keys='day')
df_2

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1207,32,6,Rain
1/2/2017,25,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,35,4,Rain
1/06/2017,31,2,Suuny


In [118]:
# To change the dataframe you are working on directly, instead of getting a new
# one back, pass the inplace=True parameter.
# NOTE(review): once this cell has run, 'day' is the index and no longer a
# column, so running the cell a second time presumably fails — confirm before re-running.
df.set_index('day', inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1207,32,6,Rain
1/2/2017,25,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,35,4,Rain
1/06/2017,31,2,Suuny


In [122]:
# Now that 'day' is the index, a row can be looked up by its date label with .loc
df.loc['1/2/2017']

temperature       25
windspeed          7
event          Sunny
Name: 1/2/2017, dtype: object

In [126]:
# Resetting the index of your dataframe: the dataframe is modified directly
# (inplace=True) and gets a default integer index again.
# NOTE(review): the saved output below shows an extra 'index' column, which
# suggests the cell was executed on a frame whose index had already been reset —
# confirm with a fresh Restart & Run All.
df.reset_index(inplace=True)
df

Unnamed: 0,index,day,temperature,windspeed,event
0,0,1/1207,32,6,Rain
1,1,1/2/2017,25,7,Sunny
2,2,1/3/2017,28,2,Snow
3,3,1/4/2017,24,7,Snow
4,4,1/5/2017,35,4,Rain
5,5,1/06/2017,31,2,Suuny
