# Formatting to CSV

```python
# xarray used for reformatting netCDF to CSV
import xarray as xr

# Open the netCDF file with xarray; the with-block closes the underlying
# file handle as soon as the CSV has been written
with xr.open_dataset('~/Desktop/cs_project/TRMM_2.5x2.5.nc') as x_ds:
    # Flatten the PREC (precipitation) variable into a DataFrame and
    # write it out as a CSV file
    x_ds.PREC.to_dataframe().to_csv('~/Desktop/cs_project/prec.csv')
```

# Data Analysis using Pandas

In [8]:
import pandas as pd

# Load the precipitation CSV into a pandas DataFrame
weather = pd.read_csv('~/Desktop/cs_project/prec.csv')

# Report the dimensions (rows, columns) of the loaded data
print(f'Date shape: {weather.shape!s}')

# Report the total number of cells in the DataFrame
print(f'Size of DataFrame: {weather.size!s}')

# Show the first TIME value together with its type; note that it is
# stored as a float rather than a date
first_time = weather.TIME[0]
print(f'TIME attribute information: {first_time!s} {type(first_time)!s}')

Date shape: (83296512, 4)
Size of DataFrame: 333186048
TIME attribute information: 19980101.0 <class 'numpy.float64'>


In [15]:
# Preview the DataFrame: head() returns the first 5 rows by default
first_rows = weather.head()
print(first_rows)

         TIME    LAT    LON       PREC
0  19980101.0 -88.75   1.25   9.262500
1  19980101.0 -88.75   3.75  14.422500
2  19980101.0 -88.75   6.25  11.647500
3  19980101.0 -88.75   8.75  14.722500
4  19980101.0 -88.75  11.25   7.664999


In [6]:
# Filter to include only data within LON [-160, 160] and LAT [-40, 40]
# (bounds inclusive), using label-based indexing (loc) with a combined
# boolean mask.
# NOTE(review): the printed rows show only positive LON values; if LON is
# stored on a 0-360 grid, the -160 lower bound never excludes anything
# -- confirm the intended longitude range.
lon_in_range = (weather.LON >= -160) & (weather.LON <= 160)
lat_in_range = (weather.LAT >= -40) & (weather.LAT <= 40)

# .copy() makes filtered_weather an independent DataFrame, so assigning to
# one of its columns later does not trigger SettingWithCopyWarning
filtered_weather = weather.loc[lon_in_range & lat_in_range].copy()

# print first 5 rows of the filtered_weather DataFrame
print(filtered_weather.head())

            TIME    LAT    LON  PREC
2880  19980101.0 -38.75   1.25   0.0
2881  19980101.0 -38.75   3.75   0.0
2882  19980101.0 -38.75   6.25   0.0
2883  19980101.0 -38.75   8.75   0.0
2884  19980101.0 -38.75  11.25   0.0


In [7]:
# Total number of cells (rows x columns) left after filtering
filtered_cell_count = filtered_weather.size
print(filtered_cell_count)

65814528


In [18]:
from datetime import datetime as dt

# Change data value of TIME from float (e.g. 19980101.0) to datetime.date
# so it looks pretty. The float is first reduced to its YYYYMMDD digits,
# then parsed by pandas' vectorized parser instead of calling strptime
# once per row in Python.
time_as_text = filtered_weather['TIME'].astype('int64').astype(str)
date_column = pd.to_datetime(time_as_text, format='%Y%m%d').dt.date

# assign() returns a new DataFrame with TIME replaced, rather than writing
# into a frame that was produced by filtering (which can raise
# SettingWithCopyWarning)
filtered_weather = filtered_weather.assign(TIME=date_column)

# check out the new DataFrame
print(filtered_weather.head())

            TIME    LAT    LON  PREC
2880  1998-01-01 -38.75   1.25   0.0
2881  1998-01-01 -38.75   3.75   0.0
2882  1998-01-01 -38.75   6.25   0.0
2883  1998-01-01 -38.75   8.75   0.0
2884  1998-01-01 -38.75  11.25   0.0


In [19]:
# Save the filtered_weather DataFrame to its own CSV file
filtered_csv_path = '~/Desktop/cs_project/filtered_weather.csv'
filtered_weather.to_csv(filtered_csv_path)

In [23]:
# Select only the rows whose precipitation is strictly greater than 0.00
has_precipitation = filtered_weather.PREC > 0.00
wet_rows = filtered_weather.loc[has_precipitation]
print(wet_rows)

                TIME    LAT     LON       PREC
2893      1998-01-01 -38.75   33.75   6.367500
2894      1998-01-01 -38.75   36.25  10.672501
2895      1998-01-01 -38.75   38.75   4.387500
2896      1998-01-01 -38.75   41.25  13.710000
2897      1998-01-01 -38.75   43.75   9.839999
...              ...    ...     ...        ...
83293542  2019-12-30  38.75  136.25   5.910000
83293543  2019-12-30  38.75  138.75   6.681515
83293545  2019-12-30  38.75  143.75   4.642499
83293546  2019-12-30  38.75  146.25   1.237500
83293547  2019-12-30  38.75  148.75   5.070000

[7294559 rows x 4 columns]
