In [1]:
import pandas as pd

# Create Canned data
Canned data is hard coded within the program

Create a key:value collection of series to use to populate the dataframe for testing

In [2]:
# Canned monthly rainfall figures: one Series per column of the future DataFrame
data = dict(
    Month=pd.Series([
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ]),
    Rainfall=pd.Series([
        1.65, 1.25, 1.94, 2.75, 3.14, 3.65,
        5.05, 1.50, 1.33, 0.07, 0.50, 2.30,
    ]),
)

In [3]:
# Turn the dictionary of Series into a DataFrame (one column per key)
df = pd.DataFrame(data=data)
# Display (rows, columns) as the cell output
df.shape

(12, 2)

In [4]:
# Dump the whole frame as plain text, followed by a blank line
print("Our data frame:")
print(str(df) + " \n")

Our data frame:
        Month  Rainfall
0     January      1.65
1    February      1.25
2       March      1.94
3       April      2.75
4         May      3.14
5        June      3.65
6        July      5.05
7      August      1.50
8   September      1.33
9     October      0.07
10   November      0.50
11   December      2.30 



A pandas Series is a one-dimensional labeled array.

Pandas dataframe is a two-dimensional array, like a spreadsheet.
Our df consists of 2 columns, each of which is a Series (Month and Rainfall).

In [5]:
# Extra step to make my own df.
# October has no readings. They are marked with NaN rather than a " " string so
# that the Rainfall and Temperature Series stay float64 instead of falling back
# to object dtype (which would break numeric operations on those columns).
data = {'Month': pd.Series(['January', 'February', 'March', 'April', 'May',
                           'June', 'July', 'August', 'September', 'October','November', 'December']),
       'Rainfall': pd.Series([1.65, 1.25, 1.94, 2.75, 2.75, 3.645,
                              5.5, 1, 1.3, float('nan'), 0.50, 2.3]),
        'Temperature': pd.Series([3, 10, 15, 20, 25, 24, 30, 1, 33, float('nan'), 32, 2.3])
       }

In [6]:
# Build the DataFrame from the dictionary of Series
df = pd.DataFrame(data)

# Save as rainfall.csv. The path is relative, so the file lands in the current
# working directory. index=False leaves the row index out of the file.
df.to_csv('rainfall.csv', index=False)

# Read the file back to check the round trip, and display the result
dfc = pd.read_csv('rainfall.csv')
dfc

Unnamed: 0,Month,Rainfall,Temperature
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
9,October,,


In [7]:
# Load the table from a JSON file.
# NOTE(review): no cell visible here writes data.json, so it has to exist in
# the working directory already for this cell to run.
dfj = pd.read_json('data.json')

print("Our data frame:")
print(str(dfj) + " \n")

Our data frame:
        Month  Rainfall  Temperature
0     January     1.650          3.0
1    February     1.250         10.0
10   November     0.500         32.0
11   December     2.300          2.3
2       March     1.940         15.0
3       April     2.750         20.0
4         May     2.750         25.0
5        June     3.645         24.0
6        July     5.500         30.0
7      August     1.000          1.0
8   September     1.300         33.0
9     October       NaN          NaN 



# Cleaning Data:

One of the most important tasks in processing data.

Data needs to be consistent to be reliably analyzed.

Cleaning involves parsing the data and detecting 'bad' or missing values.

In [8]:
# The October readings are missing in the source.
# First option: replace every NaN with 0 so later calculations do not break.
df_zeros = dfj.fillna(value=0)
print("Our data with zerod values: ")
print(str(df_zeros))
# Caveat: the artificial zeros can skew the statistics, so invalid rows should
# be removed instead; df_zeros is not used later on.

Our data with zerod values: 
        Month  Rainfall  Temperature
0     January     1.650          3.0
1    February     1.250         10.0
10   November     0.500         32.0
11   December     2.300          2.3
2       March     1.940         15.0
3       April     2.750         20.0
4         May     2.750         25.0
5        June     3.645         24.0
6        July     5.500         30.0
7      August     1.000          1.0
8   September     1.300         33.0
9     October     0.000          0.0


In [9]:
# Second option: drop every row that has at least one missing value
df_clean = dfj.dropna(axis=0, how='any')
print("Our data with dropped values: \n " + str(df_clean))

Our data with dropped values: 
         Month  Rainfall  Temperature
0     January     1.650          3.0
1    February     1.250         10.0
10   November     0.500         32.0
11   December     2.300          2.3
2       March     1.940         15.0
3       April     2.750         20.0
4         May     2.750         25.0
5        June     3.645         24.0
6        July     5.500         30.0
7      August     1.000          1.0
8   September     1.300         33.0


In [10]:
# Count the rows of dfj that contain at least one NaN.
# isnull() flags every cell, any(axis=1) collapses the flags to one boolean per
# row, and sum() counts the rows that were flagged.
count = int(dfj.isnull().any(axis=1).sum())

print("\n Number of rows with Nans: " + str(count))


 Number of rows with Nans: 1


In [11]:
# Same check on the cleaned frame: count the rows of df_clean that still
# contain at least one NaN (expected to be none after dropna()).
# isnull() flags every cell, any(axis=1) collapses the flags to one boolean per
# row, and sum() counts the rows that were flagged.
count = int(df_clean.isnull().any(axis=1).sum())

print("\n Number of rows with Nans: " + str(count))


 Number of rows with Nans: 0


In [12]:
# Order the rows by ascending index label, then display the frame
df_clean = df_clean.sort_index(axis=0, ascending=True)
df_clean

Unnamed: 0,Month,Rainfall,Temperature
0,January,1.65,3.0
1,February,1.25,10.0
2,March,1.94,15.0
3,April,2.75,20.0
4,May,2.75,25.0
5,June,3.645,24.0
6,July,5.5,30.0
7,August,1.0,1.0
8,September,1.3,33.0
10,November,0.5,32.0


# Statistical Analysis
Mean = the average of a set of numbers.

Median = The middle value in a sorted set of numbers.

Standard deviation = How much each value differs from the mean. Can be used to detect outliers.

Mode = The most common value in a list of data.

Pandas performs these calculations easily!

In [13]:
# df_clean also holds the text column Month. numeric_only=True limits each
# statistic to the numeric columns; without it, pandas 2.0 and later raise a
# TypeError when they try to average the month names.
print("Mean: ")
print(df_clean.mean(numeric_only=True))

print("              ")
print("Median : ")
print(df_clean.median(numeric_only=True))

print("\n", "Standard Deviation: ")
print(df_clean.std(numeric_only=True))

# mode() lists every most-frequent value of each column, so the result is only
# informative for Rainfall, where 2.75 is the one repeated value.
# Month and Temperature values are all unique, so each of them is listed and
# the shorter Rainfall column is padded with NaN. Shown for explanation only.
print("\n", "Mode : ")
print(df_clean.mode())

Mean: 
Rainfall        2.235000
Temperature    17.754545
dtype: float64
              
Median : 
Rainfall        1.94
Temperature    20.00
dtype: float64

 Standard Deviation: 
Rainfall        1.413936
Temperature    12.193553
dtype: float64

 Mode : 
        Month  Rainfall  Temperature
0       April      2.75          1.0
1      August       NaN          2.3
2    December       NaN          3.0
3    February       NaN         10.0
4     January       NaN         15.0
5        July       NaN         20.0
6        June       NaN         24.0
7       March       NaN         25.0
8         May       NaN         30.0
9    November       NaN         32.0
10  September       NaN         33.0


In [14]:
# Summary table (count, mean, std, min, quartiles, max) for the numeric columns
df_clean.describe()

Unnamed: 0,Rainfall,Temperature
count,11.0,11.0
mean,2.235,17.754545
std,1.413936,12.193553
min,0.5,1.0
25%,1.275,6.5
50%,1.94,20.0
75%,2.75,27.5
max,5.5,33.0


# Selecting Parts of a Dataframe

### Indexing
Select a single column using its column name (e.g. Temperature). Returns a Series.

Example: df_clean['Temperature']

Select multiple columns using column names. Must specify a list of column names.

Example: df_clean[['Temperature', 'Rainfall']]

### iloc and loc
Select a certain row number using iloc:

Example: print("Third row \n", df_clean.iloc[2])

Select a certain row by its index label using loc (the DataFrame must be indexed by that label first):

Example: print("\n Third row \n", dfIndexed.loc['March']);

In [15]:
# A single column name returns a Series
print("Temperature column: \n " + str(df_clean['Temperature']))
# A list of column names returns a DataFrame with the columns in the requested order
print("\n Temperature and Rainfall column: \n " + str(df_clean[['Temperature', 'Rainfall']]))

# Positional row access: iloc[2] is the third row
print("\n Third row \n " + str(df_clean.iloc[2]))

Temperature column: 
 0      3.0
1     10.0
2     15.0
3     20.0
4     25.0
5     24.0
6     30.0
7      1.0
8     33.0
10    32.0
11     2.3
Name: Temperature, dtype: float64

 Temperature and Rainfall column: 
     Temperature  Rainfall
0           3.0     1.650
1          10.0     1.250
2          15.0     1.940
3          20.0     2.750
4          25.0     2.750
5          24.0     3.645
6          30.0     5.500
7           1.0     1.000
8          33.0     1.300
10         32.0     0.500
11          2.3     2.300

 Third row 
 Month          March
Rainfall        1.94
Temperature       15
Name: 2, dtype: object


In [16]:
# loc selects rows by index label, so label every row with its month name first.
# The Month Series itself is passed as the key (not the column name), and the
# looked-up row still shows Month as a regular field.
index = df_clean.loc[:, 'Month']
dfIndexed = df_clean.set_index(keys=index)

print("\n Third row \n " + str(dfIndexed.loc['March']))


 Third row 
 Month          March
Rainfall        1.94
Temperature       15
Name: March, dtype: object


In [17]:
# Rainfall for the first three rows (selected by position), and the mean of those values
rainfall = df_clean['Rainfall'].iloc[0:3]
print(str(rainfall) + " \n")
print("The mean of rainfall is: \n " + str(rainfall.mean()) + " \n")

0    1.65
1    1.25
2    1.94
Name: Rainfall, dtype: float64 

The mean of rainfall is: 
 1.61333333333 



In [18]:
# Keep only the Temperature and Rainfall columns and print the mean of each
print("\n Just Temperature and rainfall data ")
df_TempRain = df_clean.loc[:, ['Temperature', 'Rainfall']]
print(str(df_TempRain) + " \n")
print("The mean of Temperature and Rainfall is: \n " + str(df_TempRain.mean()) + " \n")


 Just Temperature and rainfall data 
    Temperature  Rainfall
0           3.0     1.650
1          10.0     1.250
2          15.0     1.940
3          20.0     2.750
4          25.0     2.750
5          24.0     3.645
6          30.0     5.500
7           1.0     1.000
8          33.0     1.300
10         32.0     0.500
11          2.3     2.300 

The mean of Temperature and Rainfall is: 
 Temperature    17.754545
Rainfall        2.235000
dtype: float64 

