# Data Cleaning

The goals of this notebook are as follows:

 - The data is provided as an .xlsx file.  Convert it to a .csv file.
 - Drop meaningless rows, and set up headers properly.
 - Deal with null values
 
Data: /data/combined_temparature_and_humidity_for_the_year.xlsx

In [1]:
import pandas as pd

In [2]:
# Read the raw workbook with default options; the real header row is
# sorted out in the cells that follow.
excel_path = '../data/combined_temparature_and_humidity_for_the_year.xlsx'
df = pd.read_excel(excel_path)

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,
2,Time,Day,Month,Year,TC Open,TC Top,TC 5m,TC 10m,TC 20m,H% Open,H% Top,H% 5m,H% 10m,H% 20m
3,,,,,,,,,,,,,,
4,14:35:01,15,7,2016,28.655,29.116,29.132,26.084,,28.522,27.828,45.301,42.496,


## Set headers

The headers are in row 2.  Set that row as the header, and drop the NaN rows.

In [4]:
# Rows labelled 0 and 1 hold only NaN in the preview above, so remove them.
df = df.drop(index=[0, 1])
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
2,Time,Day,Month,Year,TC Open,TC Top,TC 5m,TC 10m,TC 20m,H% Open,H% Top,H% 5m,H% 10m,H% 20m
3,,,,,,,,,,,,,,
4,14:35:01,15,7,2016,28.655,29.116,29.132,26.084,,28.522,27.828,45.301,42.496,
5,15:05:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.121,27.816,26.514,
6,15:35:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.8,28.481,27.198,


In [5]:
# Row label 2 holds the real column names. Assigning the row (a Series)
# also carries its label, 2, over as the name of the columns index.
header_row = df.loc[2]
df.columns = header_row
df.head()

2,Time,Day,Month,Year,TC Open,TC Top,TC 5m,TC 10m,TC 20m,H% Open,H% Top,H% 5m,H% 10m,H% 20m
2,Time,Day,Month,Year,TC Open,TC Top,TC 5m,TC 10m,TC 20m,H% Open,H% Top,H% 5m,H% 10m,H% 20m
3,,,,,,,,,,,,,,
4,14:35:01,15,7,2016,28.655,29.116,29.132,26.084,,28.522,27.828,45.301,42.496,
5,15:05:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.121,27.816,26.514,
6,15:35:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.8,28.481,27.198,


In [6]:
# Row 2 (the header text, now duplicated in the column labels) and row 3
# (all NaN) are no longer needed.
df = df.drop(index=[2, 3])
# reset_index() keeps the old row labels as a new "index" column.
df = df.reset_index()
df.head()

2,index,Time,Day,Month,Year,TC Open,TC Top,TC 5m,TC 10m,TC 20m,H% Open,H% Top,H% 5m,H% 10m,H% 20m
0,4,14:35:01,15,7,2016,28.655,29.116,29.132,26.084,,28.522,27.828,45.301,42.496,
1,5,15:05:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.121,27.816,26.514,
2,6,15:35:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.8,28.481,27.198,
3,7,16:05:01,15,7,2016,24.656,24.117,25.133,25.084,,27.145,27.828,27.148,27.88,
4,8,16:35:01,15,7,2016,25.656,25.117,26.133,25.584,,27.834,28.501,29.146,27.88,


## Get the index right

There are two index columns.  Set the column "index" as the actual index, and drop the other one.

In [7]:
# Promote the preserved original row labels (the "index" column) to be the
# frame's index; set_index removes the column from the data by default.
temp_df = df.set_index("index")
temp_df.head()

2,Time,Day,Month,Year,TC Open,TC Top,TC 5m,TC 10m,TC 20m,H% Open,H% Top,H% 5m,H% 10m,H% 20m
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
4,14:35:01,15,7,2016,28.655,29.116,29.132,26.084,,28.522,27.828,45.301,42.496,
5,15:05:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.121,27.816,26.514,
6,15:35:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.8,28.481,27.198,
7,16:05:01,15,7,2016,24.656,24.117,25.133,25.084,,27.145,27.828,27.148,27.88,
8,16:35:01,15,7,2016,25.656,25.117,26.133,25.584,,27.834,28.501,29.146,27.88,


In [8]:
# Inspect the column labels and the name attached to the columns index.
temp_df.columns

Index(['Time', 'Day', 'Month', 'Year', 'TC Open', 'TC Top', 'TC 5m', 'TC 10m',
       'TC 20m', 'H% Open', 'H% Top', 'H% 5m', 'H% 10m', 'H% 20m'],
      dtype='object', name=2)

In [9]:
# Re-assign the column labels from a plain list. Unlike the Series used
# earlier, a list carries no name, so the stray name (2) on the columns
# index is dropped. Labels are grouped as: timestamp parts, temperature
# readings (TC), humidity readings (H%).
temp_df.columns = [
    'Time', 'Day', 'Month', 'Year',
    'TC Open', 'TC Top', 'TC 5m', 'TC 10m', 'TC 20m',
    'H% Open', 'H% Top', 'H% 5m', 'H% 10m', 'H% 20m',
]

In [10]:
# Preview the first five rows after the column labels were re-assigned.
temp_df.head(n=5)

Unnamed: 0_level_0,Time,Day,Month,Year,TC Open,TC Top,TC 5m,TC 10m,TC 20m,H% Open,H% Top,H% 5m,H% 10m,H% 20m
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
4,14:35:01,15,7,2016,28.655,29.116,29.132,26.084,,28.522,27.828,45.301,42.496,
5,15:05:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.121,27.816,26.514,
6,15:35:01,15,7,2016,24.656,24.117,24.633,24.584,,27.145,25.8,28.481,27.198,
7,16:05:01,15,7,2016,24.656,24.117,25.133,25.084,,27.145,27.828,27.148,27.88,
8,16:35:01,15,7,2016,25.656,25.117,26.133,25.584,,27.834,28.501,29.146,27.88,
