#### Imports defined

In [None]:
import pandas as pd

#### Reading the csv file and storing it into a DataFrame.

In [None]:
# Load the raw ESB sheet; the relative path is resolved against the current working directory.
df = pd.read_csv('data/esb.csv')

#### Extracting specific year values from the DataFrame.

In [None]:
# Keep every cell of row 0 that is not a float; NaN is a float, so empty cells are skipped.
year_values = list(filter(lambda cell: not isinstance(cell, float), df.loc[0]))

#### The first row in the DataFrame is filled up with NaN values. Clearing it out.

In [None]:
# Drop the row labelled df.index[0]; its non-NaN cells were already captured in year_values.
df = df.drop(df.index[0])

#### Based on the index position of the rows in the DataFrame, slicing it to separate the records for various counties.

In [None]:
# Label-based slice: .loc includes both endpoints, so this keeps index labels 2 through 30.
df_county = df.loc[2:30]

#### Similarly separating the records for the cities.

In [None]:
# Label-based slice: .loc includes both endpoints, so this keeps index labels 32 through 36.
df_city = df.loc[32:36]

#### Function to format a DataFrame. This function aims to clean up a DataFrame. Below are the steps of operation.
1. It begins by changing the index of the DataFrame, replacing it with the values found in column <b>Unnamed: 0</b>.
2. The numeric range for the columns in the DataFrame is calculated. It is used for iterating over the DataFrame.
3. The columns come in multiples of 13 (it being a prime number also helps), one group per year. The first 12 columns of every group (Jan-Dec) are made into a separate DataFrame, which holds the ESB connection values for every month of that year.
4. Two new columns, <b>Council Type</b> and <b>Year</b>, are then inserted into every sliced DataFrame. They record the type of the council (County or City) and the year to which the values correspond.
5. All the smaller DataFrames are then concatenated to form a single DataFrame.
6. Some columns had dirty data: numbers preceded by the <b>#</b> symbol. This function also removes this anomaly.

#### The output of this function is a DataFrame with fewer columns and more rows than the input DataFrame.

<b>Note</b>: As part of this formatting, column <b>Totals</b> and the rows <b>TOTALS</b> and <b>Conversions</b> are removed.

In [None]:
def format_dataframe(df, Type, years=None):
    """Reshape a wide council sheet into one row per council per year.

    The input must hold a council-name column called ``'Unnamed: 0'``
    followed by repeating groups of 13 columns: twelve monthly values
    (Jan-Dec) and one trailing column that is discarded. Each complete
    group becomes a block of rows tagged with ``Type`` and the matching
    year label.

    Args:
        df: Wide DataFrame containing an ``'Unnamed: 0'`` column. It is
            not modified.
        Type: Value written to the ``'Council Type'`` column of every row.
        years: Year labels, one per 13-column group, in column order.
            Defaults to the module-level ``year_values``.

    Returns:
        DataFrame with the columns ``'Council Name'``, ``'Council Type'``,
        ``'Year'`` and the twelve months. Month values are numeric: every
        ``#`` character is stripped and anything that still cannot be
        parsed becomes NaN. Groups beyond ``len(years)`` and trailing
        columns that do not complete a group of 13 are ignored. When no
        group qualifies, an empty DataFrame with those columns is returned.
    """
    col_list = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sept','Oct','Nov','Dec']
    group_width = len(col_list) + 1  # twelve months plus the discarded 13th column

    if years is None:
        years = year_values  # module-level list built from the sheet's first row
    years = list(years)

    df = df.set_index('Unnamed: 0')
    complete_groups = df.shape[1] // group_width

    frames = []
    for group, year in zip(range(complete_groups), years):
        first = group * group_width
        # .copy() so the renames/inserts below never touch (or warn about) a view of df.
        block = df.iloc[:, first:first + len(col_list)].copy()
        block.columns = col_list
        block.insert(0, 'Council Type', Type)
        block.insert(1, 'Year', year)
        frames.append(block)

    if not frames:
        return pd.DataFrame(columns=['Council Name', 'Council Type', 'Year'] + col_list)

    # One concat instead of growing a DataFrame inside the loop.
    new_df = pd.concat(frames)
    new_df.index.name = 'Council Name'
    new_df = new_df.reset_index()

    for month in col_list:
        # Cast to str first: after stacking years a month column can mix strings
        # ('#12') with real numbers, and .str methods return NaN for non-strings,
        # which would silently discard the numeric cells.
        cleaned = new_df[month].astype(str).str.replace('#', '', regex=False)
        new_df[month] = pd.to_numeric(cleaned, errors='coerce')

    return new_df

#### Function to tidy a DataFrame. This function shrinks the columns of the  DataFrame and records each of the separate observations for months (Jan-Dec) under a single column. 


In [None]:
def tidy_up(passed_df):
    """Convert the wide month-per-column layout into one row per month.

    Columns are read by position: the first three are the council name,
    council type and year; each remaining column is matched, in order,
    with a month from Jan to Dec.

    Args:
        passed_df: DataFrame with three identifying columns followed by up
            to twelve monthly value columns. Any index is accepted; rows
            are walked by position, not by label.

    Returns:
        DataFrame with the columns ``'Council Name'``, ``'Council Type'``,
        ``'Year'``, ``'Month'`` and ``'ESB Connection'``, ordered row by
        row and then month by month, with a fresh 0..n-1 index.

    Raises:
        IndexError: If a non-empty ``passed_df`` has fewer than three
            columns or more than twelve value columns.
    """
    out_columns = ['Council Name','Council Type','Year','Month','ESB Connection']
    months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sept','Oct','Nov','Dec']

    width = passed_df.shape[1]
    if len(passed_df) and not 3 <= width <= 3 + len(months):
        raise IndexError(
            'expected 3 identifying columns plus at most %d month columns, got %d columns'
            % (len(months), width)
        )

    # Collect plain tuples and build the frame once; concatenating a one-row
    # DataFrame per cell is quadratic in the number of output rows.
    records = []
    for row in passed_df.itertuples(index=False, name=None):
        name, council_type, year = row[:3]
        for month, value in zip(months, row[3:]):
            records.append((name, council_type, year, month, value))

    return pd.DataFrame(records, columns=out_columns)

#### Formatting the DataFrame that has the records of various cities.

In [None]:
# Reshape the city rows; every resulting row is tagged 'City Council' in the 'Council Type' column.
df_city_formatted = format_dataframe(df_city,'City Council')

#### Formatting the DataFrame that has the records of various counties.

In [None]:
# Reshape the county rows; every resulting row is tagged 'County Council' in the 'Council Type' column.
df_county_formatted = format_dataframe(df_county, 'County Council')

#### Merging the above formatted DataFrames.

In [None]:
# Stack counties on top of cities, order by year and then council type,
# and renumber the rows so the index runs 0..n-1 again.
merged_df = (
    pd.concat([df_county_formatted, df_city_formatted], ignore_index=True)
    .sort_values(['Year', 'Council Type'])
    .reset_index(drop=True)
)

#### Tidying the above merged DataFrame.

In [None]:
# Reshape the merged frame to one row per council, year and month.
tidy_df = tidy_up(merged_df)

#### Saving the tidied DataFrame as a csv file.

In [None]:
# Write the same tidy table to both output locations, without the index column.
for csv_path in (
    'data/esb_tidy.csv',
    'C00246376_Python_Assignment_3_Part_3/data/esb_tidy.csv',
):
    tidy_df.to_csv(csv_path, index=False)