This notebook contains a function to combine data for all cities in a folder.

In [1]:
import pandas as pd
import os
import pathlib

In [2]:
path_data_aqcin = '../data/data (aqcin)'

In [3]:
os.listdir(path_data_aqcin)

['berlin,-germany-air-quality.csv',
 'dresden-bergstraße,-germany-air-quality.csv',
 'frankfurt-schwanheim,-germany-air-quality.csv',
 'hannover,-germany-air-quality.csv',
 'kiel-bahnhofstr.-verk., schleswig-holstein, germany-air-quality.csv',
 'munich-air-quality.csv',
 'potsdam-zentrum,-germany-air-quality.csv',
 'sternschanze,-hamburg, germany-air-quality.csv',
 'stuttgart-bad-cannstatt, germany-air-quality.csv']

In [4]:
def combine_all_data(path_data, ):
    
    """ This function combines all the data of different cities in the given path. """
    
    
    list_dfs = []

    # for loop to iterate over all the files
    for file in os.listdir(path_data):
        try:
            print(file)
            # read the file
            df_temp = pd.read_csv(os.path.join(path_data, file), parse_dates = [0]) # 
            df_temp['city'] = file[:-16]   # get city name from complete file name 
            list_dfs.append(df_temp)
        except:
            print('There is some issue with the files in the provided path. Please make sure that only  \
                   csv files for the data of different cities is present in the folder.')
    
    # concatenate all dataframes
    df = pd.concat(list_dfs)
    
    df.reset_index(inplace = True, drop = True)
    # shift column 'city' to second position 
    city_column = df.pop('city') 
    df.insert(1, 'city', city_column) 
    
    
    return df

In [5]:
df_combined_all_cities = combine_all_data(path_data_aqcin)

berlin,-germany-air-quality.csv
dresden-bergstraße,-germany-air-quality.csv
frankfurt-schwanheim,-germany-air-quality.csv
hannover,-germany-air-quality.csv
kiel-bahnhofstr.-verk., schleswig-holstein, germany-air-quality.csv
munich-air-quality.csv
potsdam-zentrum,-germany-air-quality.csv
sternschanze,-hamburg, germany-air-quality.csv
stuttgart-bad-cannstatt, germany-air-quality.csv


# Data Cleaning

In [6]:
df_combined_all_cities.columns

Index(['date', 'city', ' pm25', ' pm10', ' o3', ' no2', ' co', ' so2'], dtype='object')

In [7]:
df_combined_all_cities.columns = df_combined_all_cities.columns.str.replace(' ', '')

In [8]:
df_combined_all_cities.drop(['o3', 'no2', 'co', 'so2'], axis = 1, inplace = True)

In [9]:
df_combined_all_cities = df_combined_all_cities.sort_values(by = 'date', ascending = True)

In [10]:
df_combined_all_cities.reset_index(drop = True, inplace = True)

In [11]:
df_combined_all_cities

Unnamed: 0,date,city,pm25,pm10
0,2014-08-13,munich,,15
1,2014-08-14,munich,,15
2,2014-08-15,munich,,14
3,2014-08-16,munich,,9
4,2014-08-17,"dresden-bergstraße,-germany",,10
...,...,...,...,...
26813,2023-12-28,"berlin,-germany",40,
26814,2023-12-28,"potsdam-zentrum,-germany",21,
26815,2023-12-28,munich,24,
26816,2023-12-28,"stuttgart-bad-cannstatt, germany",35,


In [12]:
tmp_dir = '../data/data combined (aqcin)'
pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True)


df_combined_all_cities.to_csv(os.path.join(tmp_dir, 'Combined for all german cities.csv'), index = False)