In [1]:
# imports
import pandas as pd
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Render floats with a thousands separator and two decimal places
pd.options.display.float_format = '{:,.2f}'.format

In [4]:
# Load ./data/cleaned_data.csv into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="./data/cleaned_data.csv")
df.head(n=5)

Unnamed: 0,Date,Country,City,aqi,co,d,dew,humidity,mepaqi,no2,...,pol,precipitation,pressure,psi,so2,temperature,uvi,wd,wind-gust,wind-speed
0,2014-12-29,AT,Vienna,,0.1,,,,,9.0,...,,,,,2.6,,,,,
1,2014-12-29,AU,Brisbane,,1.9,,,,,4.6,...,,,,,1.1,,,,,
2,2014-12-29,BE,Brussels,,0.1,,,,,23.4,...,,,,,1.6,,,,,
3,2014-12-29,BO,Cochabamba,,,,,,,3.0,...,,,,,,,,,,
4,2014-12-29,BR,São Paulo,,5.5,,,,,17.9,...,,,,,1.1,,,,,


In [5]:
# Show the dtype pandas assigned to each column of df
df.dtypes

Date              object
Country           object
City              object
aqi              float64
co               float64
d                float64
dew              float64
humidity         float64
mepaqi           float64
no2              float64
o3               float64
pm1              float64
pm10             float64
pm25             float64
pol              float64
precipitation    float64
pressure         float64
psi              float64
so2              float64
temperature      float64
uvi              float64
wd               float64
wind-gust        float64
wind-speed       float64
dtype: object

In [6]:
# Derive 'year', 'month' and 'day' columns from 'Date', then drop 'Date'.
# Guarding the whole step (not just the drop) keeps the cell idempotent:
# once 'Date' has been removed, a re-run is a no-op instead of a KeyError
# on df['Date'].
if 'Date' in df.columns:
    # Parse the date strings once and reuse the parsed values for all parts
    date_values = pd.to_datetime(df['Date'])

    # Build a new frame instead of mutating in place; the new columns are
    # appended in the order year, month, day and 'Date' is removed last
    df = (
        df
        .assign(
            year=date_values.dt.year,
            month=date_values.dt.month,
            day=date_values.dt.day,
        )
        .drop(columns=['Date'])
    )



In [7]:
# Display 20 randomly drawn rows of the dataframe; the fixed seed keeps the
# displayed sample identical across kernel restarts
df.sample(20, random_state=42)

Unnamed: 0,Country,City,aqi,co,d,dew,humidity,mepaqi,no2,o3,...,psi,so2,temperature,uvi,wd,wind-gust,wind-speed,year,month,day
43525,BA,Sarajevo,,,,,,,,,...,,,,,,,2.8,2018,6,15
186255,BD,Dhaka,,,,25.0,70.0,,,,...,,,29.0,,,,1.8,2023,10,7
194774,CZ,Ostrava,,,,6.0,76.0,,11.1,14.9,...,,4.1,9.0,,,1.1,1.2,2024,2,27
51639,NO,Oslo,,0.1,,,32.6,,4.6,36.4,...,,,4.6,,,2.2,0.9,2019,3,31
23204,IN,Delhi,,8.2,,,,,23.6,8.3,...,,6.5,,,,,,2017,2,9
106631,TJ,Dushanbe,,,,5.0,84.0,,,,...,,,8.0,,,,1.5,2020,12,2
127123,KR,Seoul,,4.5,,24.0,,,19.5,19.2,...,,4.3,,,,,,2021,7,13
162627,LT,Kaunas,,0.1,,7.5,90.0,,,15.6,...,,8.0,9.0,,,,3.6,2022,11,11
87336,ID,Jakarta,,,,24.0,72.0,,,,...,,,29.5,,,4.2,3.0,2020,5,11
9630,US,Fresno,,7.8,,,,,17.0,8.8,...,,0.3,,,,,,2015,12,30


In [8]:
# Helper giving an overview of dtypes, missing values, unique values, etc.
def overview(df: pd.DataFrame) -> None:
    '''
    Display a per-column summary table of a DataFrame.

    The table has one row per column of ``df`` and the following columns:
        dtype      dtype of the column
        total      number of non-missing values
        missing    number of missing values
        missing%   share of missing values, in percent
        n_uniques  number of distinct non-missing values
        uniques%   n_uniques relative to the number of rows, in percent
        uniques    array of the distinct values found in the column

    VARs
        df: the DataFrame to inspect
    RETURNS:
        None (the table is rendered via the notebook's ``display``)
    '''
    # Compute the missing-value mask and the distinct counts only once;
    # each of them feeds two columns of the summary table.
    na_mask = df.isna()
    unique_counts = df.nunique()

    summary = pd.DataFrame({'dtype': df.dtypes,
                            'total': df.count(),
                            'missing': na_mask.sum(),
                            'missing%': na_mask.mean()*100,
                            'n_uniques': unique_counts,
                            'uniques%': unique_counts/df.shape[0]*100,
                            'uniques': [df[col].unique() for col in df.columns]
                           })
    display(summary)


In [9]:
# Render the per-column summary (dtype, missing and unique statistics) for df
overview(df)

Unnamed: 0,dtype,total,missing,missing%,n_uniques,uniques%,uniques
Country,object,225860,0,0.0,95,0.04,"[AT, AU, BE, BO, BR, CA, CH, CL, CN, CO, CY, C..."
City,object,225860,0,0.0,95,0.04,"[Vienna, Brisbane, Brussels, Cochabamba, São P..."
aqi,float64,6268,219592,97.22,302,0.13,"[nan, 27.0, 28.0, 35.0, 50.0, 59.0, 52.0, 53.0..."
co,float64,133731,92129,40.79,508,0.22,"[0.1, 1.9, nan, 5.5, 4.2, 16.1, 5.1, 3.1, 2.2,..."
d,float64,272,225588,99.88,48,0.02,"[nan, -24.0, -26.0, -27.0, -25.0, 7.0, 0.0, -4..."
dew,float64,146267,79593,35.24,515,0.23,"[nan, 16.0, 14.0, 18.0, 1.0, 7.0, 4.0, 5.0, 20..."
humidity,float64,182043,43817,19.4,937,0.41,"[nan, 84.0, 76.5, 32.0, 22.0, 58.0, 76.0, 81.0..."
mepaqi,float64,2765,223095,98.78,809,0.36,"[nan, 81.0, 81.3, 48.2, 115.8, 59.5, 62.9, 62...."
no2,float64,158692,67168,29.74,683,0.3,"[9.0, 4.6, 23.4, 3.0, 17.9, 4.7, 4.4, 13.6, na..."
o3,float64,151424,74436,32.96,936,0.41,"[nan, 3.7, 2.9, 4.3, 19.5, 20.0, 10.6, 9.4, 21..."


In [10]:
# Share of missing values per column, in percent
missing_values = df.isna().mean().mul(100)
# Order the columns from most to least incomplete (descending)
missing_values_sorted = missing_values.sort_values(ascending=False)

print(missing_values_sorted)


d               99.88
psi             99.84
pol             99.59
mepaqi          98.78
pm1             98.54
aqi             97.22
uvi             95.80
precipitation   92.91
wd              92.72
wind-gust       52.97
co              40.79
dew             35.24
o3              32.96
so2             32.75
no2             29.74
pm10            25.68
wind-speed      21.23
pressure        19.44
humidity        19.40
temperature     19.39
pm25             8.66
month            0.00
year             0.00
Country          0.00
City             0.00
day              0.00
dtype: float64


In [14]:
# Share of missing values per column, in percent
missing_percent = df.isna().mean() * 100

# Keep every column whose share of missing values is at most this cut-off.
# NOTE(review): the cut-off is 53, not 50; in the missing-value listing
# printed earlier only wind-gust (52.97 %) lies between the two, so the value
# was presumably chosen to keep that column - confirm.
MAX_MISSING_PERCENT = 53

# .copy() makes df_cleaned an independent frame, so later assignments to it
# cannot raise SettingWithCopyWarning
df_cleaned = df.loc[:, missing_percent <= MAX_MISSING_PERCENT].copy()

# Report how many columns were removed and which ones remain
print(f"Anzahl der entfernten Spalten: {df.shape[1] - df_cleaned.shape[1]}")
print("Übrige Spalten:", df_cleaned.columns)

# Shape of the original frame (this cell does not modify df itself)
df.shape


Anzahl der entfernten Spalten: 10
Übrige Spalten: Index(['Country', 'City', 'co', 'dew', 'humidity', 'no2', 'o3', 'pm10', 'pm25',
       'pressure', 'so2', 'temperature', 'wind gust', 'wind-speed', 'year',
       'month', 'day'],
      dtype='object')


(225860, 27)