# Data science pipeline – first steps

## Import libraries

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport

## Define variables

In [None]:
# Cleaning switch: when True, rows containing any missing value are dropped;
# when False, only missing `arrivalyearswitzerland` values are filled with -9.
DROP_MISSING = False

## Loading the data

In [None]:
# Read the Luzern extract, report its dimensions and preview the first rows
df = pd.read_csv(filepath_or_buffer='data/luzern/Luzern_290922.csv')
print("Dataframe shape:", df.shape)
df.head(5)

### Columns description

- **reportingmunicipalityid:** Municipality of residence of a person. Enables the calculation of by-municipality population figures in combination with the populationType characteristic.

- **statyear:** statistic year

- **ageclass:** age class of the person

- **nationalityclass:** nationality of the person

- **populationtype:**
    - 1 Permanent resident population
    - 2 Non-permanent resident population
    - 3 Resident population at last secondary residence
    - 4 Resident population at other secondary residences.
    - Note: Persons with typeOfResidence = 3 are found here with the characteristics 3 or 4. To avoid double counting a person for the secondary residence, populationType = 3 must be selected.

- **maritalstatusclass:**
    - 1 Single
    - 2 Married, in registered partnership
    - 3 Widowed, partnership dissolved by death
    - 4 Divorced, unmarried, partnership dissolved by court order
    - -9 Not specified.
    - Note: Unmarried persons (marital status = 5) are counted with divorced persons. Persons with dissolved partnership (maritalStatus = 7) are included in widowed (code 3), divorced (code 4) or "not specified" (code -9) depending on the type of partnership dissolution (cancellationPartnerReason).

- **arrivalyearmunicipality:** Year of arrival in the municipality (Gemeinde) Kriens
    - AAAA Jahr
    - 9997 seit geburt
    - 9999 ohne Angabe

- **arrivalyearswitzerland:** Year of moving to Switzerland
    - 1xxx-2017 Jahr
    - -5 seit geburt
    - -9 Ohne Angabe

- **egid:** Pseudonymized building identifier, used for linking with the Wohnungstabelle.

- **gbaups:** Construction period GWS  (Gebäude- und Wohnungsstatistik) 
    - 8011 Periode vor 1919
    - 8012 Periode von 1919 bis 1945
    - 8013 Periode von 1946 bis 1960
    - 8014 Periode von 1961 bis 1970
    - 8015 Periode von 1971 bis 1980
    - 8016 Periode von 1981 bis 1985
    - 8017 Periode von 1986 bis 1990
    - 8018 Periode von 1991 bis 1995
    - 8019 Periode von 1996 bis 2000
    - 8020 Periode von 2001 bis 2005
    - 8021 Periode von 2006 bis 2010
    - 8022 Periode von 2011 bis 2015
    - 8023 Periode nach 2015

- **gkats:** Building category GWS
    - 1021 single family houses
    - 1025 Apartment buildings
    - 1030 Residential buildings with secondary use
    - 1040 Buildings with partial residential use

- **gastws:** Number of floors GWS. 1 bis 35 Stockwerke im Gebäude

- **gazwot:** Number of apartments in the building GWS.

- **eh:** east hectare coordinates, projected to LV1903 (cf. [EPSG:2056](https://epsg.io/2056)).

- **nh:** north hectare coordinates, projected to LV1903 (cf. [EPSG:2056](https://epsg.io/2056)).

- **ewid:** Apartment identifier. This is the key to "Wohnungen" table.

- **wazimclass:** Number of rooms GWS

- **wareaclass:** Apartment surface GWS in m²


## Data cleaning

In [None]:
# Handle missing values.
# With DROP_MISSING enabled every row that has at least one missing value is
# removed; otherwise only `arrivalyearswitzerland` is filled with -9, the code
# the column description lists for "not specified" (Ohne Angabe).
if DROP_MISSING:
    df.dropna(inplace=True)
    print("New dataframe shape:", df.shape)
else:
    # Assign the filled column back to the frame. Calling
    # `fillna(..., inplace=True)` on `df.arrivalyearswitzerland` is a chained
    # in-place update of an intermediate Series: it triggers a FutureWarning
    # on pandas 2.x and leaves `df` unchanged under Copy-on-Write, which would
    # let NaN reach the later integer conversion.
    df['arrivalyearswitzerland'] = df['arrivalyearswitzerland'].fillna(-9)

In [None]:
# Correct data types
def print_df_dtypes(df):
    """Print a "data types:" header, then one tab-indented
    "<column> : <dtype>" line for each column of ``df``."""
    print("data types:")
    # ``df.dtypes`` is indexed by column label, so items() pairs each column
    # name with its dtype in column order.
    for name, kind in df.dtypes.items():
        print("\t", name, ":", kind)

# Report the dtypes as loaded
print("Original", end=" ")
print_df_dtypes(df)

# Target dtype per column: unordered categoricals for the coded attributes,
# plain integers for the year, age class and construction period columns.
as_category = pd.CategoricalDtype()
target_dtypes = {
    'reportingmunicipalityid': as_category,
    'statyear': int,
    'ageclass': int,
    'nationalityclass': as_category,
    'populationtype': as_category,
    'maritalstatusclass': as_category,
    'arrivalyearmunicipality': int,
    'arrivalyearswitzerland': int,
    'gbaups': int,
    'gkats': as_category,
    'wazimclass': as_category,
    'wareaclass': as_category,
}
df = df.astype(target_dtypes)

# Report the dtypes after conversion, separated by a blank line
print()
print("Updated", end=" ")
print_df_dtypes(df)

In [None]:
# Display the dataframe as it stands after the dtype conversion
df

## Data exploration

In [None]:
# Basic statistics, transposed so each summarised column becomes a row
df.describe().transpose()

In [None]:
# Basic visualization: histogram of the age class column with labelled axes
hist = df['ageclass'].hist()
hist.set_xlabel(xlabel='Age class')
hist.set_ylabel(ylabel='# Rows')
# Assign the return value so the notebook does not echo the Text object
_ = hist.set_title(label='Age class distribution')

## Data analysis

In [None]:
# Select the rows whose age class is strictly below 26
below26 = df.loc[df['ageclass'] < 26]
below26

In [None]:
# Pie chart of nationality class counts within the below-26 selection
pie = below26['nationalityclass'].value_counts().plot(kind='pie', autopct='%1.1f%%')
pie.set_ylabel(ylabel='')
# Assign the return value so the notebook does not echo the Text object
_ = pie.set_title(label='Nationality distribution of population below 26 years old')

In [None]:
# Build a profiling report object for `df`
profile = ProfileReport(df, title="IIP1 Luzern Sample Profiling Report")

In [None]:
# Write the report for `df` to sample_report.html in the working directory
profile.to_file("sample_report.html")

In [None]:
# Summary of every column (numeric and non-numeric), one row per column
df.describe(include='all').transpose()

In [None]:
# NOTE(review): `full` is not defined in any cell visible in this notebook, so
# this cell raises NameError unless `full` is created elsewhere. Confirm which
# dataframe the "full" report is meant to be built from.
profile_full = ProfileReport(full, title="IIP1 Luzern Full Profiling Report")
# Write the report to full_report.html in the working directory
profile_full.to_file('full_report.html')

In [None]:
# Read the Umzuege extract, report its dimensions and preview the first rows
umzuege = pd.read_csv(filepath_or_buffer='data/luzern/Umzuege_Luzern_290922.csv')
print("Dataframe shape:", umzuege.shape)
umzuege.head(5)

In [None]:
# Summary of every Umzuege column, one row per column
umzuege.describe(include='all').transpose()

In [None]:
# Build a profiling report for the Umzuege dataframe and write it to
# umzuege_report.html in the working directory
profile_umzuege = ProfileReport(umzuege, title="IIP1 Luzern Umzuege Profiling Report")
profile_umzuege.to_file('umzuege_report.html')

In [None]:
# Read the Bewegungen extract, report its dimensions and preview the first rows
bewegungen = pd.read_csv(filepath_or_buffer='data/luzern/Bewegungen_Luzern_290922.csv')
print("Dataframe shape:", bewegungen.shape)
bewegungen.head(5)

In [None]:
# Summary of every Bewegungen column, one row per column
bewegungen.describe(include='all').transpose()

In [None]:
# Build a profiling report for the Bewegungen dataframe and write it to
# bewegungen_report.html in the working directory
profile_bewegungen = ProfileReport(bewegungen, title="IIP1 Luzern Bewegungen Profiling Report")
profile_bewegungen.to_file('bewegungen_report.html')

In [None]:
# Histogram of the age class column using 100 bins
df['ageclass'].hist(bins=100)