In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw library-usage export into a DataFrame.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-type column inference.
df = pd.read_csv(
    "../data/Library_Usage.csv",
    low_memory=False,
)

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['Patron Type Definition', 'Total Checkouts', 'Total Renewals',
       'Age Range', 'Home Library Definition', 'Circulation Active Month',
       'Circulation Active Year', 'Notice Preference Definition',
       'Provided Email Address', 'Year Patron Registered',
       'Within San Francisco County'],
      dtype='object')

In [None]:
# Preview the first rows of the frame.
df.head()

In [4]:
# Pull a single column out of the frame; the result is a Series.
x = df.loc[:, "Total Renewals"]

In [5]:
# Selecting with a list of labels yields a DataFrame holding just those columns.
df.loc[:, ["Total Renewals", "Total Checkouts"]]

Unnamed: 0,Total Renewals,Total Checkouts
0,0,5
1,0,0
2,0,0
3,1,1
4,0,0
...,...,...
450354,0,0
450355,0,0
450356,0,0
450357,0,0


In [6]:
# Helper list holding the column labels to select.
column_names = [
    "Total Renewals",
    "Total Checkouts",
]

In [7]:
# Sub-frame containing only the columns listed in column_names.
subset = df.loc[:, column_names]

In [8]:
# Plain-text rendering of the Series currently bound to x.
print(x)

0         0
1         0
2         0
3         1
4         0
         ..
450354    0
450355    0
450356    0
450357    0
450358    0
Name: Total Renewals, Length: 450359, dtype: int64


In [9]:
# Plain-text rendering of the two-column sub-frame.
print(subset)

        Total Renewals  Total Checkouts
0                    0                5
1                    0                0
2                    0                0
3                    1                1
4                    0                0
...                ...              ...
450354               0                0
450355               0                0
450356               0                0
450357               0                0
450358               0                0

[450359 rows x 2 columns]


In [10]:
# Compare 'Patron Type Definition' element-wise against 'Adult' and append the
# resulting boolean Series to the DataFrame as the new column 'is_adult'.
df['is_adult'] = df['Patron Type Definition'].eq('Adult')

In [11]:
# Natural logarithm of (1 + 'Total Renewals'), stored in the new column
# 'log_renewals'. The +1 shift keeps zero renewals finite (log(1) == 0).
# np.log1p is NumPy's dedicated routine for log(1 + x), used here instead of
# adding 1 by hand before calling np.log.
df['log_renewals'] = np.log1p(df['Total Renewals'])

##### Aufgabe 2.5 - Fallstudie: Feature Engineering
Ziel ist es, eine neue Variable Membership Duration zu erstellen, die für jeden Kunden die aktive Mitgliedschaft
in Monaten seit der Registrierung misst. 
Die aktive Mitgliedschaft wird wie folgt definiert:
'Membership Duration' = ('Circulation Active Year' - 'Year Patron Registered')*12 + 'Circulation Active Month'

In [12]:
# Convert the values of 'Circulation Active Year' to a numeric dtype.
# pd.to_numeric converts its argument to a numeric type; with errors='coerce'
# any value that cannot be parsed becomes NaN instead of raising.
df['Circulation Active Year'] = df['Circulation Active Year'].pipe(
    pd.to_numeric,
    errors='coerce',
)

In [13]:
# The month names in 'Circulation Active Month' must be numeric for the
# membership-duration calculation. As a first step they are parsed into
# datetime values with pd.to_datetime.
#   format="%b"     -> strftime code for the locale's abbreviated month name
#   errors='coerce' -> entries that fail to parse become NaT
# NOTE(review): the source column is overwritten, so this cell is presumably
# not safe to re-run without reloading df first -- confirm.
df['Circulation Active Month'] = pd.to_datetime(
    arg=df['Circulation Active Month'],
    format="%b",
    errors='coerce',
)

In [14]:
# Extract the month as a number from the datetime column.
# Series.dt.month yields the month with January=1 ... December=12.
# The values shown afterwards are floats (e.g. 11.0), consistent with
# unparseable entries (NaT) being carried along as NaN.
df['Circulation Active Month'] = df['Circulation Active Month'].dt.month

In [15]:
# Preview the frame after the year/month conversions.
df.head()

Unnamed: 0,Patron Type Definition,Total Checkouts,Total Renewals,Age Range,Home Library Definition,Circulation Active Month,Circulation Active Year,Notice Preference Definition,Provided Email Address,Year Patron Registered,Within San Francisco County,is_adult,log_renewals
0,Senior,5,0,75 years and over,Main,11.0,2022.0,Email,True,2015,False,False,0.0
1,Adult,0,0,45 to 54 years,Main,7.0,2023.0,Email,True,2019,False,True,0.0
2,Adult,0,0,55 to 59 years,Western Addition,3.0,2024.0,Email,True,2022,False,True,0.0
3,Welcome,1,1,20 to 24 years,Richmond,8.0,2022.0,Email,True,2022,False,False,0.693147
4,Senior,0,0,65 to 74 years,Sunset,3.0,2024.0,Print,False,2023,False,False,0.0


In [16]:
# Short aliases for the three columns that enter the membership-duration
# formula: x = active year, y = active month, z = registration year.
x, y, z = (
    df['Circulation Active Year'],
    df['Circulation Active Month'],
    df['Year Patron Registered'],
)

In [17]:
# 'Membership Duration' =
#     ('Circulation Active Year' - 'Year Patron Registered') * 12
#     + 'Circulation Active Month'
# The columns are referenced directly instead of through the single-letter
# aliases x/y/z: `x` is also bound to 'Total Renewals' earlier in the notebook,
# so running cells out of order could silently produce a wrong duration.
# Rows with a missing active year or month evaluate to NaN here.
df['Membership Duration'] = (
    (df['Circulation Active Year'] - df['Year Patron Registered']) * 12
    + df['Circulation Active Month']
)

In [18]:
# Preview the frame including the new 'Membership Duration' column.
df.head()

Unnamed: 0,Patron Type Definition,Total Checkouts,Total Renewals,Age Range,Home Library Definition,Circulation Active Month,Circulation Active Year,Notice Preference Definition,Provided Email Address,Year Patron Registered,Within San Francisco County,is_adult,log_renewals,Membership Duration
0,Senior,5,0,75 years and over,Main,11.0,2022.0,Email,True,2015,False,False,0.0,95.0
1,Adult,0,0,45 to 54 years,Main,7.0,2023.0,Email,True,2019,False,True,0.0,55.0
2,Adult,0,0,55 to 59 years,Western Addition,3.0,2024.0,Email,True,2022,False,True,0.0,27.0
3,Welcome,1,1,20 to 24 years,Richmond,8.0,2022.0,Email,True,2022,False,False,0.693147,8.0
4,Senior,0,0,65 to 74 years,Sunset,3.0,2024.0,Print,False,2023,False,False,0.0,15.0


In [19]:
# Assumption: a row with missing values means the person was an active member
# for 0 months, so every NaN in the new variable is replaced by 0.
df['Membership Duration'] = df['Membership Duration'].fillna(value=0)

In [20]:
# Inspect the finished 'Membership Duration' column after NaN replacement.
df['Membership Duration']

0         95.0
1         55.0
2         27.0
3          8.0
4         15.0
          ... 
450354     0.0
450355     8.0
450356     8.0
450357     4.0
450358     4.0
Name: Membership Duration, Length: 450359, dtype: float64

In [21]:
# Element-wise missing-value mask for the whole frame (True where a value is NaN).
# NOTE(review): this renders the full boolean frame; df.isna().sum() would give
# a per-column count instead if a summary is what is wanted.
df.isna()

Unnamed: 0,Patron Type Definition,Total Checkouts,Total Renewals,Age Range,Home Library Definition,Circulation Active Month,Circulation Active Year,Notice Preference Definition,Provided Email Address,Year Patron Registered,Within San Francisco County,is_adult,log_renewals,Membership Duration
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450354,False,False,False,False,False,True,True,False,False,False,False,False,False,False
450355,False,False,False,False,False,False,False,False,False,False,False,False,False,False
450356,False,False,False,False,False,False,False,False,False,False,False,False,False,False
450357,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [22]:
# Persist the enriched frame (including 'Membership Duration') as CSV.
# DataFrame.to_csv returns None when it is given a target path, so the result
# is intentionally not bound to a name; the previous assignment only ever
# stored None.
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column -- pass index=False if downstream readers should
# not see it; confirm against the notebooks that load this file.
df.to_csv("../data/Library_Usage_withMembershipDuration.csv")