In [1]:
import os
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# Show every column, at full width, when a DataFrame is rendered.
for option_name in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(option_name, None)

# Default figure geometry first, then the "bmh" style sheet (order preserved).
plt.rcParams.update({"figure.figsize": (8, 6), "figure.dpi": 100})
plt.style.use("bmh")

In [3]:
# The data paths below are relative ("data/..."), so step up one directory —
# presumably from the notebook folder to the project root (TODO confirm).
# Guarded so that re-running this cell does not climb one directory further
# each time: only move when the input file is not reachable from here.
if not os.path.exists("data/cleaned.csv"):
    os.chdir("..")

In [4]:
# Load the cleaned dataset. The first CSV column is read as the index and then
# discarded in favour of a fresh 0..n-1 index.
df = pd.read_csv("data/cleaned.csv", index_col=0).reset_index(drop=True)

In [5]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,wiek,zawod,stan_cywilny,wyksztalcenie,ma_kredyt,kredyt_mieszkaniowy,ma_pozyczke,sposob_kontaktu,miesiac,dl_polaczenia,liczba_polaczen_aktualnej_kampanii,liczba_dni_od_ost_kontaktu,liczba_polaczen_przed_aktualna_kampania,wynik_poprzedniej_kampanii,wsk_zmien_zatrudnienia,wsk_cen_konsum,wsk_zauf_konsum,euribor3m,liczba_pracownikow,target
0,56,housemaid,married,basic.4y,1,1,1,telephone,may,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,37,services,married,high.school,1,0,1,telephone,may,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,40,admin.,married,basic.6y,1,1,1,telephone,may,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,56,services,married,high.school,1,1,0,telephone,may,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,59,admin.,married,professional.course,1,1,1,telephone,may,139,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


# Enkodowanie

## Cechy numeryczne

### Liczba\_dni\_od\_ost\_kontaktu

In [7]:
# Bin edges for the days-since-last-contact feature: 5-day steps up to 30,
# plus a final edge at 1000.
bins = [*range(5, 31, 5), 1000]

In [8]:
# Replace the raw day count with the index of the bin it falls into
# (0 below the first edge, len(bins) at or above the last edge).
df["liczba_dni_od_ost_kontaktu"] = np.digitize(df["liczba_dni_od_ost_kontaktu"], bins)

### Wiek i pozostale cechy ciagle (standaryzacja)

In [9]:
# Scaler instance used below to standardize the selected numeric columns.
ss = StandardScaler()

In [37]:
# Columns to standardize: three fixed names plus every column whose name
# starts with "wsk".
# NOTE(review): the saved df.head() output near the end of the notebook shows
# dl_polaczenia already standardized, yet it is not selected here — confirm
# whether it belongs in this list (a fresh Run All would leave it unscaled).
wsk_columns = [name for name in df.columns if name.startswith("wsk")]
subset = ["euribor3m", "wiek", "liczba_pracownikow", *wsk_columns]
subset

['euribor3m',
 'wiek',
 'liczba_pracownikow',
 'wsk_zmien_zatrudnienia',
 'wsk_cen_konsum',
 'wsk_zauf_konsum']

In [39]:
# Standardize the selected columns, overwriting them in df.
# NOTE(review): the scaler is fitted on every row of df, i.e. before any
# train/test split — confirm this is acceptable for downstream evaluation.
df[subset] = ss.fit_transform(df[subset])

## Cechy cykliczne

In [14]:
# Month abbreviation -> calendar month number. Only the months listed here are
# mapped; any other value becomes NaN in the .map() below.
month_names = ("mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec")
miesiace = {name: number for number, name in enumerate(month_names, start=3)}

# Angle = month / 12 * pi, collapsed into one feature sin(angle) * cos(angle).
# NOTE(review): sin(a) * cos(a) == 0.5 * sin(2a), so some months end up with the
# same value (e.g. "aug" and "oct") — confirm this encoding is intended.
month_angle = df["miesiac"].map(miesiace) / 12 * np.pi
df["miesiac"] = np.sin(month_angle) * np.cos(month_angle)

## Cechy kategoryczne (z pominieciem _wyksztalcenie_ i _zawod_, gdyz te beda poddane procesowi _feature engineering_)

In [15]:
def pred_categorical_with_knn(df: pd.DataFrame, attrib: str, neighbours=3) -> None:
    """Fill ``"unknown"`` entries of a categorical column using k-nearest neighbours.

    A ``KNeighborsClassifier`` is fitted on the rows whose ``attrib`` value is
    known, using every other column of ``df`` as a feature, and its predictions
    overwrite the ``"unknown"`` entries. ``df`` is modified in place.

    Args:
        df: Frame holding ``attrib`` and the feature columns. Every column
            other than ``attrib`` is handed to the classifier, so those
            columns are expected to be numeric already.
        attrib: Name of the column whose ``"unknown"`` values are replaced.
        neighbours: Number of neighbours the classifier votes over.

    Returns:
        None. If the column holds no ``"unknown"`` value, ``df`` is left
        untouched and no model is fitted.
    """
    is_unknown = df[attrib] == "unknown"
    # Nothing to impute; predicting on zero rows would make scikit-learn raise.
    if not is_unknown.any():
        return

    features = df.drop(attrib, axis=1)
    # Use the caller-supplied neighbour count rather than a fixed value.
    model = KNeighborsClassifier(n_neighbors=neighbours)
    model.fit(features[~is_unknown], df.loc[~is_unknown, attrib])
    # The boolean mask keeps the predictions aligned row by row with the
    # "unknown" entries they replace.
    df.loc[is_unknown, attrib] = model.predict(features[is_unknown])

In [16]:
# 1/0 flag: 1 when the job is anything other than student, unemployed or retired.
# NOTE(review): this runs before the "unknown" jobs are imputed, so "unknown"
# is flagged as 1 here — confirm that is intended.
not_working = ["student", "unemployed", "retired"]
df["aktywnie_pracujacy"] = (~df["zawod"].isin(not_working)).astype(int)

In [17]:
# Text-valued columns to one-hot encode: every object-dtype column except
# "wyksztalcenie" (Index.difference also returns the remaining names sorted).
object_columns = df.select_dtypes(include="object").columns
subset = df[object_columns.difference(["wyksztalcenie"])]

# Keep a handle on the raw job labels; they are restored later in the notebook.
zawod_array = df["zawod"]

In [18]:
# One-hot encode the selected text columns into a dense frame named after the
# encoder's generated feature names.
encoder = OneHotEncoder()
one_hot_values = encoder.fit_transform(subset).toarray()
encoded_df = pd.DataFrame(one_hot_values, columns=encoder.get_feature_names_out())
# Drop the "cellular" indicator; the saved output keeps only the "telephone" one.
encoded_df = encoded_df.drop(columns="sposob_kontaktu_cellular")

# join() aligns on the index: encoded_df carries a default 0..n-1 index, so this
# relies on df having the same one. The original text columns are then removed.
df = df.join(encoded_df).drop(columns=subset.columns)

### Wyksztalcenie (zamiana wartosci _unknown_, przy pomocy algorytmu __KNN__)

In [19]:
# Column whose "unknown" values are imputed in the next cell.
attrib = "wyksztalcenie"

In [20]:
# Impute the "unknown" education levels in place with a 3-neighbour KNN.
# Every other column currently in df is used as a feature.
pred_categorical_with_knn(df, attrib, 3)

In [21]:
# Inspect the class counts after imputation ("unknown" should no longer appear).
df.wyksztalcenie.value_counts()

university.degree      10624
high.school             7897
basic.9y                4461
professional.course     4357
basic.4y                2421
basic.6y                1481
illiterate                11
Name: wyksztalcenie, dtype: int64

In [22]:
# Ordinal encoding of education: each level maps to its position in this ordering.
education_order = (
    "illiterate",
    "basic.4y",
    "basic.6y",
    "basic.9y",
    "high.school",
    "professional.course",
    "university.degree",
)
map_wyksztalcenie_do_liczby = {level: rank for rank, level in enumerate(education_order)}

In [23]:
# Swap each education label for its ordinal code (labels absent from the map become NaN).
df["wyksztalcenie"] = df["wyksztalcenie"].map(map_wyksztalcenie_do_liczby)

### Zawod

In [24]:
# Column whose "unknown" values are imputed in the following cells.
attrib = "zawod"

In [25]:
# Remove every column whose name starts with "zawod" (the one-hot job columns)
# and put the raw job labels back for imputation.
subset = df.columns.str.startswith("zawod")
# .copy() yields an independent frame, so the assignment below does not write
# into a slice of the previous df (a SettingWithCopy hazard that the global
# warnings filter at the top of the notebook would otherwise hide).
df = df.loc[:, ~subset].copy()
df[attrib] = zawod_array

In [26]:
# Impute the "unknown" jobs in place, using the function's default neighbour count.
pred_categorical_with_knn(df, attrib)

In [27]:
# Inspect the class counts after imputation ("unknown" should no longer appear).
df.zawod.value_counts()

admin.           8916
blue-collar      5972
technician       5607
services         2957
management       2396
entrepreneur     1121
self-employed    1105
retired           979
unemployed        753
student           735
housemaid         711
Name: zawod, dtype: int64

In [28]:
# Replace the imputed job column with one indicator column per job label.
# The indicator columns are named after the bare labels (no "zawod_" prefix).
attrib = "zawod"
zawod_dummies = pd.get_dummies(df[attrib])
df = df.drop(columns=attrib).join(zawod_dummies)

In [40]:
# Preview the fully encoded frame before it is written out.
df.head()

Unnamed: 0,wiek,wyksztalcenie,ma_kredyt,kredyt_mieszkaniowy,ma_pozyczke,miesiac,dl_polaczenia,liczba_polaczen_aktualnej_kampanii,liczba_dni_od_ost_kontaktu,liczba_polaczen_przed_aktualna_kampania,wsk_zmien_zatrudnienia,wsk_cen_konsum,wsk_zauf_konsum,euribor3m,liczba_pracownikow,target,aktywnie_pracujacy,sposob_kontaktu_telephone,stan_cywilny_divorced,stan_cywilny_married,stan_cywilny_single,stan_cywilny_unknown,wynik_poprzedniej_kampanii_failure,wynik_poprzedniej_kampanii_nonexistent,wynik_poprzedniej_kampanii_success,admin.,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed
0,1.77434,1,1,1,1,0.25,0.001442,1,6,0,0.722617,0.805097,0.882672,0.780668,0.395747,0,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0,0,1,0,0,0,0,0,0,0
1,-0.178868,4,1,0,1,0.25,-0.132035,1,6,0,0.722617,0.805097,0.882672,0.780668,0.395747,0,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,1,0,0,0
2,0.129533,2,1,1,1,0.25,-0.418057,1,6,0,0.722617,0.805097,0.882672,0.780668,0.395747,0,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,0,0,0,0,0,0,0,0,0,0
3,1.77434,4,1,1,0,0.25,0.176869,1,6,0,0.722617,0.805097,0.882672,0.780668,0.395747,0,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,1,0,0,0
4,2.082742,5,1,1,1,0.25,-0.463821,1,6,0,0.722617,0.805097,0.882672,0.780668,0.395747,0,1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,0,0,0,0,0,0,0,0,0,0


In [42]:
# Write the encoded frame to data/totrain.csv (the index is written as the first column).
df.to_csv('data/totrain.csv')