In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Notebook-wide display settings.
# pandas: no limit on the number of displayed columns or on cell text width.
for opcja in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(opcja, None)

# matplotlib: default figure size and resolution, then the "bmh" style sheet
# (applied last, exactly as before, so the style sheet has the final say).
plt.rcParams.update({"figure.figsize": (8, 6), "figure.dpi": 100})
plt.style.use("bmh")

In [3]:
# Step one directory up so that the relative path "data/cleaned.csv" resolves
# (presumably the notebook lives one level below the project root).
# The guard makes the cell safe to re-run: an unconditional chdir("..") climbs
# one more level on every execution and breaks all relative paths afterwards.
if not os.path.isfile(os.path.join("data", "cleaned.csv")):
    os.chdir("..")

In [4]:
# Load the cleaned dataset. The first CSV column is consumed as the index while
# reading and then discarded in favour of a fresh 0..n-1 index.
df = pd.read_csv("data/cleaned.csv", index_col=0).reset_index(drop=True)

In [5]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,wiek,zawod,stan_cywilny,wyksztalcenie,ma_kredyt,kredyt_mieszkaniowy,ma_pozyczke,sposob_kontaktu,miesiac,dl_polaczenia,liczba_polaczen_aktualnej_kampanii,liczba_dni_od_ost_kontaktu,liczba_polaczen_przed_aktualna_kampania,wynik_poprzedniej_kampanii,wsk_zmien_zatrudnienia,wsk_cen_konsum,wsk_zauf_konsum,euribor3m,liczba_pracownikow,target
0,56,housemaid,married,basic.4y,1,1,1,telephone,may,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,37,services,married,high.school,1,0,1,telephone,may,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,40,admin.,married,basic.6y,1,1,1,telephone,may,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,56,services,married,high.school,1,1,0,telephone,may,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,59,admin.,married,professional.course,1,1,1,telephone,may,139,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


# Enkodowanie

## Cechy cykliczne

In [6]:
# Cyclic encoding of the contact month.
#
# A month is a point on a 12-step circle, so it is encoded as the (sin, cos)
# pair of the angle 2*pi*m/12. A single component is not enough: the previous
# sin(x)*cos(x) form equals 0.5*sin(2*pi*m/12) and therefore gave the same value
# to different months (jun/dec, jul/nov, aug/oct).
#
# `miesiac` keeps its name and now holds the sine component (amplitude 1 instead
# of 0.5); the cosine component is added as `miesiac_cos`.
miesiace = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}

# Only encode while the column still holds month labels; mapping an already
# encoded (numeric) column through the dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['miesiac']):
    numer_miesiaca = df['miesiac'].map(miesiace)

    # Fail loudly instead of letting unmapped labels become NaN.
    nieznane = df.loc[numer_miesiaca.isna(), 'miesiac'].unique()
    if len(nieznane) > 0:
        raise ValueError(f"Unmapped month labels in 'miesiac': {list(nieznane)}")

    kat = 2 * np.pi * numer_miesiaca / 12
    df['miesiac'] = np.sin(kat)
    df['miesiac_cos'] = np.cos(kat)

## Cechy kategoryczne (z pominieciem _wyksztalcenie_ i _zawod_, gdyz te beda poddane procesowi _feature engineering_)

In [7]:
# Binary flag derived from the raw `zawod` labels: 0 for students, the unemployed
# and the retired, 1 for every other value (which includes 'unknown').
nieaktywni = ['student', 'unemployed', 'retired']
df['aktywnie_pracujacy'] = (~df['zawod'].isin(nieaktywni)).astype(int)

In [8]:
# Text (object-dtype) columns to one-hot encode: all of them except `wyksztalcenie`.
# NOTE(review): `zawod` is NOT excluded here, so it is one-hot encoded and then
# dropped from `df` together with the other `subset` columns, although the
# section header says it should be skipped — confirm which one is intended.
kolumny_tekstowe = df.select_dtypes(include='object').columns
subset = df[kolumny_tekstowe.difference(['wyksztalcenie'])]

# Keep a handle on the raw occupation labels before the column is dropped.
zawod_array = df.zawod

In [9]:
# One-hot encode the columns selected in `subset` and swap them into `df`.
encoder = OneHotEncoder()
encoded_df = pd.DataFrame(
    encoder.fit_transform(subset).toarray(),  # densify the sparse encoder output
    columns=encoder.get_feature_names_out(),
    # Reuse the source index: `join` aligns on index labels, so a fresh 0..n-1
    # index would silently misalign rows whenever `df` is not indexed 0..n-1.
    index=subset.index,
)

# Only the `sposob_kontaktu_cellular` dummy is removed; every other encoded
# feature keeps all of its levels.
# NOTE(review): presumably `sposob_kontaktu` is binary, making this column redundant — confirm.
encoded_df = encoded_df.drop(columns='sposob_kontaktu_cellular')

# Append the dummies, then remove the original text columns they replace.
df = df.join(encoded_df).drop(columns=subset.columns)

### Wyksztalcenie (zamiana wartosci _unknown_, przy pomocy algorytmu __KNN__)

In [10]:
# Column whose 'unknown' entries are imputed by the cells below.
attrib = "wyksztalcenie"

In [11]:
# Fit a 3-nearest-neighbours classifier on the rows whose `attrib` label is known;
# it is used afterwards to fill in the rows labelled 'unknown'.
# NOTE(review): every other column of `df` is used as a feature, including
# `target`, and the features are not scaled before distances are computed —
# confirm both are intended.
znane = df[df[attrib] != 'unknown']
y = znane[attrib]
# Index.difference also sorts the column names, which fixes the feature order.
X = znane[znane.columns.difference([attrib])]

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X, y)

KNeighborsClassifier()

In [12]:
# Replace the 'unknown' placeholders in `attrib` with the classifier's predictions.
X_pred = df[(df[attrib] == 'unknown')]
y_pred = X_pred[attrib]  # still the placeholder values; only its index is used below
# Same (sorted) feature columns as the ones the model was fitted on.
X_pred = X_pred[X_pred.columns.difference([attrib])]

# predict() raises on a zero-row input, which is what a re-run of this cell (or
# data without any 'unknown') produces; skip the assignment in that case.
if not X_pred.empty:
    df.loc[y_pred.index, attrib] = model.predict(X_pred)

In [14]:
# Frequency of each education label after imputation.
df['wyksztalcenie'].value_counts()

university.degree      10731
high.school             7967
basic.9y                4399
professional.course     4374
basic.4y                2355
basic.6y                1415
illiterate                11
Name: wyksztalcenie, dtype: int64

In [34]:
# Ordinal code of each education label = its 0-based position in this list.
poziomy_wyksztalcenia = [
    'illiterate',
    'basic.4y',
    'basic.6y',
    'basic.9y',
    'high.school',
    'professional.course',
    'university.degree',
]
map_wyksztalcenie_do_liczby = {
    poziom: ranga for ranga, poziom in enumerate(poziomy_wyksztalcenia)
}

In [35]:
# Replace the education labels with their ordinal codes.
# Guarded so the cell can be re-run: mapping an already numeric column through a
# dict keyed by labels would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['wyksztalcenie']):
    kody = df['wyksztalcenie'].map(map_wyksztalcenie_do_liczby)

    # Any label missing from the map (e.g. a leftover 'unknown') would become NaN;
    # report it instead of passing NaN on to later cells.
    brakujace = df.loc[kody.isna(), 'wyksztalcenie'].unique()
    if len(brakujace) > 0:
        raise ValueError(f"Unmapped labels in 'wyksztalcenie': {list(brakujace)}")

    df['wyksztalcenie'] = kody

### Zawod

In [39]:
# Column whose 'unknown' entries are imputed by the cells below.
attrib = "zawod"

In [11]:
# Fit a 3-nearest-neighbours classifier on the rows whose `attrib` label is known;
# it is used afterwards to fill in the rows labelled 'unknown'.
#
# `zawod` was still a text column when `subset` was built, so the one-hot
# encoding cell removed it from `df` and `df[attrib]` would raise KeyError here.
# The raw labels were kept in `zawod_array` (same row index), so put them back.
# The membership check keeps the cell re-runnable.
if attrib not in df.columns:
    df[attrib] = zawod_array

# NOTE(review): the `zawod_*` dummy columns are still part of `df`, hence of X,
# and will disagree with the imputed labels afterwards — decide whether to drop them.
# NOTE(review): `target` is also used as a feature and the features are not
# scaled before distances are computed — confirm both are intended.
model = KNeighborsClassifier(n_neighbors=3)
X = df[~(df[attrib] == 'unknown')]
y = X[attrib]
# Index.difference also sorts the column names, which fixes the feature order.
X = X[X.columns.difference([attrib])]

model.fit(X, y)

KNeighborsClassifier()

In [12]:
# Replace the 'unknown' placeholders in `attrib` with the classifier's predictions.
X_pred = df[(df[attrib] == 'unknown')]
y_pred = X_pred[attrib]  # still the placeholder values; only its index is used below
# Same (sorted) feature columns as the ones the model was fitted on.
X_pred = X_pred[X_pred.columns.difference([attrib])]

# predict() raises on a zero-row input, which is what a re-run of this cell (or
# data without any 'unknown') produces; skip the assignment in that case.
if not X_pred.empty:
    df.loc[y_pred.index, attrib] = model.predict(X_pred)