In [592]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Рассмотрим датасет по данными по безработным синим воротничкам с 1982 по 1991 годы¶

In [595]:
df = pd.read_csv(r"https://vincentarelbundock.github.io/Rdatasets/csv/Ecdat/Benefits.csv")
df.head()

Unnamed: 0,rownames,stateur,statemb,state,age,tenure,joblost,nwhite,school12,sex,bluecol,smsa,married,dkids,dykids,yrdispl,rr,head,ui
0,1,4.5,167,42,49,21,other,no,no,male,yes,yes,no,no,no,7,0.290631,yes,yes
1,2,10.5,251,55,26,2,slack_work,no,no,male,yes,yes,no,yes,yes,10,0.520202,yes,no
2,3,7.2,260,21,40,19,other,no,yes,female,yes,yes,yes,no,no,10,0.43249,yes,yes
3,4,5.8,245,56,51,17,slack_work,yes,no,female,yes,yes,yes,no,no,10,0.5,no,yes
4,5,6.5,125,58,33,1,slack_work,no,yes,male,yes,yes,yes,yes,yes,4,0.390625,yes,no


## Задача: выбрать модель, которая с высокой точностью сможет предсказать вероятность получения пособия по безработице для конкретного работника

### Создаю датасет со значимыми для анализа данными

In [599]:
df_n = df[['statemb', 'state', 'age', 'tenure', 'joblost', 'nwhite',	'school12',	'sex', 'married', 'dkids', 'dykids', 'head', 'ui']]
df_n.head()

Unnamed: 0,statemb,state,age,tenure,joblost,nwhite,school12,sex,married,dkids,dykids,head,ui
0,167,42,49,21,other,no,no,male,no,no,no,yes,yes
1,251,55,26,2,slack_work,no,no,male,no,yes,yes,yes,no
2,260,21,40,19,other,no,yes,female,yes,no,no,yes,yes
3,245,56,51,17,slack_work,yes,no,female,yes,no,no,no,yes
4,125,58,33,1,slack_work,no,yes,male,yes,yes,yes,yes,no


In [601]:
df_n.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4877 entries, 0 to 4876
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   statemb   4877 non-null   int64 
 1   state     4877 non-null   int64 
 2   age       4877 non-null   int64 
 3   tenure    4877 non-null   int64 
 4   joblost   4877 non-null   object
 5   nwhite    4877 non-null   object
 6   school12  4877 non-null   object
 7   sex       4877 non-null   object
 8   married   4877 non-null   object
 9   dkids     4877 non-null   object
 10  dykids    4877 non-null   object
 11  head      4877 non-null   object
 12  ui        4877 non-null   object
dtypes: int64(4), object(9)
memory usage: 495.4+ KB


### Для удобства обработки данных разделяю на числовые и категориальные, и категориальные переменные перевожу в числовой формат

In [604]:
cat_vars = ['joblost', 'nwhite',	'school12',	'sex', 'married', 'dkids', 'dykids', 'head']
num_vars = ['statemb', 'state', 'age', 'tenure']

In [606]:
X = df_n[['joblost', 'nwhite',	'school12',	'sex', 'married', 'dkids', 'dykids', 'head', 'statemb', 'state', 'age', 'tenure']].values
y = df_n['ui'].values

In [608]:
X = pd.get_dummies(df_n[cat_vars], drop_first = True, dtype = int)
X.head()

Unnamed: 0,joblost_position_abolished,joblost_seasonal_job_ended,joblost_slack_work,nwhite_yes,school12_yes,sex_male,married_yes,dkids_yes,dykids_yes,head_yes
0,0,0,0,0,0,1,0,0,0,1
1,0,0,1,0,0,1,0,1,1,1
2,0,0,0,0,1,0,1,0,0,1
3,0,0,1,1,0,0,1,0,0,0
4,0,0,1,0,1,1,1,1,1,1


In [610]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4877 entries, 0 to 4876
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   joblost_position_abolished  4877 non-null   int32
 1   joblost_seasonal_job_ended  4877 non-null   int32
 2   joblost_slack_work          4877 non-null   int32
 3   nwhite_yes                  4877 non-null   int32
 4   school12_yes                4877 non-null   int32
 5   sex_male                    4877 non-null   int32
 6   married_yes                 4877 non-null   int32
 7   dkids_yes                   4877 non-null   int32
 8   dykids_yes                  4877 non-null   int32
 9   head_yes                    4877 non-null   int32
dtypes: int32(10)
memory usage: 190.6 KB


In [612]:
y = pd.get_dummies(df_n['ui'], drop_first = True, dtype = int)
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4877 entries, 0 to 4876
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   yes     4877 non-null   int32
dtypes: int32(1)
memory usage: 19.2 KB


### Нормализация числовых переменных

In [615]:
scaler = MinMaxScaler()
scaler.fit(df_n[num_vars])
X[num_vars] = scaler.transform(df_n[num_vars])
X.head()

Unnamed: 0,joblost_position_abolished,joblost_seasonal_job_ended,joblost_slack_work,nwhite_yes,school12_yes,sex_male,married_yes,dkids_yes,dykids_yes,head_yes,statemb,state,age,tenure
0,0,0,0,0,0,1,0,0,0,1,0.397129,0.369048,0.707317,0.5
1,0,0,1,0,0,1,0,1,1,1,0.799043,0.52381,0.146341,0.025
2,0,0,0,0,1,0,1,0,0,1,0.842105,0.119048,0.487805,0.45
3,0,0,1,1,0,0,1,0,0,0,0.770335,0.535714,0.756098,0.4
4,0,0,1,0,1,1,1,1,1,1,0.196172,0.559524,0.317073,0.0


### Разделение на обучающую, тестовую и прогнозную части 

In [618]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 20)

In [620]:
X_test, X_forecast, y_test, y_forecast = train_test_split(X_test, y_test, test_size = 0.5, random_state = 20)

### Проверяю несколько моделей для оценки какая из них более эффективно прогнозирует результат решения по выплате пособия по безработице

In [623]:
neigh = KNeighborsClassifier(n_neighbors = 3)
neigh.fit(X_train, y_train)
y_pred = neigh.predict(X_test)
print(
    'Precision:', np.round(precision_score(y_test, y_pred),3), 
    'Recall', np.round(recall_score(y_test, y_pred),3)
)

Precision: 0.691 Recall 0.771


  return self._fit(X, y)


In [625]:
for i in range(3, 15, 2):
    neigh = KNeighborsClassifier(n_neighbors = i)
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    print('Neigbors:', i, 
          'Precision:', np.round(precision_score(y_test, y_pred),3), 
          'Recall', np.round(recall_score(y_test, y_pred),3)
         )

  return self._fit(X, y)
  return self._fit(X, y)


Neigbors: 3 Precision: 0.691 Recall 0.771
Neigbors: 5 Precision: 0.701 Recall 0.842
Neigbors: 7 Precision: 0.703 Recall 0.87


  return self._fit(X, y)
  return self._fit(X, y)


Neigbors: 9 Precision: 0.704 Recall 0.882
Neigbors: 11 Precision: 0.703 Recall 0.888


  return self._fit(X, y)
  return self._fit(X, y)


Neigbors: 13 Precision: 0.695 Recall 0.889


#### Модель Neighbors показала лучший результат Precision: 0.703 Recall 0.888 при использовании гиперпараметра Neigbors = 11

In [628]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)
y_pred = tree_clf.predict(X_test)
print(
    'Precision:', np.round(precision_score(y_test, y_pred),3), 
    'Recall', np.round(recall_score(y_test, y_pred),3)
)

Precision: 0.703 Recall 0.7


#### Модель DecisionTree показала результат Precision: 0.715 Recall 0.706

In [631]:
rf_clf = RandomForestClassifier(max_depth=20, random_state=20)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print(
    'Precision:', np.round(precision_score(y_test, y_pred),3), 
    'Recall', np.round(recall_score(y_test, y_pred),3)
)

  return fit_method(estimator, *args, **kwargs)


Precision: 0.708 Recall 0.905


In [633]:
for i in range(1, 20, 2):
    rf_clf = RandomForestClassifier(max_depth=i, random_state=20)
    rf_clf.fit(X_train, y_train)
    y_pred = rf_clf.predict(X_test)
    print('Depth:', i, 
          'Precision:', np.round(precision_score(y_test, y_pred),3), 
          'Recall', np.round(recall_score(y_test, y_pred),3)
         )

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Depth: 1 Precision: 0.693 Recall 1.0
Depth: 3 Precision: 0.693 Recall 1.0


  return fit_method(estimator, *args, **kwargs)


Depth: 5 Precision: 0.696 Recall 1.0


  return fit_method(estimator, *args, **kwargs)


Depth: 7 Precision: 0.704 Recall 0.981


  return fit_method(estimator, *args, **kwargs)


Depth: 9 Precision: 0.704 Recall 0.975


  return fit_method(estimator, *args, **kwargs)


Depth: 11 Precision: 0.706 Recall 0.945


  return fit_method(estimator, *args, **kwargs)


Depth: 13 Precision: 0.712 Recall 0.929


  return fit_method(estimator, *args, **kwargs)


Depth: 15 Precision: 0.708 Recall 0.92


  return fit_method(estimator, *args, **kwargs)


Depth: 17 Precision: 0.713 Recall 0.901


  return fit_method(estimator, *args, **kwargs)


Depth: 19 Precision: 0.71 Recall 0.899


#### Модель RandomForest по моему мнению показала лучший результат сочетания точности и полноты предсказания Precision: 0.712 Recall 0.929 при использовании гиперпараметра depth = 13

In [636]:
y_pred = rf_clf.predict(X_forecast)
print(
    'Precision:', np.round(precision_score(y_forecast, y_pred),3), 
    'Recall', np.round(recall_score(y_forecast, y_pred),3)
)

Precision: 0.706 Recall 0.894


### Сравнивая предсказанные значения со значениями истинными, выбранная модель с 70% точностью предсказала результат.

### Это означает, что данная модель с довольно высокой точностью сможет предсказать вероятность получения пособия по безработице для конкретного работника с набором его параметров