In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the enrollee table from the working directory and preview two rows.
df = pd.read_csv(filepath_or_buffer = "edu_enrollees.csv")
df.head(n = 2)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,Xgrp
0,8949.0,city_103,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,,,1,36.0,1.0,train
1,29725.0,city_40,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0.0,train


### 전처리

In [3]:
# Remove the city, company_size and company_type columns.
df = df.drop(["city", "company_size", "company_type"], axis = "columns")

In [4]:
# One row per column of df: its dtype and its number of missing values.
df_na_cnt = pd.DataFrame({"dtype": df.dtypes,
                          "na_cnt": df.isna().sum()}).reset_index()
df_na_cnt

Unnamed: 0,index,dtype,na_cnt
0,enrollee_id,float64,0
1,city_development_index,float64,0
2,gender,object,4508
3,relevant_experience,object,0
4,enrolled_university,object,386
5,education_level,object,460
6,major_discipline,object,2813
7,experience,object,65
8,last_new_job,object,423
9,training_hours,float64,0


In [5]:
# Keep only rows with no missing value in any column.
df = df.dropna(axis = 0, how = "any")

In [6]:
# Discard rows whose experience / last_new_job value is one of the
# non-numeric labels, so both columns can be cast to int afterwards.
df = df.loc[~(df["experience"].isin([">20", "<1"])
              | df["last_new_job"].isin([">4", "never"]))]

In [7]:
# Cast both remaining string-valued columns to integers in one call.
df = df.astype({"experience": "int", "last_new_job": "int"})

In [8]:
# Number of rows left after preprocessing.
df.shape[0]

7522

In [9]:
# Renumber rows 0..n-1 (the old index is discarded); df_base is the frame
# the later question sections read from.
df_base = df.reset_index(drop = True)
df_base.head(n = 2)

Unnamed: 0,enrollee_id,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,last_new_job,training_hours,target,Xgrp
0,27107.0,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,7,1,46.0,1.0,train
1,23853.0,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,5,1,108.0,0.0,train


### Q1.

In [10]:
# Independent copy holding only the two columns Q1 needs.
df_q1 = df_base.loc[:, ["relevant_experience", "target"]].copy()
df_q1.head(n = 2)

Unnamed: 0,relevant_experience,target
0,Has relevant experience,1.0
1,Has relevant experience,0.0


In [11]:
# Group A: no relevant experience; group B: has relevant experience.
df_A = df_q1[df_q1["relevant_experience"].eq("No relevant experience")]
df_B = df_q1[df_q1["relevant_experience"].eq("Has relevant experience")]

In [12]:
# Share of rows with target == 1 inside each group, then the A/B ratio.
# Comparing against 1 and averaging avoids the integer-key lookup `[1]` on a
# float-labelled value_counts index, and yields 0.0 instead of a KeyError when
# a group contains no positive rows. dropna() mirrors value_counts' default
# of ignoring missing values.
stat_A = df_A["target"].dropna().eq(1).mean()
stat_B = df_B["target"].dropna().eq(1).mean()
round(stat_A / stat_B, 2)

1.77

In [13]:
# Cross-check: mean of target within each relevant_experience level.
df_base["target"].groupby(df_base["relevant_experience"]).mean()

relevant_experience
Has relevant experience    0.215911
No relevant experience     0.382873
Name: target, dtype: float64

### Q2.

In [14]:
# Categorical predictors for Q2, selected by name rather than by position
# (previously iloc[:, 2:7]) so that a change in column order cannot silently
# pull in the wrong columns.
df_q2_obj = df_base[["gender", "relevant_experience", "enrolled_university",
                     "education_level", "major_discipline"]].copy()
df_q2_obj.head(2)

Unnamed: 0,gender,relevant_experience,enrolled_university,education_level,major_discipline
0,Male,Has relevant experience,no_enrollment,Graduate,STEM
1,Male,Has relevant experience,no_enrollment,Graduate,STEM


In [15]:
# Names of the categorical columns that will be dummy-encoded.
list(df_q2_obj.columns)

['gender',
 'relevant_experience',
 'enrolled_university',
 'education_level',
 'major_discipline']

In [16]:
# One 0/1 integer indicator column per level of every categorical column.
df_q2_dum = pd.get_dummies(data = df_q2_obj,
                           columns = list(df_q2_obj.columns),
                           dtype = int)
df_q2_dum.head(n = 2)

Unnamed: 0,gender_Female,gender_Male,gender_Other,relevant_experience_Has relevant experience,relevant_experience_No relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,enrolled_university_no_enrollment,education_level_Graduate,education_level_Masters,education_level_Phd,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other,major_discipline_STEM
0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1
1,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1


In [17]:
# Look up the dummy column names at positions 1 and 3.
df_q2_dum.columns[[1, 3]]

Index(['gender_Male', 'relevant_experience_Has relevant experience'], dtype='object')

In [18]:
# Drop one reference level per categorical variable so the remaining dummies
# are not perfectly collinear with the intercept. The columns are named
# explicitly (instead of positions [2, 4, 7, 10, 16]) so the cell fails loudly
# with a KeyError if the category set ever changes, rather than dropping
# whichever columns happen to sit at those positions.
df_q2_dum = df_q2_dum.drop(columns = [
    "gender_Other",
    "relevant_experience_No relevant experience",
    "enrolled_university_no_enrollment",
    "education_level_Phd",
    "major_discipline_STEM",
])

In [19]:
# pd.Series(df_q2_dum.columns).reset_index()

In [20]:
# Preview the dummy frame after the reference levels were removed.
df_q2_dum.head(n = 2)

Unnamed: 0,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,0,1,1,0,0,1,0,0,0,0,0,0
1,0,1,1,0,0,1,0,0,0,0,0,0


In [21]:
# Numeric predictors, and the label / split-indicator columns.
col_x = ["city_development_index", "experience", "last_new_job", "training_hours"]
col_y = ["target", "Xgrp"]

# Modelling frame: label and split columns first, then the numeric
# predictors, then the dummy columns, placed side by side on the row index.
df_job2 = pd.concat(objs = [df_base[[*col_y, *col_x]], df_q2_dum],
                    axis = "columns")

In [22]:
# Preview the assembled modelling frame.
df_job2.head(n = 2)

Unnamed: 0,target,Xgrp,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,1.0,train,0.92,7,1,46.0,0,1,1,0,0,1,0,0,0,0,0,0
1,0.0,train,0.92,5,1,108.0,0,1,1,0,0,1,0,0,0,0,0,0


In [23]:
# Logistic regression of target on every predictor, fitted on all rows of
# df_job2 (train and test together).
# NOTE(review): the very large C is presumably meant to make the
# regularization negligible — confirm against the task statement.
model_lr = LogisticRegression(solver = "liblinear",
                              C = 100000,
                              fit_intercept = True,
                              max_iter = 1000,
                              random_state = 123)
model_lr.fit(df_job2.drop(columns = ["target", "Xgrp"]),
             df_job2["target"])

In [24]:
# model_lr.coef_ # regression coefficients
np.exp(model_lr.coef_) # odds ratios (OR)

array([[0.00220435, 0.97190078, 1.10037739, 0.99907422, 0.84961169,
        0.87221096, 0.46740116, 1.67214935, 0.75465354, 1.38142616,
        1.01688281, 1.29909128, 1.11361023, 1.27567972, 1.48869549,
        0.64443085]])

In [25]:
# Largest odds ratio, truncated (not rounded) to two decimal places.
np.floor(100 * np.max(np.exp(model_lr.coef_))) / 100

1.67

In [26]:
# Odds ratios labelled with predictor names. The labels come from the same
# drop(columns = ...) expression used to build the design matrix for the fit,
# so they stay aligned with coef_ even if the column order of df_job2 changes
# (the previous positional slice columns[2:] assumed target/Xgrp came first).
pd.DataFrame(np.exp(model_lr.coef_),
             columns = df_job2.drop(columns = ["target", "Xgrp"]).columns)

Unnamed: 0,city_development_index,experience,last_new_job,training_hours,gender_Female,gender_Male,relevant_experience_Has relevant experience,enrolled_university_Full time course,enrolled_university_Part time course,education_level_Graduate,education_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,0.002204,0.971901,1.100377,0.999074,0.849612,0.872211,0.467401,1.672149,0.754654,1.381426,1.016883,1.299091,1.11361,1.27568,1.488695,0.644431


### Q3.

In [27]:
# Split rows by the Xgrp indicator and report the size of each part.
df_train = df_job2[df_job2["Xgrp"].eq("train")]
df_test  = df_job2[df_job2["Xgrp"].eq("test")]
df_train.shape[0], df_test.shape[0]

(4706, 2816)

In [28]:
# 5-nearest-neighbour classifier fitted on the train rows, then used to
# predict the test rows.
# NOTE(review): predictors are passed unscaled, so training_hours (values in
# the tens to hundreds) likely dominates the distance over the 0/1 dummies —
# confirm this is what the task intends.
model_knn = KNeighborsClassifier(n_neighbors = 5).fit(
    df_train.drop(columns = ["target", "Xgrp"]),
    df_train["target"],
)
pred = model_knn.predict(X = df_test.drop(columns = ["target", "Xgrp"]))
pred[:4]

array([0., 0., 0., 0.])

In [29]:
# Short aliases: true test labels and the predicted labels.
y_t, y_p = df_test["target"], pred

In [30]:
# Confusion matrix as a plain array: rows are true labels, columns predicted.
df_tab = pd.crosstab(index = y_t, columns = y_p).to_numpy()
df_tab

array([[1899,  193],
       [ 616,  108]], dtype=int64)

In [31]:
# Accuracy = correctly classified count (matrix diagonal) / all test rows.
round(np.trace(df_tab) / df_tab.sum(), 2)

0.71

In [32]:
# Cross-check of the hand-computed accuracy using scikit-learn's helper.
# accuracy_score is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
round(accuracy_score(y_true = y_t, y_pred = y_p), 2)

0.71

In [33]:
# Display the Python type of df_train.
# NOTE(review): looks like a leftover inspection cell — nothing later in the
# visible notebook uses this result; consider removing it.
type(df_train)

pandas.core.frame.DataFrame