In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_validate




In [2]:
df=pd.read_csv('Placement_Data_Full_Class.csv')
df.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


Explaining the Variables
1. gender: Gender (M or F)
2. ssc_p: Secondary Education percentage (10th Grade)
3. ssc_b: Board of Secondary Education (Central or Others)
4. hsc_p: Higher Secondary Education percentage (12th Grade)
5. hsc_b: Board of Higher Secondary Education (Central or Others)
6. hsc_s: Specialization in Higher Secondary Education
7. degree_p: Undergraduate Degree percentage
8. degree_t: Undergraduate Degree Subject
9. workex: Work Experience (1 or 0)
10. etest_p: Employability Test percentage
11. specialisation: Postgraduate Subject
12. mba_p: Postgraduate percentage
13. status: Placement (1 or 0)
14. salary: Salary Offered

In [3]:
df.shape

(215, 15)

In [4]:
df.describe()

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
count,215.0,215.0,215.0,215.0,215.0,215.0,148.0
mean,108.0,67.303395,66.333163,66.370186,72.100558,62.278186,288655.405405
std,62.209324,10.827205,10.897509,7.358743,13.275956,5.833385,93457.45242
min,1.0,40.89,37.0,50.0,50.0,51.21,200000.0
25%,54.5,60.6,60.9,61.0,60.0,57.945,240000.0
50%,108.0,67.0,65.0,66.0,71.0,62.0,265000.0
75%,161.5,75.7,73.0,72.0,83.5,66.255,300000.0
max,215.0,89.4,97.7,91.0,98.0,77.89,940000.0


In [5]:
df.drop(columns='sl_no', inplace=True)

In [6]:
df.nunique()

gender              2
ssc_p             103
ssc_b               2
hsc_p              97
hsc_b               2
hsc_s               3
degree_p           89
degree_t            3
workex              2
etest_p           100
specialisation      2
mba_p             205
status              2
salary             45
dtype: int64

In [7]:
fig = px.pie(df, names='gender', title='Gender Distribution')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
fig = px.pie(df, names='ssc_b', title='Distribution of Board of Secondary Education')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
fig = px.histogram(df,x='ssc_p', histfunc='count', marginal='box', title='Distribution of Secondary Education Percentage')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
fig = px.pie(df,names='hsc_b', title ='Distribution of Board of Higher Secondary Education')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
fig = px.histogram(df,x='hsc_p', marginal='box', title='Distribution of Higher Secondary Education Percentage')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
fig = px.pie(df,names='hsc_s', title='Distribution of Higher Secondary Education Subjects')
fig.update_layout(title_x=0.5)
fig.show()

In [None]:
fig = px.histogram(df,x='degree_p', marginal='box', title='Distribution of Undergraduate Degree Percentage')
fig.update_layout(title_x=0.5)
fig.show()


In [None]:
fig = px.pie(df,names='degree_t', title='Distribution of Undergraduate Degree Subject')
fig.update_layout(title_x=0.5)
fig.show()


In [None]:
fig = px.pie(df,names='workex', title='Distribution of Work Experience')
fig.update_layout(title_x=0.5)
fig.show()


In [None]:
fig = px.histogram(df,x='etest_p', marginal='box', title='Employability Test Percentage')
fig.update_layout(title_x=0.5)
fig.show()


In [None]:
fig = px.pie(df,names='specialisation', title='Distribution of Postgraduate Subject')
fig.update_layout(title_x=0.5)
fig.show()


In [None]:
fig = px.histogram(df,x='mba_p', marginal='box', title='Distribution of Postgraduate Percentage')
fig.update_layout(title_x=0.5)
fig.show()


In [None]:
fig = px.pie(df,names='status', title='Distribution of Placement')
fig.update_layout(title_x=0.47)
fig.show()


In [None]:
numerical_col=['ssc_p','hsc_p','degree_p','etest_p','mba_p','salary']
df[numerical_col].isnull().sum()

ssc_p        0
hsc_p        0
degree_p     0
etest_p      0
mba_p        0
salary      67
dtype: int64

In [None]:
categorical_col=['ssc_b','hsc_b','hsc_s','degree_t','workex','specialisation','status']
df[categorical_col].isnull().sum()

ssc_b             0
hsc_b             0
hsc_s             0
degree_t          0
workex            0
specialisation    0
status            0
dtype: int64

In [None]:
num_col_with_na = ['salary']

In [None]:
num_imp = SimpleImputer()
df[num_col_with_na] = num_imp.fit_transform(df[num_col_with_na])

In [None]:
df.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,288655.405405
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [None]:
df.groupby(["gender","status"]).size().unstack()


status,Not Placed,Placed
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,28,48
M,39,100


In [None]:
df.groupby('status').mean()


Unnamed: 0_level_0,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Not Placed,57.54403,58.395522,61.134179,69.58791,61.612836,288655.405405
Placed,71.721486,69.926554,68.740541,73.238041,62.579392,288655.405405


In [None]:
df['hsc_s'].value_counts()

Commerce    113
Science      91
Arts         11
Name: hsc_s, dtype: int64

In [None]:
def status_to_number(i):
    if i == 'Placed':return 1
    elif i == 'Not Placed':return 0
    else: return i
df['status_num'] = df['status'].apply(status_to_number)

In [None]:
placement_df = pd.pivot_table(df, index='status', columns='gender',aggfunc='count')
placement_df

Unnamed: 0_level_0,degree_p,degree_p,degree_t,degree_t,etest_p,etest_p,hsc_b,hsc_b,hsc_p,hsc_p,...,specialisation,specialisation,ssc_b,ssc_b,ssc_p,ssc_p,status_num,status_num,workex,workex
gender,F,M,F,M,F,M,F,M,F,M,...,F,M,F,M,F,M,F,M,F,M
status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Not Placed,28,39,28,39,28,39,28,39,28,39,...,28,39,28,39,28,39,28,39,28,39
Placed,48,100,48,100,48,100,48,100,48,100,...,48,100,48,100,48,100,48,100,48,100


In [None]:
degree_placement_df = placement_df.degree_p.reset_index()
fig = px.bar(degree_placement_df, degree_placement_df.status, ['F','M'])
fig.update_layout(title='Placement Distribution(By Gender)', title_x=0.5)
fig.show()



In [None]:
placement_df = pd.pivot_table(df, index='status', columns='specialisation',aggfunc='count')
placement_df

Unnamed: 0_level_0,degree_p,degree_p,degree_t,degree_t,etest_p,etest_p,gender,gender,hsc_b,hsc_b,...,salary,salary,ssc_b,ssc_b,ssc_p,ssc_p,status_num,status_num,workex,workex
specialisation,Mkt&Fin,Mkt&HR,Mkt&Fin,Mkt&HR,Mkt&Fin,Mkt&HR,Mkt&Fin,Mkt&HR,Mkt&Fin,Mkt&HR,...,Mkt&Fin,Mkt&HR,Mkt&Fin,Mkt&HR,Mkt&Fin,Mkt&HR,Mkt&Fin,Mkt&HR,Mkt&Fin,Mkt&HR
status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Not Placed,25,42,25,42,25,42,25,42,25,42,...,25,42,25,42,25,42,25,42,25,42
Placed,95,53,95,53,95,53,95,53,95,53,...,95,53,95,53,95,53,95,53,95,53


In [None]:
placement_specialisation_df = placement_df.degree_p.reset_index()
fig = px.bar(placement_specialisation_df, placement_specialisation_df.status, ['Mkt&HR','Mkt&Fin'])
fig.update_layout(title='Placement Distribution(By Specialisation)', title_x=0.5)
fig.show()

In [None]:
salary_spc = df.groupby('specialisation')['salary'].sum().reset_index()
fig = px.pie(salary_spc, 'specialisation','salary')
fig.update_layout(title='Overall Salary Distribution(By Specialisation)', title_x=0.5)
fig.show()

In [None]:
workex_status = df.groupby('status')['workex'].count().reset_index()
fig = px.pie(workex_status, 'status','workex')
fig.update_layout(title='Placement Distribution(By Work Experience)', title_x=0.5)
fig.show()

In [None]:
gender_exp_df = pd.pivot_table(df, index='workex', columns='gender',aggfunc='count')
gender_exp_df  

Unnamed: 0_level_0,degree_p,degree_p,degree_t,degree_t,etest_p,etest_p,hsc_b,hsc_b,hsc_p,hsc_p,...,specialisation,specialisation,ssc_b,ssc_b,ssc_p,ssc_p,status,status,status_num,status_num
gender,F,M,F,M,F,M,F,M,F,M,...,F,M,F,M,F,M,F,M,F,M
workex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
No,54,87,54,87,54,87,54,87,54,87,...,54,87,54,87,54,87,54,87,54,87
Yes,22,52,22,52,22,52,22,52,22,52,...,22,52,22,52,22,52,22,52,22,52


In [None]:
workex_gender_df = gender_exp_df.degree_p.reset_index()
fig = px.bar(workex_gender_df, workex_gender_df.workex, ['M','F'])
fig.update_layout(title='Gender Distribution(By Work Experience)', title_x=0.5)
fig.show()

In [None]:
ssc_b_gender_df = pd.pivot_table(df, index='ssc_b', columns='gender',aggfunc='count')
ssc_b_gender_df

Unnamed: 0_level_0,degree_p,degree_p,degree_t,degree_t,etest_p,etest_p,hsc_b,hsc_b,hsc_p,hsc_p,...,specialisation,specialisation,ssc_p,ssc_p,status,status,status_num,status_num,workex,workex
gender,F,M,F,M,F,M,F,M,F,M,...,F,M,F,M,F,M,F,M,F,M
ssc_b,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Central,42,74,42,74,42,74,42,74,42,74,...,42,74,42,74,42,74,42,74,42,74
Others,34,65,34,65,34,65,34,65,34,65,...,34,65,34,65,34,65,34,65,34,65


In [None]:
gender_ssc_b_df = ssc_b_gender_df.degree_p.reset_index()
fig = px.bar(gender_ssc_b_df, gender_ssc_b_df.ssc_b, ['M','F'])
fig.update_layout(title='Gender Distribution(By Board of Secondary Education)', title_x=0.5)
fig.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
df['status'] = LabelEncoder().fit_transform(df['status'])

In [None]:
categorical_cols = ['gender', 'hsc_b', 'ssc_b', 'workex', 'specialisation', 'status', 'hsc_s', 'degree_t']
binary_cols = ['gender', 'hsc_b', 'ssc_b', 'workex', 'specialisation']
ordinal_enc = OrdinalEncoder()
df[binary_cols]= ordinal_enc.fit_transform(df[binary_cols])
df[categorical_col]

Unnamed: 0,ssc_b,hsc_b,hsc_s,degree_t,workex,specialisation,status
0,1.0,1.0,Commerce,Sci&Tech,0.0,1.0,1
1,0.0,1.0,Science,Sci&Tech,1.0,0.0,1
2,0.0,0.0,Arts,Comm&Mgmt,0.0,0.0,1
3,0.0,0.0,Science,Sci&Tech,0.0,1.0,0
4,0.0,0.0,Commerce,Comm&Mgmt,0.0,0.0,1
...,...,...,...,...,...,...,...
210,1.0,1.0,Commerce,Comm&Mgmt,0.0,0.0,1
211,1.0,1.0,Science,Sci&Tech,0.0,0.0,1
212,1.0,1.0,Commerce,Comm&Mgmt,1.0,0.0,1
213,1.0,1.0,Commerce,Comm&Mgmt,0.0,1.0,1


In [None]:
hsc_b_enc = OneHotEncoder(drop='first')
hsc_b_dummies = hsc_b_enc.fit_transform(df[['hsc_s']]).toarray()
df.drop(columns=['hsc_s'], inplace=True)
df = pd.concat([df, pd.DataFrame(hsc_b_dummies)], axis=1)
df.head()


Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary,status_num,0,1
0,1.0,67.0,1.0,91.0,1.0,58.0,Sci&Tech,0.0,55.0,1.0,58.8,1,270000.0,1,1.0,0.0
1,1.0,79.33,0.0,78.33,1.0,77.48,Sci&Tech,1.0,86.5,0.0,66.28,1,200000.0,1,0.0,1.0
2,1.0,65.0,0.0,68.0,0.0,64.0,Comm&Mgmt,0.0,75.0,0.0,57.8,1,250000.0,1,0.0,0.0
3,1.0,56.0,0.0,52.0,0.0,52.0,Sci&Tech,0.0,66.0,1.0,59.43,0,288655.405405,0,0.0,1.0
4,1.0,85.8,0.0,73.6,0.0,73.3,Comm&Mgmt,0.0,96.8,0.0,55.5,1,425000.0,1,1.0,0.0


In [None]:
degree_t_enc = OneHotEncoder(drop='first')
degree_t_dummies = degree_t_enc.fit_transform(df[['degree_t']]).toarray()
df.drop(columns=['degree_t'], inplace=True)
df = pd.concat([df, pd.DataFrame(degree_t_dummies)], axis=1)
df.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,status,salary,status_num,0,1,0.1,1.1
0,1.0,67.0,1.0,91.0,1.0,58.0,0.0,55.0,1.0,58.8,1,270000.0,1,1.0,0.0,0.0,1.0
1,1.0,79.33,0.0,78.33,1.0,77.48,1.0,86.5,0.0,66.28,1,200000.0,1,0.0,1.0,0.0,1.0
2,1.0,65.0,0.0,68.0,0.0,64.0,0.0,75.0,0.0,57.8,1,250000.0,1,0.0,0.0,0.0,0.0
3,1.0,56.0,0.0,52.0,0.0,52.0,0.0,66.0,1.0,59.43,0,288655.405405,0,0.0,1.0,0.0,1.0
4,1.0,85.8,0.0,73.6,0.0,73.3,0.0,96.8,0.0,55.5,1,425000.0,1,1.0,0.0,0.0,0.0


# Fitting Random Forest classifier to the dataset

In [None]:
x = df.drop(['status','salary','status_num'], axis=1)
y = df['status'].astype(float)

train_x, test_x, train_y, test_y = train_test_split(x.values,y.values , test_size=0.3, random_state=42)

print(train_x.shape)
print(train_y.shape)

(150, 14)
(150,)


In [None]:
x.head(1)

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,0,1,0.1,1.1
0,1.0,67.0,1.0,91.0,1.0,58.0,0.0,55.0,1.0,58.8,1.0,0.0,0.0,1.0


In [None]:
clf = RandomForestClassifier(100, n_jobs=-1, random_state=42)
clf.fit(train_x,train_y)

In [None]:
pred_y = clf.predict(test_x)

In [None]:
print(confusion_matrix(test_y, pred_y))
print(classification_report(test_y, pred_y))

[[ 9 12]
 [ 3 41]]
              precision    recall  f1-score   support

         0.0       0.75      0.43      0.55        21
         1.0       0.77      0.93      0.85        44

    accuracy                           0.77        65
   macro avg       0.76      0.68      0.70        65
weighted avg       0.77      0.77      0.75        65



In [None]:
lr_model = LogisticRegression()
lr_model.fit(train_x,train_y)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [None]:
lr_model.score(test_x, test_y)

0.8461538461538461

In [None]:
pred_y = lr_model.predict(test_x)
mat = confusion_matrix(test_y,pred_y)

In [None]:
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

         0.0       0.79      0.71      0.75        21
         1.0       0.87      0.91      0.89        44

    accuracy                           0.85        65
   macro avg       0.83      0.81      0.82        65
weighted avg       0.84      0.85      0.84        65



In [None]:
from joblib import load, dump

In [None]:
dump({
    'model':lr_model,
    'ordinal': ordinal_enc,
    'h_board':hsc_b_enc,
    'd_type':degree_t_enc,
    'binary_col_names': ['gender', 'hsc_b', 'ssc_b', 'workex', 'specialisation']
}, 'placement_prediction_model.pk')

['placement_prediction_model.pk']

In [None]:
test_x[0]

array([ 1.  , 69.  ,  1.  , 60.  ,  1.  , 65.  ,  0.  , 87.55,  0.  ,
       52.81,  1.  ,  0.  ,  0.  ,  0.  ])

In [None]:
result = clf.predict(test_x) 
result


array([1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.])

In [None]:
x

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,0,1,0.1,1.1
0,1.0,67.00,1.0,91.00,1.0,58.00,0.0,55.0,1.0,58.80,1.0,0.0,0.0,1.0
1,1.0,79.33,0.0,78.33,1.0,77.48,1.0,86.5,0.0,66.28,0.0,1.0,0.0,1.0
2,1.0,65.00,0.0,68.00,0.0,64.00,0.0,75.0,0.0,57.80,0.0,0.0,0.0,0.0
3,1.0,56.00,0.0,52.00,0.0,52.00,0.0,66.0,1.0,59.43,0.0,1.0,0.0,1.0
4,1.0,85.80,0.0,73.60,0.0,73.30,0.0,96.8,0.0,55.50,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1.0,80.60,1.0,82.00,1.0,77.60,0.0,91.0,0.0,74.49,1.0,0.0,0.0,0.0
211,1.0,58.00,1.0,60.00,1.0,72.00,0.0,74.0,0.0,53.62,0.0,1.0,0.0,1.0
212,1.0,67.00,1.0,67.00,1.0,73.00,1.0,59.0,0.0,69.72,1.0,0.0,0.0,0.0
213,0.0,74.00,1.0,66.00,1.0,58.00,0.0,70.0,1.0,60.23,1.0,0.0,0.0,0.0


In [None]:
ordinal_enc.categories_

[array(['F', 'M'], dtype=object),
 array(['Central', 'Others'], dtype=object),
 array(['Central', 'Others'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Mkt&Fin', 'Mkt&HR'], dtype=object)]

In [None]:
hsc_b_enc.categories_

[array(['Arts', 'Commerce', 'Science'], dtype=object)]

In [None]:
degree_t_enc.transform([['Sci&Tech']]).toarray()[0].tolist()


X does not have valid feature names, but OneHotEncoder was fitted with feature names



[0.0, 1.0]

In [None]:
test_x[0]

array([ 1.  , 69.  ,  1.  , 60.  ,  1.  , 65.  ,  0.  , 87.55,  0.  ,
       52.81,  1.  ,  0.  ,  0.  ,  0.  ])

In [None]:
x.head(1).to_dict()


DataFrame columns are not unique, some columns will be omitted.



{'gender': {0: 1.0},
 'ssc_p': {0: 67.0},
 'ssc_b': {0: 1.0},
 'hsc_p': {0: 91.0},
 'hsc_b': {0: 1.0},
 'degree_p': {0: 58.0},
 'workex': {0: 0.0},
 'etest_p': {0: 55.0},
 'specialisation': {0: 1.0},
 'mba_p': {0: 58.8},
 0: {0: 0.0},
 1: {0: 1.0}}