# Topics
1. Solving Classification Problem


In [1]:
import pandas as pd

In [2]:
hr_data = pd.read_csv('HR_comma_sep.csv.txt')

In [15]:
hr_data.rename(columns={'sales':'dept'}, inplace=True)

In [16]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,salary_en
0,0.38,0.53,2,157,3,0,1,0,sales,low,1
1,0.8,0.86,5,262,6,0,1,0,sales,medium,2
2,0.11,0.88,7,272,4,0,1,0,sales,medium,2
3,0.72,0.87,5,223,5,0,1,0,sales,low,1
4,0.37,0.52,2,159,3,0,1,0,sales,low,1


In [17]:
hr_data.left.value_counts()

0    11428
1     3571
Name: left, dtype: int64

### Inferances
* Target(left) is discrete in nature so a classification problem

In [18]:
target_data = hr_data.left

### Taking care of categorical columns
* dept & salary is categorical
* ML algorithms only understands numerical data
* We will use preprocessor LabelEncoder to convert this into numerical data


In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
le_salary = LabelEncoder()

In [21]:
le_salary.fit(hr_data.salary)

LabelEncoder()

In [22]:
le_salary.transform(['low','medium','low'])

array([1, 2, 1], dtype=int32)

In [23]:
hr_data['salary_en'] = le_salary.transform(hr_data.salary)

In [24]:
le = LabelEncoder()

In [25]:
le.fit_transform(hr_data.salary)

array([1, 2, 2, ..., 1, 1, 1])

In [26]:
le_dept = LabelEncoder()

In [27]:
le_dept.fit(hr_data.dept)

LabelEncoder()

In [28]:
le_dept.transform(['sales','support'])

array([7, 8], dtype=int32)

In [29]:
le_dept.classes_

array(['IT', 'RandD', 'accounting', 'hr', 'management', 'marketing',
       'product_mng', 'sales', 'support', 'technical'], dtype=object)

In [30]:
hr_data['dept_en'] = le_dept.transform(hr_data.dept)

In [31]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,salary_en,dept_en
0,0.38,0.53,2,157,3,0,1,0,sales,low,1,7
1,0.8,0.86,5,262,6,0,1,0,sales,medium,2,7
2,0.11,0.88,7,272,4,0,1,0,sales,medium,2,7
3,0.72,0.87,5,223,5,0,1,0,sales,low,1,7
4,0.37,0.52,2,159,3,0,1,0,sales,low,1,7


In [32]:
#getting feature data from the complete information
feature_data = hr_data.drop(columns=['dept','salary','left'])

In [33]:
#splitting feature & target data into train & test
from sklearn.model_selection import train_test_split

In [34]:
trainX, testX, trainY, testY = train_test_split(feature_data, target_data)

In [35]:
trainX[:2]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_en,dept_en
11389,0.16,0.42,3,182,3,1,0,1,6
3629,0.15,0.67,6,209,5,0,0,1,9


In [36]:
# Observation : You are not training the model with string information

# Classification Algorithms
* LogisticRegression, RandomForestClassifier, DecisicionTreeClassifier

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
rf = RandomForestClassifier()

In [39]:
rf.fit(trainX,trainY)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [40]:
data = [0.38,0.53,2,157,3,1,0,'support','high']

In [41]:
le_salary.transform(['high'])

array([0], dtype=int32)

In [42]:
le_dept.transform(['support'])

array([8], dtype=int32)

In [43]:
data = [0.38,0.53,2,157,3,1,0,8,0]

In [44]:
rf.predict([data])

array([1], dtype=int64)

In [45]:
rf.predict(testX)

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [46]:
rf.score(testX,testY)

0.9930666666666667

In [47]:
from sklearn.linear_model import LogisticRegression

In [48]:
lr = LogisticRegression()

In [49]:
lr.fit(trainX,trainY)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [50]:
lr.score(testX,testY)

0.76

In [51]:
lr.predict(testX)

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [52]:
testX['lr_predict'] = lr.predict(testX)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [53]:
testX['rf_predict'] = rf.predict(testX.drop(columns=['lr_predict']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
testX['actual'] = testY

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [55]:
testX.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_en,dept_en,lr_predict,rf_predict,actual
13297,0.9,0.49,3,259,6,0,0,2,0,0,0,0
5814,0.76,0.65,4,195,3,0,0,1,7,0,0,0
11086,0.9,0.49,3,259,6,0,0,2,0,0,0,0
12261,0.82,0.97,5,263,5,0,0,2,3,0,1,1
5552,0.87,0.94,4,256,2,0,0,2,8,0,0,0


In [57]:
testX[testX.actual != testX.rf_predict].shape

(26, 12)

In [56]:
testX[testX.actual != testX.rf_predict]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_en,dept_en,lr_predict,rf_predict,actual
5149,0.44,0.36,2,136,3,0,0,0,9,0,1,0
6263,0.9,0.87,4,231,5,0,0,1,4,0,1,0
1199,0.75,0.74,6,134,3,0,0,2,8,0,0,1
5847,0.77,0.85,5,221,5,0,0,1,9,0,1,0
704,0.74,0.54,5,216,3,0,0,1,7,0,0,1
12410,0.28,0.45,6,218,4,0,0,1,2,0,0,1
1665,0.14,0.55,6,175,5,0,0,2,0,0,0,1
410,0.28,0.45,6,218,4,0,0,1,2,0,0,1
1755,0.63,0.76,2,157,4,0,0,1,1,0,0,1
1469,0.4,0.65,2,296,5,0,0,2,7,1,0,1


# Adult Data

In [62]:
cols = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship'
        ,'race','sex','capital-gain','capital-loss','hours-per-week','native-country','Salary']

In [63]:
adult_data = pd.read_csv('adultData.csv' , names=cols)

In [64]:
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [65]:
adult_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
Salary            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 2.6+ MB


In [75]:
cat_adult_data =  adult_data.select_dtypes(include=['object'])

In [77]:
cat_adult_data.columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'Salary'],
      dtype='object')

In [78]:
from sklearn.preprocessing import LabelEncoder

In [79]:
le = LabelEncoder()

In [85]:
for col in cat_adult_data.columns:
    col = str(col)
    adult_data[col+"_en"] = le.fit_transform(adult_data[col])

In [86]:
adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,Salary,workclass_en,education_en,marital-status_en,occupation_en,relationship_en,race_en,sex_en,native-country_en,Salary_en
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,<=50K,7,9,4,1,1,4,1,39,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,<=50K,6,9,2,4,0,4,1,39,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,<=50K,4,11,0,6,1,4,1,39,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,<=50K,4,1,2,6,0,2,1,39,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,<=50K,4,9,2,10,5,2,0,5,0


In [87]:
adult_data.Salary.value_counts()

 <=50K    24720
 >50K      7841
Name: Salary, dtype: int64

In [88]:
adult_data.Salary_en.value_counts()

0    24720
1     7841
Name: Salary_en, dtype: int64

In [91]:
numeric_adult_data = adult_data.drop(columns= cat_adult_data.columns)

In [93]:
numeric_adult_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age                  32561 non-null int64
fnlwgt               32561 non-null int64
education-num        32561 non-null int64
capital-gain         32561 non-null int64
capital-loss         32561 non-null int64
hours-per-week       32561 non-null int64
workclass_en         32561 non-null int32
education_en         32561 non-null int32
marital-status_en    32561 non-null int32
occupation_en        32561 non-null int32
relationship_en      32561 non-null int32
race_en              32561 non-null int32
sex_en               32561 non-null int32
native-country_en    32561 non-null int32
Salary_en            32561 non-null int32
dtypes: int32(9), int64(6)
memory usage: 2.6 MB


In [94]:
feature_data = numeric_adult_data.drop(columns=['Salary_en'])

In [95]:
feature_data.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_en,education_en,marital-status_en,occupation_en,relationship_en,race_en,sex_en,native-country_en
0,39,77516,13,2174,0,40,7,9,4,1,1,4,1,39
1,50,83311,13,0,0,13,6,9,2,4,0,4,1,39
2,38,215646,9,0,0,40,4,11,0,6,1,4,1,39
3,53,234721,7,0,0,40,4,1,2,6,0,2,1,39
4,28,338409,13,0,0,40,4,9,2,10,5,2,0,5


In [98]:
target_data = numeric_adult_data.Salary_en

In [99]:
target_data.shape

(32561,)

In [100]:
#splitting feature & target data into train & test
from sklearn.model_selection import train_test_split

In [101]:
trainX, testX, trainY, testY = train_test_split(feature_data, target_data)

In [102]:
trainX[:2]

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_en,education_en,marital-status_en,occupation_en,relationship_en,race_en,sex_en,native-country_en
21436,31,354464,9,0,0,40,4,11,2,8,0,4,1,39
4669,38,91711,13,0,0,50,2,9,2,10,0,4,1,39


In [105]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [106]:
rf = RandomForestClassifier()
lr = LogisticRegression()

In [107]:
rf.fit(trainX,trainY)
lr.fit(trainX,trainY)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [111]:
print("Accuracy of RandomForestClassifier :", rf.score(testX,testY))
print("Accuracy of LogisticRegression :",lr.score(testX,testY))

Accuracy of RandomForestClassifier : 0.8527207959710109
Accuracy of LogisticRegression : 0.8000245670065103


In [112]:
testX['lr_predict'] = lr.predict(testX)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [113]:
testX['rf_predict'] = rf.predict(testX.drop(columns=['lr_predict']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [114]:
testX['actual'] = testY

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [116]:
testX[testX.actual != testX.rf_predict].shape

(1199, 17)

In [120]:
testX.head(10)

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_en,education_en,marital-status_en,occupation_en,relationship_en,race_en,sex_en,native-country_en,lr_predict,rf_predict,actual
9145,44,177240,15,10520,0,40,2,14,4,10,1,4,1,39,1,1,1
16961,28,176683,13,5178,0,50,4,9,2,4,0,4,1,39,1,1,1
31708,19,194260,10,0,0,20,4,15,4,12,3,4,0,39,0,0,0
28633,21,174907,12,0,0,32,4,7,4,12,3,4,0,39,0,0,0
9387,29,217200,9,0,0,40,4,11,2,3,0,4,1,39,0,0,0
19821,24,172496,14,0,0,50,4,12,4,13,1,4,1,39,0,0,0
8106,23,252153,10,0,0,28,4,15,4,8,1,4,0,39,0,0,0
15484,36,370585,9,0,0,40,0,11,2,0,0,2,1,39,0,0,0
3303,47,664821,13,0,0,40,4,9,2,8,0,4,1,8,0,0,0
22487,39,123983,13,0,0,40,4,9,0,12,1,1,1,3,1,0,0
