**DATA PREPARATION**

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [None]:
main_data = pd.read_csv('Attrition_data.csv')

In [None]:
main_data.head()

In [None]:
location_clean = pd.read_csv('location_clean.csv')
location_clean.head()

In [None]:
# Attach the cleaned location columns to every attrition record: 'S.No' in the
# main table is matched against 'id' in the location table (inner join), and
# the now-redundant join key 'id' is removed afterwards.
data = main_data.merge(
    location_clean,
    how='inner',
    left_on='S.No',
    right_on='id',
).drop(columns='id')
data.head()

In [None]:
assert location_clean.shape[0] == data.shape[0]

In [None]:
data.shape

**Converting the columns into right datatypes and extracting data**

In [None]:
# Give the satisfaction column a short name and remove its trailing percent
# sign so that the values can be cast to integers in the next cell.
data = data.rename(columns={'Engagement Score (% Satisfaction)': 'sat_score'})
# str.rstrip('%') removes only a literal '%' suffix. Slicing off the last
# character unconditionally would silently drop a digit from any value that
# has no '%' (or when this cell is executed a second time).
data['sat_score'] = data['sat_score'].str.rstrip('%')
data.head()

In [None]:
data['sat_score'] = data['sat_score'].astype('int')

In [None]:
data.isna().sum().sum()

In [None]:
data.isna().sum()

In [None]:
# Keep only the records whose 'doubtful' flag is 'NO'.
# .copy() makes the filtered frame independent of the original, so the column
# assignments made in later cells do not raise SettingWithCopyWarning.
data = data[data['doubtful'] == 'NO'].copy()
data.isna().sum()

In [None]:
np.where(data.isna())

In [None]:
# Show every row that contains at least one missing value. The rows are
# selected from the data itself rather than from hard-coded positions, so the
# output stays correct if the input files or the filtering above change.
data[data.isna().any(axis=1)]

In [None]:
data.info()

In [None]:
# Store the last rating as text (str() of each value), then re-check dtypes.
data['Last Rating'] = data['Last Rating'].apply(str)
data.info()

In [None]:
def to_float(x):
    """Convert ``x`` to a float, tolerating spaces inside a numeric string.

    ``float(x)`` is tried first; only if that raises ``ValueError`` are all
    space characters removed from ``x`` before a second conversion attempt.
    """
    try:
        parsed = float(x)
    except ValueError:
        parsed = float(x.replace(' ', ''))
    return parsed
data['Tenure'] = data['Tenure'].apply(to_float)
data.head(10)

In [None]:
from datetime import datetime
def converter(x, formats=('%d-%b-%y', '%d-%m-%y')):
    """Parse a date string into a ``datetime`` using the first matching format.

    Parameters
    ----------
    x : str
        Date text such as ``'05-Jul-10'`` or ``'05-07-10'``.
    formats : sequence of str, optional
        ``strptime`` patterns tried in order. The default tries the
        abbreviated-month form first and then the numeric-month form, which
        is needed for some DOJ values (e.g. 05-07-10, 02-09-10, 01-08-11).

    Returns
    -------
    datetime

    Raises
    ------
    ValueError
        If ``x`` matches none of ``formats``.
    """
    for fmt in formats:
        try:
            return datetime.strptime(x, fmt)
        except ValueError:
            # Only a format mismatch moves on to the next pattern; any other
            # error (e.g. a non-string input) is not hidden.
            continue
    raise ValueError(f'{x!r} does not match any of the formats {formats}')

data['DOL_date'] = data['In Active Date'].apply(converter)
data['DOJ_date'] = data['DOJ'].apply(converter)
data.head()

In [None]:
# The raw date strings were parsed into DOJ_date / DOL_date above, so the
# original text columns are removed.
data = data.drop(columns=['DOJ', 'In Active Date'])

In [None]:
data['Designation'].value_counts()

In [None]:
data['Grade'].value_counts()

In [None]:
data.groupby('Designation')['Grade'].apply(lambda x: x.unique())

In [None]:
data['Zone'].value_counts()

In [None]:
# Zone labels differ only by letter case (CENTRAL/central, North/north,
# South/south), so fold them to lower case. The vectorised .str accessor
# leaves missing values as NaN instead of raising on them.
data['Zone'] = data['Zone'].str.lower()
data['Zone'].value_counts()

In [None]:
data['Marital Status'].value_counts()

In [None]:
data['Gender'].value_counts()

In [None]:
data['Education'].value_counts()

In [None]:
data.columns

In [None]:
# Remove columns that are not used in the rest of the notebook.
data = data.drop(columns=['EmpID', 'Emp Name', 'Attrition ', 'Designation'])
data.head()

**Feature Engineering**

In [None]:
# Length of employment in whole days: leaving date minus joining date.
employment_span = data['DOL_date'] - data['DOJ_date']
data['tenure_days'] = employment_span.apply(lambda span: span.days)
data.head()

In [None]:
data.columns

In [None]:
# Short, code-friendly names for the columns referenced most often below.
column_aliases = {
    'S.No': 'id',
    'Last Rating': 'rating',
    'Monthly Income': 'income',
    'Marital Status': 'marital_status',
}
data = data.rename(columns=column_aliases)
data.head()

In [None]:
# Normalise every column name to lower case.
data.columns = data.columns.str.lower()
data.head()

In [None]:
data['location'].isna().sum()

In [None]:
data.columns

In [None]:
# Remove the three location bookkeeping columns before saving the data set.
data = data.drop(columns=['doubtful', 'location', 'changed'])

In [None]:
data.head()
data.shape

In [None]:
data.to_csv('data_complete_location.csv', index= False)

**EXPLORATORY DATA ANALYSIS**

Univariate visualization

In [None]:
# Numeric columns to profile: integer/float dtypes, leaving out 'id' and
# 'tenure'.
numeric_col = []
for col in data.columns:
    if col in ('id', 'tenure'):
        continue
    if data[col].dtype in ['int64', 'int32', 'float64']:
        numeric_col.append(col)
numeric_col

In [None]:
data[numeric_col].hist(figsize=(16, 8));

In [None]:
# Distribution of every numeric column, two plots per row. The grid is sized
# from the number of columns, so adding or removing a numeric column cannot
# index past the last axes.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(numeric_col) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                         figsize=(16, 4 * n_plot_rows), squeeze=False)
for ax, col in zip(axes.ravel(), numeric_col):
    sns.distplot(data[col], ax=ax)
    # Rotate the tick labels of the axes just drawn on. plt.xticks() acts on
    # the "current" axes, which is always the last subplot created, so it
    # never reached the other panels.
    ax.tick_params(axis='x', labelrotation=90)
# Hide any grid cell that did not receive a plot.
for ax in axes.ravel()[len(numeric_col):]:
    ax.set_visible(False)

In [None]:
# Box plot of every numeric column, two plots per row. The grid is sized from
# the number of columns, so it cannot be over- or under-filled.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(numeric_col) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                         figsize=(16, 4 * n_plot_rows), squeeze=False)
for ax, col in zip(axes.ravel(), numeric_col):
    # Pass the values as x= explicitly: newer seaborn releases interpret the
    # first positional argument as `data`, which changes the plot orientation.
    sns.boxplot(x=data[col], ax=ax)
    # Rotate the labels of this axes; plt.xticks() would only ever touch the
    # last subplot created.
    ax.tick_params(axis='x', labelrotation=90)
# Hide any grid cell that did not receive a plot.
for ax in axes.ravel()[len(numeric_col):]:
    ax.set_visible(False)

In [None]:
cat_cols = [col for col in data.columns if data[col].dtype == 'object']
cat_cols

In [None]:
%matplotlib inline
# Frequency of every categorical level, most frequent first, two plots per
# row. 'location' is skipped if it is present among the categorical columns.
plot_cats = [col for col in cat_cols if col != 'location']
# Columns whose level names are long enough to need vertical tick labels.
long_label_cols = ['remarks', 'corrected_location', 'district', 'state']

n_plot_cols = 2
n_plot_rows = max(1, -(-len(plot_cats) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols, sharey=True,
                         figsize=(16, 4.8 * n_plot_rows), squeeze=False)
plt.subplots_adjust(hspace=0.8)

for ax, col in zip(axes.ravel(), plot_cats):
    # value_counts() is sorted by frequency, so its index is already the
    # ordered list of levels. Going through reset_index()['index'] raises
    # KeyError on pandas >= 2.0, where that column is named after the Series.
    level_order = list(data[col].value_counts().index)
    sns.countplot(x=col, data=data, ax=ax, order=level_order)
    if col in long_label_cols:
        ax.tick_params(axis='x', labelrotation=90)

# Hide any grid cell that did not receive a plot.
for ax in axes.ravel()[len(plot_cats):]:
    ax.set_visible(False)

Multivariate visualization

In [None]:
corr_matrix = data[numeric_col].corr()
sns.heatmap(corr_matrix, annot = True);

In [None]:
numeric_col

In [None]:
cat_cols

In [None]:
# Spread of tenure_days within each level of every categorical column, laid
# out on a 5 x 2 grid that shares the y axis.
_, axes = plt.subplots(nrows=5, ncols=2, sharey=True, figsize=(16, 30))
plt.subplots_adjust(hspace=0.8)

rotated = ['remarks', 'corrected_location', 'district', 'state']
plotted = [name for name in cat_cols if name != 'location']
for position, name in enumerate(plotted):
    # Fill the grid row by row: position 0 -> (0, 0), 1 -> (0, 1), 2 -> (1, 0) ...
    row, column = divmod(position, 2)
    g = sns.boxplot(x=name, y='tenure_days', data=data, ax=axes[row][column])
    if name in rotated:
        _ = g.set_xticklabels(g.get_xticklabels(), rotation=90)

In [None]:
# Income by grade, restricted to rows with income below 1e5; grades are shown
# in sorted order.
plt.figure(figsize=(8, 6))
income_below_cap = data[data['income'] < 1e5]
grade_order = sorted(data['grade'].unique())
sns.boxplot(x='grade', y='income', data=income_below_cap, order=grade_order);

**CLUSTERING**

In [None]:
%reset -f


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline 
import seaborn as sns
sns.set()

In [None]:
data = pd.read_csv('data_complete_location.csv')
data.head()

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Filter data
# Cluster employees on satisfaction score and last rating.
# .copy() gives an independent frame, so adding the 'label' column to it
# later does not trigger SettingWithCopyWarning.
left_emp = data[['sat_score', 'rating']].copy()

# Standardise both features so that neither dominates the distance
# computation in K-means merely because of its scale.
ss = StandardScaler()
left_emp_scaled = ss.fit_transform(left_emp)
# n_init is set explicitly because its default differs between scikit-learn
# releases, which would otherwise change the clusters between environments.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=10).fit(left_emp_scaled)

In [None]:
# Attach each row's cluster id, then colour the points by cluster in the
# original (unscaled) feature space.
left_emp['label'] = kmeans.labels_
plt.scatter(x=left_emp['sat_score'], y=left_emp['rating'],
            c=left_emp['label'], cmap='Accent')
plt.xlabel('Satisfaction Level')
plt.ylabel('Last Evaluation')
plt.title('4 Clusters of employees who left')
plt.show()

In [None]:
# Cluster employees on tenure and income, restricted to incomes below 1e5.
n_clusters = 4
# Select rows and columns in one .loc call and copy the result, so that adding
# the 'label' column below does not raise SettingWithCopyWarning.
left_emp = data.loc[data['income'] < 1e5, ['tenure', 'income']].copy()

# Standardise both features so that income does not dominate the distance
# computation merely because of its larger scale.
ss = StandardScaler()
left_emp_scaled = ss.fit_transform(left_emp)
# n_init is set explicitly because its default differs between scikit-learn
# releases, which would otherwise change the clusters between environments.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=10).fit(left_emp_scaled)

left_emp['label'] = kmeans.labels_
# Draw scatter plot in the original (unscaled) units.
plt.scatter(left_emp['tenure'], left_emp['income'], c=left_emp['label'], cmap='Accent')
plt.xlabel('Tenure')
plt.ylabel('Income')
# The title is built from n_clusters so it cannot drift from the model.
plt.title(f'{n_clusters} Clusters of employees who left')
plt.show()

In [None]:
# Cluster employees on age and income, restricted to incomes below 1e5.
n_clusters = 6
# Select rows and columns in one .loc call and copy the result, so that adding
# the 'label' column below does not raise SettingWithCopyWarning.
left_emp = data.loc[data['income'] < 1e5, ['age', 'income']].copy()

# Standardise both features so that income does not dominate the distance
# computation merely because of its larger scale.
ss = StandardScaler()
left_emp_scaled = ss.fit_transform(left_emp)
# n_init is set explicitly because its default differs between scikit-learn
# releases, which would otherwise change the clusters between environments.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=10).fit(left_emp_scaled)

left_emp['label'] = kmeans.labels_
# Draw scatter plot in the original (unscaled) units.
plt.scatter(left_emp['age'], left_emp['income'], c=left_emp['label'], cmap='Accent')
plt.xlabel('Age')
plt.ylabel('Income')
# The title is built from n_clusters so it cannot drift from the model.
plt.title(f'{n_clusters} Clusters of employees who left')
plt.show()

**FREQUENT PATTERN MINING**

In [None]:
%reset -f


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline 
import seaborn as sns
sns.set()

In [None]:
data = pd.read_csv('data_complete_location.csv')
data.head()

In [None]:
# Map each grade code to an integer (E1=1 ... CXO=7). A grade missing from
# the table raises KeyError rather than being encoded silently.
grade_int = {
    'E1': 1, 'E2': 2,
    'M1': 3, 'M2': 4, 'M3': 5, 'M4': 6,
    'CXO': 7,
}
data['grade_int'] = data['grade'].apply(lambda grade: grade_int[grade])

In [None]:
not_required =  ['grade','dol_date','doj_date','id','corrected_location','district']
selected_cats = [ col for col in data.columns if data[col].dtype=='object' and col not in not_required]
selected_cats

In [None]:
selected_nums = [col for col in data.columns if col not in selected_cats+not_required]
selected_nums

**Frequent Item Sets**

Some points to be noted:

- Income depends on the grade of the employee.
- Age and income are positively correlated.
- Because of the two points above, only grade (and neither income nor age) is used when computing the frequent item sets.
- Tenure and sat_score are binned so that they can be used in the frequent item set computation.

In [None]:
# Show the two distributions side by side. Drawn on one shared axis they are
# overlaid without a legend even though tenure and sat_score are on different
# scales, which makes neither shape readable.
fig, (ax_tenure, ax_sat) = plt.subplots(1, 2, figsize=(16, 4))
sns.distplot(data['tenure'], ax=ax_tenure)
sns.distplot(data['sat_score'], ax=ax_sat);


In [None]:
def sat_binner(x):
    """Return the index of the 20-wide bin containing ``x``.

    Exact multiples of 20 belong to the bin that ends at them (20 -> 1,
    40 -> 2, ...); every other value is rounded up to the next bin
    (1..19 -> 1, 21..39 -> 2, ...). 0 maps to bin 0.
    """
    full_bins, remainder = divmod(x, 20)
    if remainder == 0:
        return full_bins
    return full_bins + 1
data['sat_binned'] = data['sat_score'].apply(sat_binner).astype('object')

In [None]:
def tenure_binner(x):
    """Return the index of the 2-wide bin containing ``x``.

    Exact multiples of 2 belong to the bin that ends at them (2 -> 1,
    4 -> 2, ...); every other value is rounded up to the next bin
    (0.5 -> 1, 3 -> 2, ...). 0 maps to bin 0.
    """
    full_bins, remainder = divmod(x, 2)
    if remainder == 0:
        return full_bins
    return full_bins + 1
data['tenure_binned'] = data['tenure'].apply(tenure_binner).astype('object')

In [None]:
cols_for_frequent_items = ['grade','gender','education','rating','marital_status','zone','remarks','tenure_binned','sat_binned']
data_fp = data[cols_for_frequent_items]
data_fp_enc = pd.get_dummies(data_fp, columns = data_fp.columns)
data_fp_enc.head()

In [None]:
pd.set_option('max_colwidth', 100)


In [None]:
pip install mlxtend


In [None]:
from mlxtend.frequent_patterns import apriori

# Mine the item sets present in at least 20% of the rows, then list those
# containing four or more items, most frequent first.
freq_pattern = apriori(data_fp_enc, min_support=0.20, use_colnames=True)
freq_pattern['length'] = freq_pattern['itemsets'].apply(len)
long_patterns = freq_pattern[freq_pattern['length'] >= 4]
long_patterns.sort_values('support', ascending=False)

In [None]:
fp2 = data[(data['gender']== 'Male') & (data['grade']=='E1') & (data['education'] =='Bachelors') & (data['tenure']<=2) ]
fp2.groupby('remarks').size().sort_values(ascending = False)

In [None]:
# Interesting FP 
fp1 = data[(data['gender']== 'Male') & (data['grade']=='E1') & (data['education'] =='Bachelors') & (data['remarks']=='Issues with the Manager') ]


**TENURE PREDICTION**

In [None]:
%reset -f


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
# from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
# from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
from math import sqrt
np.random.seed(42)

In [None]:
data = pd.read_csv('data_complete_location.csv')
data.head()

In [None]:
data_pred = data[['id','grade','tenure','gender','education','age','rating','income','sat_score','marital_status',\
                 'zone','remarks']]

In [None]:
X = data_pred.drop(['id','tenure'], axis =1)
y = data_pred['tenure']

In [None]:
selected_cats = [col for col in X.columns if X[col].dtype == 'object']
selected_nums = [col for col in X.columns if col  not in selected_cats]

In [None]:
# One-hot encode the categorical predictors, then hold out the last 10% of
# the rows (in file order, without shuffling) for validation.
X = pd.get_dummies(X, columns=selected_cats)
train_samples = int(0.9 * data_pred.shape[0])
train_indices = list(range(train_samples))
val_indices = list(range(train_samples, data_pred.shape[0]))
# The indices above are row positions, so select with .iloc. Label-based .loc
# gives the same rows only while the index is exactly 0..n-1, and breaks
# as soon as rows are filtered or re-ordered upstream.
train_X = X.iloc[train_indices]
train_y = y.iloc[train_indices]
val_X = X.iloc[val_indices]
val_y = y.iloc[val_indices]

# A cell displays only its last expression, so show all four shapes together.
train_X.shape, train_y.shape, val_X.shape, val_y.shape

In [None]:
# Standardise the predictors with statistics learned from the training rows
# only, and model log(1 + tenure) instead of raw tenure. val_y is deliberately
# left in its original units.
ss = StandardScaler()
ss.fit(train_X)
train_X_scaled = pd.DataFrame(ss.transform(train_X), columns=train_X.columns)
val_X_scaled = pd.DataFrame(ss.transform(val_X), columns=val_X.columns)
train_y_logged = np.log1p(train_y)

In [None]:
def fit_model(model):
    """Fit ``model`` on the training split and report validation performance.

    ``model`` is an estimator *class*, not an instance. It is instantiated
    with default arguments (``DecisionTreeRegressor`` additionally gets a
    fixed ``random_state``) and fitted on the notebook-level
    ``train_X_scaled`` / ``train_y_logged``. Predictions for ``val_X_scaled``
    are mapped back from log1p space with ``expm1`` and compared with
    ``val_y``.

    Prints MAE and RMSE, plots predicted against original tenure and, when
    the fitted estimator exposes ``feature_importances_`` or ``coef_``, a bar
    chart of the 15 features with the largest absolute weight.

    Returns None.
    """
    if model == DecisionTreeRegressor:
        reg = model(random_state=291)
    else:
        reg = model()
    reg.fit(train_X_scaled, train_y_logged)
    val_y_hat = np.expm1(reg.predict(val_X_scaled))
    # scikit-learn metrics take (y_true, y_pred), in that order.
    print(f'MAE: {mean_absolute_error(val_y, val_y_hat)}')
    print(f'RMSE: {sqrt(mean_squared_error(val_y, val_y_hat))}')

    fig, ax = plt.subplots(1, 2, figsize=(16, 4))

    positions = list(range(len(val_y)))
    ax[0].plot(positions, val_y_hat, label='Predicted Tenure (in yrs)')
    ax[0].plot(positions, val_y, label='Original  Tenure (in yrs)')
    ax[0].legend(loc='best')
    ax[0].set_title('Predictions')

    print(f'Using model : {model}')
    # Detect what the fitted estimator offers instead of keeping a list of
    # known classes: tree-based models expose feature_importances_, linear
    # models expose coef_.
    if hasattr(reg, 'feature_importances_'):
        weights = reg.feature_importances_
    elif hasattr(reg, 'coef_'):
        weights = reg.coef_
    else:
        print(f"No feature importance graph for {model.__name__}")
        # Do not leave an empty, unlabeled panel next to the predictions.
        ax[1].set_visible(False)
        return

    coeff_df = pd.DataFrame(weights, train_X_scaled.columns, columns=['Coefficient'])
    # Rank by magnitude but plot the signed value, so that large negative
    # coefficients are shown too.
    top_features = (
        coeff_df.assign(abs=coeff_df['Coefficient'].abs())
        .sort_values(by='abs', ascending=False)
        .drop(columns='abs')
        .head(15)
    )

    ax[1].bar(top_features.index, top_features['Coefficient'])
    # Rotate the labels of this specific axes instead of relying on pyplot's
    # implicit "current axes".
    ax[1].tick_params(axis='x', labelrotation=90)
    ax[1].set_title('Feature importance')

In [None]:
fit_model(DummyRegressor)


In [None]:
fit_model(LinearRegression)


In [None]:
fit_model(XGBRegressor)


In [None]:
def _new_regressor(model):
    """Instantiate ``model``; decision trees get a fixed seed for repeatability."""
    if model == DecisionTreeRegressor:
        return model(random_state=291)
    return model()


def plot_ensemble(model1, model2):
    """Average the predictions of two regressors and report validation error.

    ``model1`` and ``model2`` are estimator *classes*. Each is instantiated,
    fitted on the notebook-level ``train_X_scaled`` / ``train_y_logged``, and
    used to predict ``val_X_scaled``. The two predictions are mapped back
    from log1p space with ``expm1`` and averaged with equal weight.

    Prints MAE and RMSE against ``val_y`` and plots predicted against
    original tenure. Returns None.
    """
    predictions = []
    # Both models go through the same constructor helper, so a decision tree
    # is seeded whether it is passed first or second.
    for model in (model1, model2):
        reg = _new_regressor(model)
        reg.fit(train_X_scaled, train_y_logged)
        predictions.append(np.expm1(reg.predict(val_X_scaled)))
    val_y_hat = (predictions[0] + predictions[1]) / 2.0

    # scikit-learn metrics take (y_true, y_pred), in that order.
    print(f'MAE: {mean_absolute_error(val_y, val_y_hat)}')
    print(f'RMSE: {sqrt(mean_squared_error(val_y, val_y_hat))}')

    fig, ax = plt.subplots(1, 1)

    positions = list(range(len(val_y)))
    ax.plot(positions, val_y_hat, label='Predicted Tenure (in yrs)')
    ax.plot(positions, val_y, label='Original  Tenure (in yrs)')
    ax.legend(loc='best')
    ax.set_title('Predictions')

In [None]:
plot_ensemble(LinearRegression, XGBRegressor)


In [None]:
plot_ensemble(DecisionTreeRegressor, XGBRegressor)
