- **Author: Laxmi Vanam**
- **Email: laxmivanam05@gmail.com**

## This script loads several features and the target salary data, builds and tests several predictive models, and uses the best model to predict salaries on unseen data. It is written in an object-oriented style and is built around the four pillars of object-oriented programming.


### Import packages 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import gc


#Preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures

#modeling
from sklearn.model_selection import cross_validate, cross_val_score,GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

### Defines the Dataprep class, which creates the train and test dataframes from the input files

In [2]:
class Dataprep:
    '''Load the input CSV files and build the train and test dataframes.

    Attributes set by the constructor:
        id_col            -- name of the record identifier column
        target            -- name of the target column
        train_feature_df  -- training features exactly as loaded
        train_target_df   -- training targets exactly as loaded
        train_df          -- merged training data (cleaned and shuffled by default)
        test_df           -- test features exactly as loaded
    '''

    def __init__(self, train_feature_file, train_target_file, test_feature_file,
                 target, id_col):
        '''To initialize instance variables passed through object'''
        self.id_col = id_col
        self.target = target
        self.train_df = self._create_train_dataframe(train_feature_file, train_target_file)
        self.test_df = self._create_test_dataframe(test_feature_file)

    def _create_train_dataframe(self, train_feature_file, train_target_file,
                                validate_train_files=True, clean_dataset=True):
        '''Private method to create the train dataframe.

        Loads the feature and target files, optionally reports ID problems,
        merges both files on the id column and optionally cleans and shuffles
        the merged result.

        validate_train_files -- when True, print the ID consistency report
        clean_dataset        -- when True, drop duplicate IDs and rows whose
                                target is not positive, then shuffle
        Returns the resulting dataframe.
        '''
        self.train_feature_df = self._load_data(train_feature_file)
        self.train_target_df = self._load_data(train_target_file)

        if validate_train_files:
            self.validate_train_records(self.train_feature_df, self.train_target_df, self.id_col)

        # The merge must not depend on the validation flag, otherwise
        # train_df would be undefined when validation is switched off.
        train_df = self._merge_dfs(self.train_feature_df, self.train_target_df, self.id_col)

        if clean_dataset:
            # Use the instance settings rather than same-named globals.
            train_df = self._cleandataframe(train_df, self.id_col, self.target)
            train_df = self._shuffledataframe(train_df)

        return train_df

    def _create_test_dataframe(self, test_file):
        '''Private method to create the test dataframe by loading the test file'''
        return self._load_data(test_file)

    def print_dataframe_shape(self, df, df_name):
        '''To print the shape of the train/ test dataset'''
        print('The shape of the %s dataset is %s' %(df_name, df.shape))

    def _load_data(self, file):
        '''Private method to load input files to pandas dataframes'''
        return pd.read_csv(file)

    def validate_train_records(self, train_feature_df, train_target_df, id_col):
        '''Print a consistency report for the IDs of the two training files.

        Reports whether either file contains duplicate IDs and how many IDs
        occur in one file but not in the other. Returns nothing.
        '''
        feature_ids = train_feature_df[id_col]
        target_ids = train_target_df[id_col]

        if (feature_ids.nunique() != len(train_feature_df)
                or target_ids.nunique() != len(train_target_df)):
            print ('Duplicate ID records exist in the dataframes')

        # A set difference counts the IDs that are really missing; subtracting
        # the two set sizes reports 0 whenever both files have the same number
        # of distinct IDs, even if the IDs themselves differ.
        feature_id_set = set(feature_ids)
        target_id_set = set(target_ids)
        print('The number of records in features file that are not in target file are',len(feature_id_set - target_id_set))
        print('The number of records in target file that are not in features file are',len(target_id_set - feature_id_set))

    def _merge_dfs(self, dataframe1, dataframe2, key, how='inner', left_index=False, right_index=False):
        '''Private method to merge train features and the target column for train dataframe.

        key         -- column name(s) to join on
        how         -- join type forwarded to pd.merge
        left_index  -- join on the index of dataframe1 instead of key
        right_index -- join on the index of dataframe2 instead of key
        '''
        if left_index or right_index:
            # pandas does not accept `on` together with the index flags, so
            # the key is supplied only for the side that joins on a column.
            return pd.merge(dataframe1, dataframe2, how=how,
                            left_on=None if left_index else key,
                            right_on=None if right_index else key,
                            left_index=left_index, right_index=right_index)
        return pd.merge(dataframe1, dataframe2, on=key, how=how)

    def _cleandataframe(self, df, id_col, target):
        '''Private method to drop duplicates and also the records with no salary from training dataframe'''
        df = df.drop_duplicates(subset = id_col)
        df = df[df[target] > 0]
        return df

    def _shuffledataframe(self, df):
        '''Private method to shuffle train dataset for modeling'''
        df = shuffle(df).reset_index(drop = True)
        return df
        

## Defines input parameters for the Dataprep class

In [3]:
# Directory that holds the raw input CSV files
input_file_path = 'C:/Users/laxmi/CompAnalysis/rawdata/'

# Full paths of the train feature, train target and test feature files
train_feature_file, train_target_file, test_feature_file = (
    input_file_path + file_name
    for file_name in ('train_features.csv', 'train_salaries.csv', 'test_features.csv')
)

# Column roles used by the data preparation and preprocessing steps
id_col = 'jobId'
target = 'salary'
numeric_cols = ['yearsExperience', 'milesFromMetropolis']
category_cols = ['companyId', 'jobType', 'degree', 'major', 'industry']

Instantiating Dataprep class:

In [4]:
# Build the Dataprep object; its constructor loads the three input files and creates data.train_df and data.test_df
data = Dataprep(train_feature_file,train_target_file,test_feature_file, target, id_col)

The number of records in features file that are not in target file are 0
The number of records in target file that are not in features file are 0


Exploring datasets:

In [5]:
# Print the shape of the prepared train and test dataframes
data.print_dataframe_shape(data.train_df, 'train')
data.print_dataframe_shape(data.test_df, 'test')

The shape of the train dataset is (999995, 9)
The shape of the test dataset is (1000000, 8)


## Defines the PreprocessingCatCols class for encoding categorical variables

In [6]:
class PreprocessingCatCols:
    '''Label-encode categorical columns with one shared encoder per column.

    The first time a column is encoded, a LabelEncoder is fitted on it and
    remembered; every later call for the same column reuses that encoder so
    that train and test data receive identical codes.

    Attributes:
        df_train, df_test            -- dataframes passed to the constructor
        category_cols                -- list of categorical column names
        existing_label_encoded_cols  -- dict mapping column name -> fitted encoder
        encoded_df                   -- result of the latest label_encode_check call
    '''

    def __init__(self, df_train, df_test, category_cols):
        '''To initialize instance variables passed through object'''
        self.df_train = df_train
        self.df_test = df_test
        self.category_cols = list(category_cols)
        self.existing_label_encoded_cols = {}
        self.encoded_df = None

    def label_encode_check(self, df, cols):
        '''Encode the given columns of df in place and return them.

        A column that already has a fitted encoder is transformed with it;
        otherwise a new encoder is fitted on the column and stored.
        Returns a new dataframe holding only the encoded columns.
        '''
        cols = list(cols)
        for col in cols:
            self._encode_now(df, col, self.existing_label_encoded_cols.get(col))
        # Copy so that the returned frame does not alias the caller's df.
        self.encoded_df = df[cols].copy()
        return self.encoded_df

    def inverse_encode_check(self, cols, df=None):
        '''Method to inverse label encoded values to the original values.

        cols -- columns to decode
        df   -- dataframe to decode in place; defaults to self.df_train
        Raises ValueError when a column has no stored encoder.
        Returns the decoded dataframe.
        '''
        if df is None:
            df = self.df_train
        for col in cols:
            if col not in self.existing_label_encoded_cols:
                raise ValueError("Label incoders must be defined before calling inverse function")
            self._inverse_encode_now(col, df)
        return df

    def _inverse_encode_now(self, col, df=None):
        '''Private method to decode one column of df in place with its stored encoder'''
        if df is None:
            df = self.df_train
        le = self.existing_label_encoded_cols[col]
        df[col] = le.inverse_transform(df[col])

    def _encode_now(self, df, col, le = None):
        '''Private method to encode one column of df in place.

        When no encoder is supplied, a new LabelEncoder is fitted on the
        column and added to the dictionary of encoders.
        '''
        if le is None:
            le = LabelEncoder()
            le.fit(df[col])
            # Keep the fitted encoder itself (not the encoded values) so it
            # can be reused for other dataframes and for inverse transforms.
            self.existing_label_encoded_cols[col] = le
        df[col] = le.transform(df[col])

    def concat_encodedcat_numericfields(self, df1, df2):
        '''Return df1 and df2 concatenated column-wise'''
        return pd.concat([df1, df2], axis=1)
        

Instantiating Preprocessing class:

In [7]:
# NOTE(review): this rebinds the name PreprocessingCatCols from the class to the instance, so the class itself is no longer reachable under this name afterwards
PreprocessingCatCols = PreprocessingCatCols( data.train_df,data.test_df,  category_cols)

Encoding train values:

In [8]:
# Label-encode the categorical columns of the training dataframe
encoded_train_df = PreprocessingCatCols.label_encode_check(data.train_df,   PreprocessingCatCols.category_cols)

In [9]:
# Label-encode the categorical columns of the test dataframe
encoded_test_df = PreprocessingCatCols.label_encode_check(data.test_df,   PreprocessingCatCols.category_cols)

In [10]:
# Join the encoded training categoricals with the training numeric columns (column-wise concat)
final_train_df = PreprocessingCatCols.concat_encodedcat_numericfields(encoded_train_df,data.train_df[numeric_cols])

In [11]:
# Join the encoded test categoricals with the test set's own numeric columns
# (the numeric columns must come from data.test_df, not data.train_df)
final_test_df = PreprocessingCatCols.concat_encodedcat_numericfields(encoded_test_df,data.test_df[numeric_cols])

### Define the feature-engineering flag and the parameters for the Addinggroupstats and ModelContainer classes

In [12]:
# Switch that turns the group-statistics feature engineering step on or off
Addinggroupstats_label = True

# ModelContainer settings: number of processes for parallel runs, then the verbose level for modeling
num_procs, verbose_lvl = 4, 0


### Defining the Addinggroupstats class to add additional features for modeling

In [13]:
class Addinggroupstats:
    '''Add per-group statistics of the target as extra modeling features.

    Rows of the training data are grouped by the categorical columns and the
    mean, max, min, standard deviation and median of the target are computed
    for every group.
    '''

    def __init__(self, data, cols_to_filter = None):
        '''To initialize instance variables passed through object.

        data           -- object exposing `train_df` (training dataframe) and
                          `target` (name of the target column)
        cols_to_filter -- numeric columns kept next to the grouping columns;
                          defaults to yearsExperience and milesFromMetropolis
        '''
        self.data = data
        # Follow the data object's target instead of a hard-coded name.
        self.target_col = data.target
        if cols_to_filter is None:
            self.cols_to_filter = ['yearsExperience','milesFromMetropolis']
        else:
            self.cols_to_filter = list(cols_to_filter)
        self.categoies_for_grouping = ['companyId','jobType','degree','major','industry']
        self.groups = data.train_df.groupby(self.categoies_for_grouping)
        self.train_features = data.train_df.iloc[:,2:-1]

    def add_group_statistics(self, features_df = None):
        '''To add group statistics for each combination of the categorical columns.

        features_df -- dataframe to enrich; it must contain the grouping
                       columns. When omitted, the grouping columns plus
                       `cols_to_filter` are taken from data.train_df, so the
                       method no longer depends on a notebook-level variable.
        Returns the enriched dataframe, also stored in `merged_data`.
        '''
        self.group_stats_df = self._get_group_statistics()
        if features_df is None:
            features_df = self.data.train_df[self.categoies_for_grouping + self.cols_to_filter]
        # A left join keeps every feature row in its original order; missing
        # statistics (e.g. the std of a single-row group) are filled with 0.
        self.merged_data = self._merge_derived_columns_to_original(
            features_df, self.group_stats_df, self.categoies_for_grouping,
            how = 'left', fillna = True)
        return self.merged_data

    def _get_group_statistics(self):
        '''To calculate various statistics of target salary per group.

        Returns a dataframe with the grouping columns and the columns
        group_mean, group_max, group_min, group_std and group_median.
        '''
        target_values = self.groups[self.data.target]

        group_stats_df = pd.DataFrame({'group_mean': target_values.mean()})
        group_stats_df['group_max'] = target_values.max()
        group_stats_df['group_min'] = target_values.min()
        group_stats_df['group_std'] = target_values.std()
        group_stats_df['group_median'] = target_values.median()
        # Turn the group keys from the index back into ordinary columns.
        group_stats_df.reset_index(inplace = True)

        return group_stats_df

    def _merge_derived_columns_to_original(self, df1, df2, keys, how = 'left', fillna = False):
        '''To merge the statistics to the original columns in train data frame.

        keys   -- column names to join on
        how    -- join type forwarded to pd.merge
        fillna -- when True, replace missing values in the result with 0
        '''
        merged_df = pd.merge(df1, df2, on=keys, how=how)
        if fillna:
            merged_df.fillna(0, inplace = True)
        return merged_df
        

Instantiating Addinggroupstats class:

In [14]:
# Run the group-statistics feature engineering only when the flag is set
# NOTE(review): feature_generator stays undefined when the flag is False, yet a later cell reads feature_generator.merged_data
if Addinggroupstats_label:
    feature_generator = Addinggroupstats(data)
    feature_generator.add_group_statistics()

#### Defines ModelContainer class to explore and analyse various models

In [15]:
class ModelContainer:
    '''Hold candidate models, cross-validate them and work with the best one.

    Attributes:
        modellist      -- list of candidate estimators
        scores         -- dict mapping estimator -> mean cross-validated MSE
        best_model     -- estimator chosen by select_best_model
        predictions    -- output of the latest best_model_predict call
        trainfeatures  -- features passed to the latest cross_validate call
        traintarget    -- target passed to the latest cross_validate call
        feature_names  -- column names of the features used in best_model_fit
    '''

    def __init__(self, modellist = None):
        ''' To initialize instance variables passed through object'''
        self.best_model = None
        self.predictions = None
        # A fresh list per instance: a mutable default argument would be
        # shared by every container created without an explicit list.
        self.modellist = [] if modellist is None else list(modellist)
        self.scores = {}
        self.trainfeatures = None
        self.traintarget = None
        self.feature_names = None

    def add_model(self, modelname):
        '''To add a new model to the list'''
        self.modellist.append(modelname)

    def cross_validate(self, features, target, k = 3, num_procs = None):
        '''To cross validate models using given data, k value and the number of processes.

        features  -- training features
        target    -- training target
        k         -- number of folds
        num_procs -- forwarded to cross_val_score as n_jobs; the default None
                     leaves cross_val_score at its own default
        The mean squared error of every model is stored in `scores`.
        '''
        self.trainfeatures = features
        self.traintarget = target

        for model in self.modellist:
            score = cross_val_score(model, self.trainfeatures, self.traintarget,
                                    cv = k, scoring = 'neg_mean_squared_error',
                                    n_jobs = num_procs)
            # The scorer returns negated MSE, so flip the sign back.
            self.scores[model] = -1*np.mean(score)

    def select_best_model(self):
        '''To choose the best model of all the given ones by the score.

        Returns the model with the lowest stored score.
        Raises ValueError when no scores are available.
        '''
        if not self.scores:
            raise ValueError('No scores available; run cross_validate before selecting a model')
        self.best_model = min(self.scores, key = self.scores.get)
        return self.best_model

    def best_model_fit(self, traifeatures, traintarget):
        '''To fit the best model to the train dataset'''
        # Remember the fitted columns so the summary labels match the model.
        self.feature_names = self._column_names(traifeatures)
        self.best_model.fit(traifeatures, traintarget)

    def best_model_predict(self, testfeatures):
        '''To predict the target value for the test dataset'''
        self.predictions = self.best_model.predict(testfeatures)

    def save_results(self):
        ''' To save the test results if needed'''
        pass

    @staticmethod
    def _column_names(features):
        '''Return the column names of features as a list, or None when it has none'''
        columns = getattr(features, 'columns', None)
        return None if columns is None else list(columns)

    @staticmethod     # use as decorator
    def get_feature_importance(model, cols):
        '''Static method which can be accessed outside the class with no self/ cls parameter as well.

        Plots the feature importances of model as a bar chart, labelled with
        cols (positional numbers are used when cols is None). Prints a
        message when the model exposes no feature importances.
        '''
        if hasattr(model, 'feature_importances_'):
            feature_importance = model.feature_importances_
            if cols is None:
                cols = list(range(len(feature_importance)))
            feature_imp_df = pd.DataFrame({'Features':cols,'Importance' : feature_importance})
            feature_imp_df.sort_values(by = 'Importance', ascending = False,inplace = True )
            feature_imp_df.set_index('Features', inplace=True, drop=True)
            feature_imp_df.plot.bar()
            plt.show()
        else:
            print("Feature Importance does not exist for the current model")

    def print_summary(self):
        '''To print summary of the final model'''
        print('\nModel Summary:\n')
        # Prefer the columns the model was fitted on; fall back to the
        # cross-validation features instead of reading a global dataframe.
        cols = self.feature_names
        if cols is None:
            cols = self._column_names(self.trainfeatures)
        self.get_feature_importance(self.best_model, cols)
            

Instantiating Modelcontainer class:

In [16]:
# Create the container and register the candidate models: two linear models and two tree ensembles
modelcontainer = ModelContainer()
modelcontainer.add_model(LinearRegression())
modelcontainer.add_model(Ridge(alpha = 1.0))

modelcontainer.add_model(RandomForestRegressor(n_estimators = 60,n_jobs = num_procs, max_depth = 15, min_samples_split = 80, max_features = 8,  verbose = verbose_lvl))

# NOTE(review): loss = 'ls' may not be accepted by newer scikit-learn releases - confirm against the installed version
modelcontainer.add_model(GradientBoostingRegressor(n_estimators = 40, max_depth = 7, loss = 'ls',   verbose = verbose_lvl))

In [17]:
# Cross-validate every registered model with 2 folds; scores holds the mean squared error per model
# NOTE(review): merged_data contains group statistics computed from the target of these same rows - check for target leakage
modelcontainer.cross_validate(feature_generator.merged_data,data.train_df.iloc[:,-1], k=2)
modelcontainer.scores

{LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False): 1499.1246097719795,
 Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
       normalize=False, random_state=None, solver='auto', tol=0.001): 1499.1246097032472,
 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=15, max_features=8, max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=80, min_weight_fraction_leaf=0.0,
                       n_estimators=60, n_jobs=4, oob_score=False,
                       random_state=None, verbose=0, warm_start=False): 1500.9937395431825,
 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                           init=None, learning_rate=0.1, loss='ls', max_depth=7,
                           max_features=None, max_leaf_nodes=None,


In [18]:
# Pick the model with the lowest cross-validated score
modelcontainer.select_best_model()

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [19]:
# Fit the selected model on the training columns between the first and the last column
# NOTE(review): the fit uses data.train_df, not feature_generator.merged_data that was used for cross-validation
modelcontainer.best_model_fit(data.train_df.iloc[:,1:-1],data.train_df.iloc[:,-1])

In [20]:
# Predict on the test columns after the first one; the result is stored in modelcontainer.predictions
modelcontainer.best_model_predict(data.test_df.iloc[:,1:])

In [21]:
# Print the summary of the selected model (feature importances when the model exposes them)
modelcontainer.print_summary()


Model Summary:

Feature Importance does not exist for the current model


Saving the model:

In [22]:
# Persist the selected model to disk with joblib
# NOTE(review): the file holds a serialized model, not CSV text, although its name ends in .csv - consider a different extension
filename = 'CompensationPredictor.csv'
joblib.dump(modelcontainer.best_model, filename)

['CompensationPredictor.csv']

The saved model file can later be loaded with joblib to make predictions on future data.