<a href="https://colab.research.google.com/github/IshantWadhwa4/MLFramework/blob/master/Code_ML_FrameWork_Tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Framework 



## Table of Contents

1.   [Get Data](#Section1)
2.   [Basic EDA](#Section2)
3.   [Pre-Modeling](#Section3)
4.   [Modeling](#Section4)
5.   [Post Modeling](#Section5)

<a id = Section1></a>
## 1. Get Data

### Get Data from Database

### Get Data from CSV/Text/Excel Files

In [0]:
import pandas as pd
# Remote CSV that is loaded into the notebook's working DataFrame `df`.
data_path = 'https://storage.googleapis.com/industryanalytics/trans_fraud_data.csv'
df = pd.read_csv(data_path)

### Get Data from APIs

### Web scraping

<a id = Section2></a>
## 2. Basic EDA

### 1. Pandas Profiling

In [0]:
!pip install pandas_profiling==2.5.0

In [0]:
import pandas_profiling
# Build the profiling report for the loaded DataFrame.
report = pandas_profiling.ProfileReport(df)
# Convert the profile report to an HTML file
report.to_file("EDA.html")

### 2. DataFrame info,describe,head 

In [0]:
df.head(10)

In [0]:
df.info(verbose=True)

In [0]:
# pd.options.display.float_format = "{:.2f}".format
df.describe()

### 3. Null and Zero values

In [0]:
# LIb : need pandas lib for this function.
# Parameters : Only required dataframe for which you want zeros and null for each column
# return: dataframe with number of zeros and null

def get_number_zeros_null(df):
  '''
       Count the missing values and the exact zeros of every column.

       Lib   : needs pandas (available as pd).

       Input : dataframe to inspect
       Output: dataframe with one row per statistic ('Number_of_nulls',
               'Number_of_zeros') and one column per column of the input
  '''
  nulls_per_column = df.isna().sum()
  zeros_per_column = df.eq(0).astype(int).sum()
  counts = pd.DataFrame({'Number_of_nulls': nulls_per_column,
                         'Number_of_zeros': zeros_per_column})
  # Transpose so the statistics are the rows and the input columns stay columns.
  return counts.T

### 4. Datatype of Columns

In [0]:
# convert type of columns

def convert_type(df,list_column,list_type):
  '''
    Cast the listed columns to the listed data types, modifying df in place.

    Input: dataframe, list of columns you want to convert, list of target types
           (list_type[i] is the type applied to list_column[i])
    output: the same pandas data frame, with the listed columns converted
  '''
  for position, column_name in enumerate(list_column):
    target_type = list_type[position]
    converted = df[column_name].astype(target_type)
    df[column_name] = converted
  return df

### 5. Distribution of Data Visualization

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

In [0]:
def get_numeric_data_columns(df):
  '''
      List the names of the columns that pandas reports as numeric data.
  '''
  numeric_part = df._get_numeric_data()
  return [name for name in numeric_part.columns]

In [0]:
def get_catagorical_data_columns(df):
  '''
      return list of all catagoric (i.e. non-numeric) data columns name,
      in the same order as the columns appear in the dataframe
  '''
  numeric_names = set(df._get_numeric_data().columns)
  # A list comprehension over df.columns keeps the original column order;
  # a plain set difference would return the names in arbitrary order.
  return [name for name in df.columns if name not in numeric_names]

#### 4.1. Categorical Data Distribution (Single Column)
    

In [0]:
def draw_countPlot_grid(df):
  '''
      Draw a grid (3 plots per row) of count plots, one for every catagorical
      column that has fewer than 8 distinct values.

      Input: dataframe
      Plot: one seaborn countplot per qualifying column (nothing is returned)
  '''
  import math
  n_cols = 3
  # Pick the columns to plot BEFORE sizing the grid, so that skipped
  # high-cardinality columns neither leave holes nor shrink the plots.
  list_columns = [col for col in get_catagorical_data_columns(df)
                  if len(df[col].unique()) < 8]
  fig=plt.figure(num=None, figsize=(12, 10), dpi=80, facecolor='w', edgecolor='k')
  n_rows = math.ceil(len(list_columns)/n_cols)
  for i, var_name in enumerate(list_columns):
    ax=fig.add_subplot(n_rows,n_cols,i+1)
    # Pass the axes explicitly instead of relying on the "current axes" state.
    sns.countplot(x = var_name, data=df, ax=ax)
    ax.set_title(var_name+" Distribution")
  fig.tight_layout()  # Improves appearance a bit.
  plt.show()

#### 4.2. Numerical Data Distribution Single column
    

In [0]:
def draw_distributionPlot_grid(df):
  '''
      Plot the distribution of every numeric column in a grid, 3 plots per row.

      Input: dataframe
      Plot: one seaborn distplot per numeric column (nothing is returned)
  '''
  import math
  figure = plt.figure(num=None, figsize=(12, 10), dpi=80, facecolor='w', edgecolor='k')
  numeric_names = get_numeric_data_columns(df)
  plots_per_row = 3
  row_count = math.ceil(len(numeric_names)/plots_per_row)
  # Subplot positions are 1-based, hence start=1.
  for position, name in enumerate(numeric_names, start=1):
    axis = figure.add_subplot(row_count, plots_per_row, position)
    sns.distplot(df[name],hist=True,axlabel=name)
    axis.set_title(name+" Distribution")
  figure.tight_layout()  # Improves appearance a bit.
  plt.show()

In [0]:
# How can we inspect the distribution of the data when there are more than 5000 columns?
# Solution: compute the SD of every column; if the SD is large, the distribution of that column is not good.
import numpy as np

def get_SD_columns(df):
  '''
      Standard deviation (np.std) of every numeric column.

      Input: dataframe
      Output: dataframe with the columns 'columns_name' and 'SD',
              one row per numeric column
  '''
  numeric_names = get_numeric_data_columns(df)
  deviations = [np.std(df[name]) for name in numeric_names]
  return pd.DataFrame({'columns_name': numeric_names, 'SD': deviations})

#### 4.3. Scatter Plot: Relation Between Columns

In [0]:
import plotly.express as px
def plot_scatter_plotlyexpress(df,number_of_rows = 2000):
  '''
      Show a plotly scatter matrix over all columns of df,
      using only the first number_of_rows rows.
  '''
  leading_rows = df[:number_of_rows]
  all_columns = list(df.columns)
  px.scatter_matrix(leading_rows, dimensions=all_columns).show()

In [0]:
import plotly.express as px
# class_column is any catagorical column name (mainly the target variable) used to colour the points
def plot_scatter_class_plotlyexpress(df,class_column,number_of_rows = 2000):
  '''
      Show a plotly scatter matrix over all columns of df, coloured by class_column,
      using only the first number_of_rows rows.

      Input: dataframe, name of the column used for the colour, number of rows to plot

      Note: the parameter cannot be called `class` because that is a reserved
      Python keyword (the definition would be a SyntaxError).
  '''
  fig = px.scatter_matrix(df[:number_of_rows], dimensions= list(df.columns),color = class_column)
  fig.show()

#### 4.4. Heatmap for Correlation

In [0]:
# with all columns
def heatmap_allcolumns(df):
  '''
      Plot an annotated heatmap of the pairwise correlation (df.corr()) of the columns.

      Input: dataframe
      Plot: Heatmap
      NOTE(review): how df.corr() treats non-numeric columns depends on the
      installed pandas version - confirm for the data being used.
  '''
  # seaborn's heatmap takes the matrix through the `data` parameter.
  sns.heatmap(data=df.corr(),annot=True, cmap="Blues")


In [0]:
# create Heatmap for highly co-related(given threshold) columns
# seaborn pandas and numpy

def create_seaborn_heatmap_highcorelated(df,posThreshold,negThreshold):
  '''
      create Heatmap for highly co-related(given threshold) columns

      Only cells with correlation > posThreshold or < -|negThreshold| are kept;
      rows and columns without any such cell are removed from the plot.

      Input: dataframe, positive threshold, negitive threshold
             (the negative threshold may be given as 0.8 or as -0.8)
      Plot: Heatmap (a message is printed instead when nothing passes the thresholds)
  '''
  df_corr = df.corr()
  # Accept the negative threshold with either sign.
  negative_limit = -abs(negThreshold)
  strong = (df_corr > posThreshold) | (df_corr < negative_limit)
  # Hide only the diagonal (a column's correlation with itself).  Replacing every
  # value equal to 1 would also hide genuinely perfectly-correlated column pairs.
  off_diagonal = ~np.eye(len(df_corr), dtype=bool)
  tempdf = df_corr.where(strong & off_diagonal)
  tempdf = tempdf.dropna(axis=1,how='all').dropna(axis=0,how='all')
  if tempdf.empty:
    # Nothing to draw; an empty frame cannot be rendered as a heatmap.
    print('No pair of columns is correlated beyond the given thresholds.')
    return
  sns.heatmap(tempdf,annot=True, cmap="Blues")


#### 4.5. Box plot for Outliers

In [0]:
def draw_boxPlot_grid(df):
  '''
      Draw one box plot per numeric column, laid out 3 per row.

      Input: dataframe
      Plot: one seaborn boxplot per numeric column (nothing is returned)
  '''
  import math
  figure = plt.figure(num=None, figsize=(12, 10), dpi=80, facecolor='w', edgecolor='k')
  numeric_names = get_numeric_data_columns(df)
  plots_per_row = 3
  row_count = math.ceil(len(numeric_names)/plots_per_row)
  # Subplot positions are 1-based, hence start=1.
  for position, name in enumerate(numeric_names, start=1):
    axis = figure.add_subplot(row_count, plots_per_row, position)
    sns.boxplot( y=df[name])
    axis.set_title(name+" Distribution")
  figure.tight_layout()  # Improves appearance a bit.
  plt.show()

In [0]:
## NOTE*** Before use this do dataframe scaling 
# If number of columns are very large we can use this.

# The main motive is to find difference between max value and the quantile_threshold value and 
# draw a line graph to see is there any possible outlier 
# If SD=0 That mean all rows has same value in that column

def get_thresholdDiff_outliers(df,quantile_threshold):
  '''
      For every numeric column, measure how far the maximum lies above the
      value at quantile_threshold. A large gap hints at possible outliers
      (the result can be drawn as a line graph).

      input: dataframe, quantile_threshold (passed to Series.quantile)
      output: df with the columns 'columns_name' and 'Difference'
              (max value minus the quantile_threshold value)
  '''
  numeric_names = get_numeric_data_columns(df)
  gaps = [df[name].max() - df[name].quantile(quantile_threshold)
          for name in numeric_names]
  return pd.DataFrame({'columns_name': numeric_names, 'Difference': gaps})


### 6. Some questions you want answers from your data 

### 7. EDA Results

 Questions:
 1. How many null values and zeros are in the columns?
      1. How will I fill the nulls and zeros? (mean, median, mode, a simple model, or a groupby)
 2. Distribution of data
      1. Is upsampling (SMOTE) or downsampling needed?
      2. Is a log transformation needed to get a normal distribution?
      3. Is scaling of the data needed?
      4. Is there bias in the data that needs to be solved?
 3. Heat map
      1. Correlation between columns (which columns are required?)
          Highly correlated columns can be removed.
 4. Outliers
      1. Do outliers need to be removed, or should their value be changed to a percentile value (e.g. the 75th)?

<a id = Section3></a>
## 3. Pre Modeling

### 1. Solve for null and zero values

In [0]:
get_number_zeros_null(df)
# 1. First look at the percentage of null values: if it is more than 60% and there is no chance of getting the values, we can skip/remove that column.
# 2. Check whether zeros have a meaning in the data or are just a representation of null; if they stand for null, replace all zeros with null (np.nan).
# Solutions
# 1. Replace nulls with the mean, median or mode (this can bias the data).
# 2. Group by some columns, calculate the mean, median or mode per group, and replace the nulls with it.
# 3. Split the rows into null and not-null; train a model (classification/regression) on the not-null rows and predict the null values with it.


In [0]:
# Task 1

# Remove column if it has more than threshold null values(threshold in percentage % )

def remove_null_columns(df,threshold):
  '''
      List the columns whose percentage of null values is at least the threshold.
      The dataframe itself is not changed; the caller drops the returned columns.

      Input: Dataframe,threshold (percentage of null rows)
      Output: list of columns to be drop
  '''
  null_counts = get_number_zeros_null(df).loc['Number_of_nulls']
  null_percentage = (null_counts/df.shape[0])* 100
  return [col for col, pct in null_percentage.items() if pct >= threshold]


In [0]:
# Task 2

# Replace null with mean median or mode, for numeric values median for catagorical value with mode

def _null_fill_value(series):
  '''
      Pick the value used to fill the nulls of one column: the median for a
      numeric column, the most frequent value (first mode) for any other column.
      Returns None when there is nothing to fill with (e.g. the column is all null).
  '''
  if pd.api.types.is_numeric_dtype(series):
    median = series.median()
    return None if pd.isna(median) else median
  modes = series.mode()
  # mode() returns a Series (ties give several values, an all-null column gives none).
  # Passing that Series to fillna would align on the index and fill almost nothing,
  # so a single scalar is taken from it.
  return modes.iloc[0] if len(modes) else None


def replace_null(df):
  '''
      Replace the nulls of every column: numeric columns with their median,
      catagorical columns with their mode.

      Input: dataframe (modified in place)
      Output: the same dataframe
  '''
  return replace_null_columns(df, list(df.columns))


def replace_null_columns(df,list_columns):
  '''
      Same as replace_null, but only for the columns named in list_columns.
      Names that are not columns of df are ignored.

      Input: dataframe (modified in place), list of column names
      Output: the same dataframe
  '''
  for col in list_columns:
    if col not in df.columns:
      continue
    fill_value = _null_fill_value(df[col])
    if fill_value is None:
      continue
    # Assign the result back instead of calling fillna(inplace=True) on df[col]:
    # the in-place call on a selected column is not guaranteed to update df.
    df[col] = df[col].fillna(fill_value)
  return df





### 2. Solve for Outliers

In [0]:
# First check the box plot; if the outliers are meaningful, do nothing. Otherwise you can do the tasks below.
# Task 1: If a column has very few outliers (around 1%), we can remove those rows.
# Task 2: If there are many outliers, you can replace them with null and treat them as null values.

#### 2.1 Solve for SD is zero

In [0]:
# if any column has zero SD that mean all the values are same so we can remove that column
def remove_column_SD_Zero(df):
  '''
      List the numeric columns whose standard deviation (np.std) is zero,
      i.e. columns in which every value is the same.
      The dataframe itself is not changed; the caller drops the returned columns.

      Input: dataframe
      Output: list of columns to be drop
  '''
  return [col for col in get_numeric_data_columns(df) if np.std(df[col]) == 0]

# Drop the constant (zero-SD) columns of the working dataframe `df`.
df.drop(remove_column_SD_Zero(df),axis=1,inplace=True)

### 3. Solve for Imbalanced Data

### 4. Binning

### 5. Feature Engineering 

#### 5.1. Grouping Operations

#### 5.2. Feature Split

#### 5.3. Extracting Date

#### Explore featuretools lib


### 6.Feature Selection


#### 6.1. Correlation 

#### 6.2. Chi-Squared

#### 6.3. Recursive Feature Elimination

#### 6.4. Lasso: SelectFromModel

#### 6.5. Tree-based: SelectFromModel

#### We can use above all

### 7. Encoding

#### 7.1 One-Hot Encoding

#### 7.2 Label Encoding 

#### 7.3 Helmert Encoding

#### 7.4 Hashing Encoding

#### 7.5 M-estimator Encoding

### 8. Dimensionality Reduction

#### 8.1. Principal Component Analysis(PCA)

In [0]:
# Dimensionality reduction using PCA from sklearn

# Always scale the data first to improve performance
# Apply the dimensionality reduction to X (the feature columns)
# This speeds up computation without changing the accuracy much
# The columns created by PCA are not directly interpretable (we do not know what they represent)

def dimred_PCA(X,information_loss):
  '''
    Reduce the number of columns of X with PCA while keeping
    (100 - information_loss) percent of the variance.

    input: X is the feature (independent variables) dataframe,
           information_loss is how much info loss is good for you in percentage,
           0 <= information_loss < 100
    output: the array returned by pca.transform(X)
    raises: ValueError when information_loss is outside [0, 100)
  '''
  from sklearn.decomposition import PCA
  if not 0 <= information_loss < 100:
    raise ValueError('information_loss must be a percentage in [0, 100), got {}'.format(information_loss))
  info = 1 - (information_loss/100)
  # PCA treats a float n_components as "fraction of variance to keep" only when it
  # lies strictly between 0 and 1; with no loss allowed, keep every component.
  n_components = info if info < 1 else None
  pca = PCA(n_components).fit(X)
  #Variance_Explained_PCA_graph (pca)
  print('number of columns left are: {}'.format(pca.n_components_))
  transform_df = pca.transform(X)
  return transform_df


def Variance_Explained_PCA_graph (pca):
  '''
      Plot the cumulative percentage of variance explained against the
      number of principal components kept.

      input: fitted PCA object (pca.explained_variance_ratio_ is read)
      Plot: line graph on the current matplotlib axes
  '''
  var=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3)*100)
  plt.ylabel('% Variance Explained')
  plt.xlabel('Number of Features')
  plt.title('PCA Analysis')
  plt.ylim(30,100.5)
  # var[0] is the variance explained by ONE component, so the x axis starts at 1.
  plt.plot(np.arange(1, len(var)+1), var)



#### 8.2 Linear Discriminant Analysis (LDA)

### 9. Scaling of data

In [0]:
# Scaling of the value X
# StandardScaler

from sklearn.preprocessing import StandardScaler
# NOTE: X is only created further down, in the data split cell of section 4
# (X = df.drop('target', ...)); that cell must be run before this one.
sc = StandardScaler()
# Fit the scaler on X, then transform the same X with it.
sc.fit(X)
x_sc = sc.transform(X)
# Wrap the scaled values in a DataFrame that reuses the column names of X.
x_sc_df = pd.DataFrame(x_sc,columns=X.columns)

### 10. Log Transformation

<a id = Section4></a>
## 4. Modeling

### 1. Data Splitting

In [0]:
# Normal Split
# Spliting of data
from sklearn.model_selection import train_test_split

# Label column and feature columns (every column except 'target').
y = df['target']
X = df.drop(columns='target')

# 70% train / 30% test, with a fixed random_state for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [0]:
# ShuffleSplit can be used as the CV in cross validation and grid search; each split works like a train/test split

# n_splits is the number of splits and test_size is the percentage of rows you want in the test data set
from sklearn.model_selection import ShuffleSplit

def do_shufflesplit(n_splits,test_size):
  '''
      Build a ShuffleSplit object (random_state fixed to 0).

      Input: n_splits number of splits,
             test_size percentage of rows for the test set (e.g. 30);
             the remaining percentage is used as train size
      Output: ShuffleSplit object
  '''
  test_fraction = test_size/100
  train_fraction = 1-test_fraction
  return ShuffleSplit(n_splits = n_splits, test_size = test_fraction,
                      train_size = train_fraction, random_state = 0)

### 2. Cross Validation 

In [0]:
# Cross validation with SKlearn
from sklearn.model_selection import cross_validate

def do_crossValidation(algo_object,X,y,cv,scoring):
  '''
      Run sklearn cross_validate for one estimator and summarise the result.

      Input: algo_object is the estimator; X, y, cv and scoring are passed
             on to cross_validate
      Return : data frame built by get_crossValidation_Result
               (one summary for the estimator's class name)
  '''
  algo_name = algo_object.__class__.__name__ 
  cc = cross_validate(algo_object, X, y, cv=cv, return_train_score=True, return_estimator=True, n_jobs=-1, scoring=scoring)
  return get_crossValidation_Result(algo_name,cc)

In [0]:
def get_crossValidation_Result(algo_name,result):
  '''
      Summarise a cross validation result as a one-row data frame.

      Input: algo_name name shown in the 'Algo Name' column,
             result mapping with the arrays 'fit_time', 'train_score' and 'test_score'
      Output: data frame with a single row holding the mean fit time, the mean
              train and test score and 3 times the standard deviation of the test score
  '''
  result_dict = {}
  result_dict['Algo Name'] = algo_name
  result_dict['Time'] = result['fit_time'].mean()
  result_dict['Algo Train Accuracy Score']= result['train_score'].mean()
  result_dict['Algo Test Accuracy Score'] = result['test_score'].mean()
  result_dict['Algo Test Accuracy 3*STD'] = result['test_score'].std()*3
  # All values are scalars: wrap the dict in a list so pandas builds ONE row
  # (a bare dict of scalars raises "you must pass an index").
  return pd.DataFrame([result_dict])

### 3. Grid Search

In [0]:
# Lavi Code for GridSearch

# Helper Class for Initilizing GridSearch

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:
    '''
    Run GridSearchCV for several estimators and collect all scores in one table.

    Attributes set by __init__:
      models         dict: name -> estimator
      params         dict: name -> parameter grid for that estimator
      keys           the names of the models
      grid_searches  dict: name -> fitted GridSearchCV (filled by fit)
      best_params    dict: name -> str(best_params_) of that search (filled by fit)
    '''

    # Init function as we create object of this class this function call
    def __init__(self, models, params):
        '''
        models and params are dict

        Raises ValueError when a model name has no entry in params.
        '''
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
        self.best_params = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=True):
        '''
        Run one GridSearchCV per model on X, y and store the fitted searches.

        cv, n_jobs, verbose, scoring and refit are passed on to GridSearchCV.
        '''
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs
            self.best_params[key]  = str(gs.best_params_)

    def score_summary(self, sort_by='mean_score'):
        '''
        Build one row per (estimator, parameter combination) with the min, max,
        mean and std of its per-split test scores, sorted by sort_by (descending).

        Reads the "split<i>_test_score" entries of cv_results_.
        Raises ValueError when no grid search has been fitted yet.
        '''
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k, gs in self.grid_searches.items():
            params = gs.cv_results_['params']
            scores = []
            # n_splits_ is the number of splits actually used by the fitted search;
            # `cv` itself may be None or a splitter object, which range() cannot take.
            for i in range(gs.n_splits_):
                key = "split{}_test_score".format(i)
                r = gs.cv_results_[key]
                scores.append(r.reshape(len(params),1))

            # One row per parameter combination, one column per split.
            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        if not rows:
            raise ValueError("No grid search results available; call fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Fixed summary columns first, then the parameter columns.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

### 4. Model *Libs*

### 5. List of Model and Parameters

In [0]:
# List of models

# The estimator modules used in the list are imported here so that this cell
# also runs on a fresh kernel.
from sklearn import ensemble, linear_model, naive_bayes, neighbors, svm, tree
from xgboost import XGBClassifier

# Candidate classifiers, each created with its default settings
# (SVC with probability=True).
ML_classification_Algo = [
    # Ensemble Learning   
    ensemble.RandomForestClassifier(),    
    #GLM
    linear_model.LogisticRegressionCV(),
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
    svm.SVC(probability=True),
    
    #Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()    
    ]

### 6. Model Implementation

<a id = Section5></a>
## 5. Post Modeling

### 1. Accuracy

### 2. Confusion Matrix

### 3. Precision, Recall, F1, AUC ROC

### 4. RMSE

## 6. ML Interpretation | Explainable AI

LIME and SHAP lib

## 7. Model Deployment | MLOps

## 8. Create Dashboard