In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import missingno as mssno
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder
from imblearn.over_sampling import SMOTE,SMOTENC,RandomOverSampler
from sklearn.metrics import accuracy_score,classification_report,f1_score,confusion_matrix

from sklearn.feature_selection import mutual_info_classif,SelectKBest,chi2
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import joblib

# Import Data

In [None]:
df=pd.read_csv("/kaggle/input/iip-data/data.csv")

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
print("Rows-",df.shape[0])
print("Columns-",df.shape[1])

In [None]:
df.info()

In [None]:
## dropping the ID column
df=df.drop(['ID'],axis=1)

In [None]:
df.describe(exclude=object).T

> # 1. Exploratory Data Analysis

In [None]:
numerical_features = df.select_dtypes(exclude=object)
print("Numerical Features Names = ",numerical_features.columns)
print("Numerical Features Count",len(numerical_features.columns))

discrete_features=[col for col in numerical_features.columns if len(df[col].unique())<25]
print("Discrete Numerical Features",len(discrete_features))
continous_features =  [ col for col in numerical_features.columns if col not in discrete_features]
print("Continous Numerical Features",len(continous_features))

In [None]:
categorical_features=df.select_dtypes(include=object)
print("Categorical Features Names =",categorical_features.columns)
print("Categorical Features Count",len(categorical_features.columns))

In [None]:
## Treat the " ?" placeholder as a missing value.
## np.nan is the documented constant; upper-case aliases such as np.NaN were removed in NumPy 2.0.
df = df.replace(" ?", np.nan)

## Handling missing values

In [None]:
df.isnull().sum()

In [None]:
# Percentage of missing values per column, restricted to columns that have at least one
def missing_values(df):
    """Return the share of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with a single '% Missing' column holding the
        percentage (0-100) of null entries. Columns without nulls are left out.
    """
    pct_missing = (100 * df.isnull().sum() / df.shape[0]).to_frame('% Missing')
    has_missing = pct_missing['% Missing'] > 0
    return pct_missing[has_missing]
     

In [None]:
missing_values(df)

In [None]:
def plot_count(df, col,title_name):
    """Show the value counts of ``df[col]`` as a donut chart next to a horizontal bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to summarise.
    col : str
        Name of the column whose ``value_counts()`` are plotted.
    title_name : str
        Text used as the figure's super-title.

    Raises
    ------
    ValueError
        If ``df[col].value_counts()`` is empty, i.e. there is nothing to draw.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    s1 = df[col].value_counts()
    N = len(s1)
    if N == 0:
        raise ValueError(f"Column {col!r} has no values to plot")

    # rc_context applies the background colour only while this figure is built and
    # restores the caller's setting afterwards, even if plotting raises.
    with plt.rc_context({'figure.facecolor': '#FFFAF0'}):
        f, ax = plt.subplots(1, 2, figsize=(12, 6))
        plt.subplots_adjust(wspace=0.1)

        outer_sizes = s1
        inner_sizes = s1/N

        outer_colors = ['#9E3F00', '#eb5e00', '#ff781f']
        inner_colors = ['#ff6905', '#ff8838', '#ffa66b']

        # Outer ring: category labels; the last (smallest) slice is pulled out further.
        ax[0].pie(
            outer_sizes,colors=outer_colors,
            labels=s1.index.tolist(),
            startangle=90, frame=True, radius=1.4,
            explode=([0.05]*(N-1) + [.3]),
            wedgeprops={'linewidth' : 1, 'edgecolor' : 'white'},
            textprops={'fontsize': 12, 'weight': 'bold'}
        )

        textprops = {
            'size': 13,
            'weight': 'bold',
            'color': 'white'
        }

        # Inner ring: same proportions, annotated with percentages.
        ax[0].pie(
            inner_sizes, colors=inner_colors,
            radius=1, startangle=90,
            autopct='%1.f%%', explode=([.1]*(N-1) + [.3]),
            pctdistance=0.8, textprops=textprops
        )

        # White disc in the middle turns the stacked pies into a donut.
        center_circle = plt.Circle((0,0), .68, color='black', fc='white', linewidth=0)
        ax[0].add_artist(center_circle)

        x = s1
        y = s1.index.tolist()
        sns.barplot(
            x=x, y=y, ax=ax[1],
            palette='YlOrBr_r', orient='horizontal'
        )

        ax[1].spines['top'].set_visible(False)
        ax[1].spines['right'].set_visible(False)
        ax[1].tick_params(
            axis='x',
            which='both',
            bottom=False,
            labelbottom=False
        )

        # Write each count at the end of its bar.
        for i, v in enumerate(s1):
            ax[1].text(v, i+0.1, str(v), color='black', fontweight='bold', fontsize=12)

        plt.setp(ax[1].get_yticklabels(), fontweight="bold")
        plt.setp(ax[1].get_xticklabels(), fontweight="bold")
        # Bars are horizontal: counts run along x, categories along y.
        ax[1].set_xlabel('count', fontweight="bold", color='black')
        ax[1].set_ylabel(col, fontweight="bold", color='black')

        f.suptitle(f'{title_name}', fontsize=18, fontweight='bold')
        plt.tight_layout()
        plt.show()

In [None]:
plot_count(df, 'income_above_limit', 'income_above_limit Distribution')

In [None]:
# Shared font settings for every panel of the 2x2 grid
title_font = {'fontname': 'Monospace', 'fontsize': 20, 'fontweight': 'bold'}
label_font = {'fontname': 'Monospace', 'fontsize': 20}

fig, axs = plt.subplots(2, 2, figsize=(24, 15))

total_employed = df['total_employed'].value_counts()
income = df['income_above_limit'].value_counts()
citizenship = df['citizenship'].value_counts()
edu = df['education'].value_counts()

# One row per panel:
# (axes, barplot x, barplot y, palette, title, x label, y label, tick label size)
panels = [
    (axs[0,0], total_employed.index, total_employed.values, 'bright',
     'Distribution of total_employed', 'total_employed', 'Number of families', 10),
    (axs[0,1], income.index, income.values, 'bright',
     'Distribution of Income limit', 'Income', 'Number of people', 10),
    (axs[1,0], citizenship.values, citizenship.index, 'bright',
     'Distribution of citizenship', 'Number of people', 'citizenship', 10),
    (axs[1,1], edu.values, edu.index, 'Paired',
     'Distribution of Education', 'Number of people', 'Education', 12),
]

for panel_ax, x_vals, y_vals, palette, title, xlabel, ylabel, tick_size in panels:
    sns.barplot(x=x_vals, y=y_vals, palette=palette, ax=panel_ax)
    panel_ax.set_title(title, fontdict=title_font)
    panel_ax.set_xlabel(xlabel, fontdict=label_font)
    panel_ax.set_ylabel(ylabel, fontdict=label_font)
    panel_ax.tick_params(labelsize=tick_size)

# Keep the panels from overlapping, then render
plt.tight_layout()
plt.show()

In [None]:
df1=df.copy(deep=True)
df1['education'].value_counts()

In [None]:
# Collapse every pre-diploma school grade into a single 'upto 12th' bucket.
# The replacement is limited to the education column so that identical strings
# in any other column are never rewritten by accident.
school_grades = [' 7th and 8th grade', ' Less than 1st grade',
                 ' 1st 2nd 3rd or 4th grade',
                 ' 5th or 6th grade', ' 9th grade', ' 11th grade',
                 ' 10th grade', ' 12th grade no diploma']
df1['education'] = df1['education'].replace(school_grades, 'upto 12th')

In [None]:
df1['education'].value_counts().plot(kind='barh')

In [None]:
# Share (in %) of the 'upto 12th' group that falls on each side of the income limit
upto_12th = df1[df1['education']=='upto 12th']
for label, outcome in (("Income below limit for people upto 12th:", 'Below limit'),
                       ("Income Above limit for people upto 12th:", 'Above limit')):
    share = len(upto_12th[upto_12th['income_above_limit']==outcome])/len(upto_12th)*100
    print(label, share)

<div class="alert alert-success" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 

About 99% of people whose education goes only up to 12th grade fall below the income limit.

In [None]:
df['citizenship'].value_counts()

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: number of rows per citizenship category
x_data=df['citizenship'].value_counts()
axs[0].set_title("Types of citizens")
sns.barplot(x=x_data.values,y=x_data.index,ax=axs[0])

# Right panel: fraction of each group (native vs. everyone else) below the income limit
axs[1].set_title("Types of Citizens By Income limit")
is_native = df['citizenship']=='Native'
is_below = df['income_above_limit']=='Below limit'
data1 = len(df.loc[is_native & is_below])/len(df.loc[is_native])
data2 = len(df.loc[~is_native & is_below])/len(df.loc[~is_native])
series = pd.Series({
    'Native citizenship having income below limit': data1,
    'Non-Native citizenship having income below limit': data2,
})
sns.barplot(y=series.values,x=series.index,ax=axs[1])
plt.show()

<div class="alert alert-success" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 
Native and non-native citizens have about the same rate of income below the limit.

In [None]:
# Percentage of each employment group whose income is below the limit
below_limit = df['income_above_limit']=='Below limit'
employment_groups = {
    'Employed more than 1 but income is below limit:': df['total_employed']>1,
    'Employed only 1 but income is below limit': df['total_employed']==1,
    'None employed and income is below limit': df['total_employed']==0,
}
data = pd.Series({
    label: len(df.loc[group & below_limit])/len(df.loc[group]) * 100
    for label, group in employment_groups.items()
})
sns.barplot(y=data.index,x=data.values)
plt.show()

In [None]:
print('People above income limit with more than 1 employed:',len(df.loc[(df['total_employed']>1) & (df['income_above_limit']=='Above limit')]))
print('People above income limit with only 1 employed:',len(df.loc[(df['total_employed']==1) & (df['income_above_limit']=='Above limit')]))

In [None]:
# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One row per panel, filled left-to-right, top-to-bottom:
# (column, axis/title label, number of bins, bar colour or None for the seaborn default)
hist_specs = [
    ('age', 'Age', 30, None),
    ('working_week_per_year', 'working_week_per_year', 30, 'red'),
    ('importance_of_record', 'importance_of_record', 50, 'blue'),
    ('wage_per_hour', 'wage_per_hour', 35, None),
]

for panel, (column, label, n_bins, colour) in zip(axes.flat, hist_specs):
    sns.histplot(df[column], bins=n_bins, color=colour, ax=panel)
    panel.set_title(f'Distribution of {label}', fontsize=16)
    panel.set_xlabel(label, fontsize=12)
    panel.set_ylabel('Number of people', fontsize=12)

# Keep the panels from overlapping, then render
plt.tight_layout()
plt.show()

In [None]:
race = df['race'].value_counts()

plt.figure(figsize=(20, 10))
plt.pie(race.values, labels=race.index,
        startangle=50, autopct='%1.1f%%')
centre_circle = plt.Circle((0, 0), 0.7, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
plt.title('race distribution', fontdict={
          'fontname': 'Monospace', 'fontsize': 30, 'fontweight': 'bold'})
plt.axis('equal')
plt.legend(prop={'size': 15},loc='upper left')
plt.show()

In [None]:
# Percentage of each gender group whose income is below the limit
is_male = df['gender']==' Male'
is_below = df['income_above_limit']=='Below limit'
male_share = len(df.loc[is_male & is_below])/len(df.loc[is_male]) * 100
female_share = len(df.loc[~is_male & is_below])/len(df.loc[~is_male]) * 100
print('Males with income below limit:', male_share, '%')
print('Females with income below limit:', female_share, '%')

<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 
There is no sign of gender discrimination: the difference between the two rates is not large.

In [None]:
# Creating a countplot of income across age
plt.figure(figsize=(20, 7))
sns.countplot(x=df['age'], hue=df['income_above_limit'])
plt.title('Distribution of Income across Age', fontdict={
          'fontname': 'Monospace', 'fontsize': 20, 'fontweight': 'bold'})
plt.xlabel('Age', fontdict={'fontname': 'Monospace', 'fontsize': 15})
plt.ylabel('Number of people', fontdict={
           'fontname': 'Monospace', 'fontsize': 15})
plt.tick_params(labelsize=12)
plt.xticks(rotation=90)
plt.legend(loc=1, prop={'size': 15})
plt.show()

In [None]:
# Creating a countplot of income across Marital Status
plt.figure(figsize=(20, 7))
sns.countplot(x=df['marital_status'], hue=df['income_above_limit'])
plt.title('Income across Marital Status', fontdict={
          'fontname': 'Monospace', 'fontsize': 20, 'fontweight': 'bold'})
plt.xlabel('Marital Status', fontdict={
           'fontname': 'Monospace', 'fontsize': 15})
plt.ylabel('Number of people', fontdict={
           'fontname': 'Monospace', 'fontsize': 15})
plt.tick_params(labelsize=12)
plt.legend(loc=1, prop={'size': 15})
plt.show()

In [None]:
# Pearson correlation between the non-object (numeric) columns only;
# select_dtypes(exclude=object) leaves out every string-valued column.
# NOTE(review): income_above_limit holds the strings 'Below limit'/'Above limit',
# so it presumably is not part of this matrix - confirm before reading target
# correlations off the heatmap.
df_num=df.select_dtypes(exclude=object)
corr = df_num.corr(method='pearson')
with sns.axes_style("ticks"):
    f, ax = plt.subplots(figsize=(15, 10))
    # annot=True writes each coefficient inside its cell
    ax = sns.heatmap(corr, square=True,annot=True)
plt.show()

<div class="alert alert-success" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 

Most people in the data are young, white, male high-school graduates working 50 weeks per year.
From the correlation heatmap, we can see that the dependent feature 'income' is highly correlated with capital gains, total_employed, industry code and working weeks per year.

> # 2.Data Preprocessing

### Handling Missing Values

In [None]:
missing_df=missing_values(df)

In [None]:
print(f"{len(missing_df)} Column(s) have missing values")

In [None]:
print(f"{missing_df[missing_df['% Missing']>50].shape[0]} Column(s) have missing values more than 50%")
missing_df[missing_df['% Missing']>50].index

In [None]:
## Dropping columns having missing values greater than 50%
columns_to_drop=['class', 'education_institute', 'unemployment_reason', 'is_labor_union',
       'occupation_code_main', 'under_18_family',
       'veterans_admin_questionnaire', 'migration_code_change_in_msa',
       'migration_prev_sunbelt', 'migration_code_move_within_reg',
       'migration_code_change_in_reg', 'residence_1_year_ago',
       'old_residence_reg', 'old_residence_state']

In [None]:
print('The shape of data before dropping columns with high proportion of NaN values is - ')
print(f'The shape of data is {df.shape}')

df = df.drop(columns_to_drop,axis=1)

print('---------------------------------------------------------------------------------------------------\n')
print('The shape of data after dropping columns with high proportion of NaN values is - ')
print(f'The shape of data is {df.shape}')

In [None]:
missing_values(df)

<div class="alert alert-danger" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 

The country-of-birth columns (own, father, mother) are not considered important features and are unlikely to have any effect on the model.

In [None]:
df = df.drop(['country_of_birth_own','country_of_birth_father','country_of_birth_mother'],axis=1)

In [None]:
len(missing_values(df))

In [None]:
df.shape

### Dealing With Outliers 

In [None]:
df.plot(kind="box", 
                vert=True,
                subplots=True, 
                layout = (6,5), 
                figsize=(20,20), 
                sharex=False, 
                sharey=False);

<div class="alert alert-danger" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    
These columns have the most outliers:
'wage_per_hour', 'gains', 'losses', 'stocks_status', 'importance_of_record'
<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 

Some of the possible ways to tackle this outlier is to transform values, clip values, drop values or to retain information

<div class="alert alert-danger" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 

* Clipping doesn't seem like a good option, as it would discard correct information
* We will check the number of outliers per column to see what can be done

In [None]:
# Percentage of values outside the 1.5*IQR fences for the columns flagged in the box plots
for i in ['wage_per_hour','gains','losses','stocks_status','importance_of_record']:
    q1=df[i].quantile(0.25)
    q3=df[i].quantile(0.75)
    IQR=q3-q1
    lower=q1-(IQR*1.5)
    upper=q3+(IQR*1.5)

    # One vectorised mask replaces two Python-level passes over every value;
    # missing values compare False on both sides, so they are never counted.
    n_outliers=int(((df[i]<lower) | (df[i]>upper)).sum())
    print(f'The percentage of outliers in {i}:',(n_outliers/len(df)*100),'%')
    

<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">

As the percentage of outliers is small, we will proceed with retaining the information and check whether a transformation can help

In [None]:
df.sample(5)

In [None]:
df=df.drop(['household_stat'],axis=1)

In [None]:
# Recompute both feature groups from the current frame (columns were dropped above).
# The numeric group was previously assigned to a misspelled name, so the count
# printed here and every later use of numerical_features referred to a stale frame.
numerical_features=df.select_dtypes(exclude=object)
categorical_features=df.select_dtypes(include=object)
print("Numerical Features Count",len(numerical_features.columns))
print("Categorical Features Count",len(categorical_features.columns))

### Handling Imbalanced Data

In [None]:
print('The class Imbalance in the data is given below')
print(df['income_above_limit'].value_counts(),'\n')
print('The class imbalance in terms of percentage is given below ')
print(f"{df['income_above_limit'].value_counts(normalize=True)*100}")

<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 

Clearly we have a highly imbalanced dataset, so we need to take steps to mitigate the imbalance. We are going to upsample the minority class (here, the minority class is 'Above limit').
<div class="alert alert-danger" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 

Downsampling would not be a good choice here, as it would reduce our samples from 200000 to 24000

In [None]:
X=df.drop(['income_above_limit'],axis=1)
y=df['income_above_limit']

In [None]:
#creating the column_name index dictionary
cat_col_index={col_name:i for i,col_name in enumerate(X.columns) if col_name in categorical_features.columns}

list(cat_col_index.values())

In [None]:
oversampling=SMOTENC(categorical_features=list(cat_col_index.values()),random_state=37)

In [None]:
X_resampled,y_resampled=oversampling.fit_resample(X,y)

In [None]:
X_resampled.shape

In [None]:
y_resampled.shape

In [None]:
y_resampled.value_counts()

<div class="alert alert-success" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 
    Data is Balanced

### Train-Test Split

In [None]:
# Split the ORIGINAL rows first and oversample the training fold only.
# Resampling before the split leaks information: synthetic rows interpolated from
# test rows end up in training, and the test set itself contains synthetic rows.
# stratify=y keeps the minority-class share identical in both folds.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=91,stratify=y)
X_train,y_train=oversampling.fit_resample(X_train,y_train)

In [None]:
print(f"shape of training dataset{X_train.shape}")
print(f"shape of testing dataset{X_test.shape}")

In [None]:
X_train.reset_index(drop=True,inplace=True)
X_train.head()

In [None]:
X_test.reset_index(drop=True,inplace=True)
X_test.head()

### Encoding

<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 

The correct approach to performing data preparation with a train-test split evaluation is to fit the data preparation on the training set, then apply the transform to the train and test sets. This requires that we first split the data into train and test sets. We can then define the scaling, encoding methods and call the fit() function on the training set, then apply the transform() function on the train and test sets to create a normalized version of each dataset.
    

#### 1.Encoding Features

In [None]:
numerical_features.columns

In [None]:
categorical_features.columns

In [None]:
# Categorical columns that get one-hot encoded
cat_columns=['gender', 'education', 'marital_status', 'race', 'is_hispanic',
       'employment_commitment', 'industry_code_main', 'household_summary',
       'tax_status', 'citizenship']

# Fit the encoder on the training rows only; categories unseen at fit time are
# encoded as all-zero rows later because of handle_unknown='ignore'.
encoder=OneHotEncoder(sparse_output=False,handle_unknown='ignore')
train_dummies=pd.DataFrame(
    encoder.fit_transform(X_train[cat_columns]),
    columns=encoder.get_feature_names_out(cat_columns),
)

# Numeric columns first, one-hot columns after; both sides share the same row index
X_train_encoded=pd.concat([X_train[numerical_features.columns],train_dummies],axis=1)
X_train_encoded.shape

In [None]:
# Apply the already-fitted encoder to the test rows (transform only, no refit)
test_dummies=pd.DataFrame(
    encoder.transform(X_test[cat_columns]),
    columns=encoder.get_feature_names_out(cat_columns),
)
X_test_encoded=pd.concat([X_test[numerical_features.columns],test_dummies],axis=1)
X_test_encoded.shape

#### 2.Encoding Target Variables

In [None]:
# Map the two class labels to 0/1. Series.map builds an integer result directly,
# whereas replace() on an object column relies on silent downcasting, which
# pandas has deprecated.
target_codes={'Below limit':0,'Above limit':1}
y_train_encoded=y_train.map(target_codes)
y_test_encoded=y_test.map(target_codes)

## Model Building

In [None]:
# Candidate classifiers, keyed by a short label.
# The tree-based models are seeded so their scores are reproducible across re-runs.
models={
    "LR":LogisticRegression(),
    "Knnclassifier":KNeighborsClassifier(),
    "dt":DecisionTreeClassifier(random_state=42),
    "rf":RandomForestClassifier(random_state=42)
}


In [None]:
def model_evalutions(models,X_train,X_test,y_train,y_test,params=None):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    models : dict
        Maps a label to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X_train, y_train
        Training features and target passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the target used for scoring.
    params : optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple(dict, dict)
        ``(f1_dict, final_models)``: the F1 score of each model on the test
        data and the fitted estimators, both keyed by the labels of ``models``.
    """
    f1_dict={}
    final_models={}
    for name,model in models.items():
        ## Fit the model
        model.fit(X_train,y_train)
        final_models[name]=model
        ## Score on data the model has not been fitted on
        y_pred=model.predict(X_test)
        f1_dict[name]=f1_score(y_test,y_pred)
    return f1_dict,final_models

In [None]:
f1_score_dict,final_models=model_evalutions(models,X_train_encoded,X_test_encoded,y_train_encoded,
                                            y_test_encoded)

In [None]:
f1_score_dict

<div class="alert alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 
1. Out of all the classifier algorithms, the **Random Forest Algorithm** performs best
2. Performing hyper-parameter tuning in order to improve the accuracy 



In [None]:
params={
 'max_depth': [30,50,70],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400,600]}

<div class="alert alert-success" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
 Using RandomizedSearchCV because of the large dataset

In [None]:
# model=RandomForestClassifier(random_state=77)
# final_model=RandomizedSearchCV(model,param_distributions=params,verbose=1,n_iter=1,scoring='accuracy')

In [None]:
# final_model.fit(X_train_encoded,y_train_encoded)

In [None]:
# final_model.best_estimator_

In [None]:
# trial_model=RandomForestClassifier(n_estimators=1000,max_depth=70,random_state=88)

In [None]:
# trial_model.fit(X_train_encoded,y_train_encoded)

In [None]:
# y_pred=trial_model.predict(X_test_encoded)

In [None]:
# f1_score(y_test_encoded,y_pred)

<div class="alert alert-success" style="font-size:14px; font-family:verdana; line-height: 1.7em;"> 
A slight difference is seen after hyperparameter tuning - model accuracy improved from 96.40% to 96.42%

In [None]:
# joblib.dump(trial_model,"Model_RF.joblib")
# joblib.dump(encoder,"OHE_enc.joblib")

In [None]:
# Persist the fitted one-hot encoder next to the model: predictions on new raw rows
# need the exact column layout the model was trained on.
joblib.dump(final_models['rf'],"final_model.joblib")
joblib.dump(encoder,"OHE_enc.joblib")