In [29]:
# Standard libraries
import sys  # System-specific parameters and functions
import os   # Miscellaneous operating system interfaces
import warnings  # Warning control
warnings.filterwarnings("ignore")

# Data manipulation
import pandas as pd  # Data manipulation and analysis
import numpy as np  # Numerical operations

# Visualization
import matplotlib.pyplot as plt  # Plotting library
import seaborn as sns  # Statistical data visualization
from matplotlib.colors import ListedColormap  # Colormap utilities

# Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler  # Preprocessing tools
from sklearn import model_selection, metrics, preprocessing  # Model selection, evaluation, and preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # Model selection and evaluation
from sklearn.tree import DecisionTreeClassifier  # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier  # Ensemble methods
from xgboost import XGBClassifier  # Extreme Gradient Boosting
from sklearn.neighbors import KNeighborsClassifier  # k-Nearest Neighbors
from sklearn.neural_network import MLPClassifier  # Multi-layer Perceptron

# Statistical analysis
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Variance inflation factor
from scipy.stats import pointbiserialr, chi2_contingency, spearmanr, entropy  # Statistical functions
from statsmodels.graphics.gofplots import qqplot  # Q-Q plot
from collections import Counter  # Container datatypes

# Tabulate
from tabulate import tabulate  # Pretty-print tabular data

# Set visualization style
#sns.set()  # Set Seaborn default style
#plt.style.use('ggplot')  # Set ggplot style for matplotlib

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

# Balancing techniques
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

In [30]:
#import encoded dataset
import io

df = pd.read_csv('Cleaned2.csv')
df.head()

Unnamed: 0,Ville_id,sex,Age,Married,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,91,1,28,1,10,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,57,1,23,1,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,115,1,22,1,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,97,1,27,1,10,4,52667108,19698904,49647648,397715,44042267,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,42,0,59,0,10,6,82606287,17352654,23399979,80877619,74503502,0,0,0,53384566,20731006,1,20100562,43419447.0,0


In [31]:
# unique levels of education_order
df['education_level'].unique()

array([10,  8,  9,  1, 12,  7,  6,  4, 11, 13, 14,  5, 18,  3, 17, 16, 19,
        2])

In [32]:
# Make sex,Married,incoming_own_farm,incoming_business,incoming_no_business,labor_primary,depressed catergorical

# Make categorical columns
categorical_cols = ['sex', 'Married', 'incoming_own_farm', 'incoming_business',
                   'incoming_no_business', 'labor_primary', 'depressed']
for col in categorical_cols:
    df[col] = pd.Categorical(df[col])


# Verify the changes
print(df.dtypes)

#Make education_level ar ordinal catergorical


Ville_id                    int64
sex                      category
Age                         int64
Married                  category
education_level             int64
total_members               int64
gained_asset                int64
durable_asset               int64
save_asset                  int64
living_expenses             int64
other_expenses              int64
incoming_own_farm        category
incoming_business        category
incoming_no_business     category
incoming_agricultural       int64
farm_expenses               int64
labor_primary            category
lasting_investment          int64
no_lasting_investmen      float64
depressed                category
dtype: object


In [33]:
# drop Ville_id
df = df.drop(columns=['Ville_id'])
df


Unnamed: 0,sex,Age,Married,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,1,28,1,10,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,1,23,1,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,1,22,1,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,1,27,1,10,4,52667108,19698904,49647648,397715,44042267,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,0,59,0,10,6,82606287,17352654,23399979,80877619,74503502,0,0,0,53384566,20731006,1,20100562,43419447.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,1,25,1,7,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1405,1,28,1,10,6,15711078,24023054,15506558,10476722,71588707,1,0,0,23022095,1021536,0,1823477,47384361.0,0
1406,1,66,0,1,1,42440731,22861940,22562605,12545372,56534257,1,0,0,12545373,10454478,0,46444572,10454478.0,1
1407,1,51,1,12,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0


In [34]:
X=df.drop(columns=['depressed'])
y=df['depressed']

#20% allocated for test data and 80% for train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=28)

In [35]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [36]:
continuous_vars = df.select_dtypes(include=['int64', 'float64']).columns
Continuous_Variables_in_X_train = continuous_vars

categorical_vars = df.select_dtypes(include=['object', 'category']).columns
Categorical_Variables_in_X_train = [var for var in categorical_vars if var != 'depressed']

Date_Month_Year= df.select_dtypes(include=['int32']).columns

# Display the variables
print("Continuous Variables in X_train:")
print(Continuous_Variables_in_X_train)
print("\nCategorical Variables in X train:")
print(Categorical_Variables_in_X_train)


Continuous Variables in X_train:
Index(['Age', 'education_level', 'total_members', 'gained_asset',
       'durable_asset', 'save_asset', 'living_expenses', 'other_expenses',
       'incoming_agricultural', 'farm_expenses', 'lasting_investment',
       'no_lasting_investmen'],
      dtype='object')

Categorical Variables in X train:
['sex', 'Married', 'incoming_own_farm', 'incoming_business', 'incoming_no_business', 'labor_primary']


#### checking imbalance of response

In [37]:
print ('Total not committed fraud :  {} and its percentage is {} %'.format(df.depressed.value_counts()[0], round(df.depressed.value_counts()[0]/df.depressed.value_counts().sum()*100,2)) )
print ('Total committed fraud :  {} and its percentage is {} %'.format(df.depressed.value_counts()[1], round(df.depressed.value_counts()[1]/df.depressed.value_counts().sum()*100,2)) )

Total not committed fraud :  1174 and its percentage is 83.32 %
Total committed fraud :  235 and its percentage is 16.68 %


## --------------------------------------------------------------------------------------------------------

In [38]:
df_backup = df.copy()
df_backup.head()

Unnamed: 0,sex,Age,Married,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen,depressed
0,1,28,1,10,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
1,1,23,1,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,1
2,1,22,1,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0,0
3,1,27,1,10,4,52667108,19698904,49647648,397715,44042267,1,0,1,22288055,18751329,0,7781123,69219765.0,0
4,0,59,0,10,6,82606287,17352654,23399979,80877619,74503502,0,0,0,53384566,20731006,1,20100562,43419447.0,0


## Upsampled

In [39]:
X_backup = df_backup.drop(columns=['depressed'])
y_backup = df_backup['depressed']

#20% allocated for test data and 80% for train data
X_train_backup, X_test_backup, y_train_backup, y_test_backup = train_test_split(X_backup, y_backup, test_size=0.2, random_state=28)

In [40]:
#separate majority and minority classes
majority_class = X_train_backup[y_train_backup == 0]
minority_class = X_train_backup[y_train_backup == 1]

print("Size of majority class before upsampling:", majority_class.shape[0])
print("Size of minority class before upsampling:", minority_class.shape[0])

Size of majority class before upsampling: 938
Size of minority class before upsampling: 189


In [41]:
#upsample minority class
minority_upsampled = resample(minority_class,
                              replace=True,
                              n_samples=len(majority_class),
                              random_state=28)

X_upsampled = np.vstack([majority_class, minority_upsampled])
y_upsampled = np.concatenate([np.zeros(len(majority_class)), np.ones(len(majority_class))])

#shuffle data
shuffle_indices = np.arange(len(X_upsampled))
np.random.shuffle(shuffle_indices)
X_upsampled = X_upsampled[shuffle_indices]
y_upsampled = y_upsampled[shuffle_indices]

unique_classes, class_counts = np.unique(y_upsampled, return_counts=True)
print("Class counts after upsampling:")
for cls, count in zip(unique_classes, class_counts):
  print(f"Class {int(cls)}: {count}")

Class counts after upsampling:
Class 0: 938
Class 1: 938


### Logistic Regression

### Random Forest

In [42]:
np.random.seed(28)

rf = RandomForestClassifier(random_state=28)
param_grid = {
    'n_estimators': [50, 80, 100, 120, 150],
    'max_depth': [3, 4, 5, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'log2',None]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_upsampled, y_upsampled)

# Print the best hyperparameters
print(grid_search.best_params_)

# Print the best score
print(grid_search.best_score_)

{'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 80}
0.953622695035461


In [49]:
rf=grid_search.best_estimator_


y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))


Accuracy score of this model:  79.08 %
Misclassification rate of this model:  20.92 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.834     0.936     0.882       236
           1      0.118     0.043     0.063        46

    accuracy                          0.791       282
   macro avg      0.476     0.490     0.473       282
weighted avg      0.717     0.791     0.749       282



In [43]:
rf = RandomForestClassifier(n_estimators=120, max_depth=None, max_features='log2', min_samples_leaf=1, min_samples_split=4, random_state=28)
rf.fit(X_upsampled,y_upsampled)
y_pred = rf.predict(X_test)

accuracy = round(metrics.accuracy_score(y_test, y_pred)*100, 2)
print("Accuracy score of this model: ", accuracy, "%")

misclassification_rate = round(np.mean(y_test != y_pred)*100, 2)
print("Misclassification rate of this model: ", misclassification_rate, "%")

print("\nReport card of this model: ")
print(metrics.classification_report(y_test, y_pred, digits=3))

conf_matrix = metrics.confusion_matrix(y_test, y_pred)
labels = ['Actual Negative', 'Actual Positive']
columns = ['Predicted Negative', 'Predicted Positive']
confusion_table = tabulate(conf_matrix, headers=columns, showindex=labels, tablefmt='grid')
print("Confusion Matrix:")
print(confusion_table)

Accuracy score of this model:  79.08 %
Misclassification rate of this model:  20.92 %

Report card of this model: 
              precision    recall  f1-score   support

           0      0.834     0.936     0.882       236
           1      0.118     0.043     0.063        46

    accuracy                          0.791       282
   macro avg      0.476     0.490     0.473       282
weighted avg      0.717     0.791     0.749       282

Confusion Matrix:
+-----------------+----------------------+----------------------+
|                 |   Predicted Negative |   Predicted Positive |
| Actual Negative |                  221 |                   15 |
+-----------------+----------------------+----------------------+
| Actual Positive |                   44 |                    2 |
+-----------------+----------------------+----------------------+


In [54]:
X_new=np.array([[1,28,1,10,8,7527224,1601457,154785,224455223,1247851,0,1,0,2244558,214566,1,4875662,44778855]])

In [55]:
X_new

array([[        1,        28,         1,        10,         8,   7527224,
          1601457,    154785, 224455223,   1247851,         0,         1,
                0,   2244558,    214566,         1,   4875662,  44778855]])

In [57]:
y_new=rf.predict(X_new)
y_new

array([0.])

In [50]:
X_train

Unnamed: 0,sex,Age,Married,education_level,total_members,gained_asset,durable_asset,save_asset,living_expenses,other_expenses,incoming_own_farm,incoming_business,incoming_no_business,incoming_agricultural,farm_expenses,labor_primary,lasting_investment,no_lasting_investmen
1169,1,28,1,10,8,7527224,1601537,15269301,22440203,12812296,0,0,0,20686522,49769988,1,95923813,86071491.0
102,1,23,1,9,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0
1289,1,22,1,8,5,1753683,80076849,86296158,15081141,76873773,0,0,1,66730708,14436076,1,23888196,40461049.0
723,1,17,1,7,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0
801,0,36,1,14,3,15077211,19026259,23399979,20019213,32831509,0,0,1,46711493,27782218,0,39388544,10932714.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,1,35,1,6,4,67399603,27786667,32030739,3203074,17937214,1,0,0,12812296,91821451,0,10399473,42093727.0
1283,1,28,1,8,5,28912201,22861940,23399979,26692283,28203066,0,0,0,30028818,31363432,0,28411718,28292707.0
1056,1,36,0,18,3,36350497,89045456,48046108,61392255,19378597,0,0,0,40038424,20019212,1,46151904,70067245.0
1302,1,25,1,7,6,14654063,22861940,57848663,53491335,56854563,0,0,0,24023056,46711496,0,24710065,14413832.0


In [45]:
#Install streamlit

In [46]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.40.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<6,>=2.1.5 (from streamlit)
  Downloading watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading streamlit-1.40.0-py2.py3-none-any.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.3/79.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[

In [47]:
import pickle


In [58]:
data = {"model": rf}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [63]:
with open("saved_steps.pkl", 'rb') as file:
    data = pickle.load(file)

rf_loaded = data["model"]

In [61]:
y_n=rf_loaded.predict(X_new)

In [62]:
y_n

array([0.])

In [66]:
#unique levels in each column
for col in df.columns:
    print(f"Unique levels in column '{col}':")
    print(df[col].unique())
    print()

Unique levels in column 'sex':
[1, 0]
Categories (2, int64): [0, 1]

Unique levels in column 'Age':
[28 23 22 27 59 35 34 21 32 29 84 38 56 24 25 44 26 40 55 43 51 53 36 19
 31 41 20 18 37 33 49 48 42 46 30 45 52 39 17 57 63 62 60 69 74 85 47 66
 76 72 65 64 54 73 81 68 91 80 58 67 78 61 87 50 71 82 75 79 70]

Unique levels in column 'Married':
[1, 0]
Categories (2, int64): [0, 1]

Unique levels in column 'education_level':
[10  8  9  1 12  7  6  4 11 13 14  5 18  3 17 16 19  2]

Unique levels in column 'total_members':
[ 5  4  6  8  3  9  2  1  7 10 12 11]

Unique levels in column 'gained_asset':
[28912201 52667108 82606287 35937466 41303144 12013633 11087568  1018915
 12390944 16521259 93596368  1108353 37172832 17142671 75696259 53694088
 24781887 82477118 20651573 11151849 86736603 22375139 10429825 47847064
 57142133 42236978 27572495 52253772 61954716 41303146 49563774  6712542
 22925111 13080371 16521257 67399603 21617699 41823944 99127548 57824402
 75386055 36350497 19207828 59

In [67]:
X.columns

Index(['sex', 'Age', 'Married', 'education_level', 'total_members',
       'gained_asset', 'durable_asset', 'save_asset', 'living_expenses',
       'other_expenses', 'incoming_own_farm', 'incoming_business',
       'incoming_no_business', 'incoming_agricultural', 'farm_expenses',
       'labor_primary', 'lasting_investment', 'no_lasting_investmen'],
      dtype='object')