In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

In [15]:
csv_file = 'Global Missing Migrants Dataset.csv'  # Replace with the path to your CSV file
data = pd.read_csv(csv_file)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

print(train_data.shape)
print(test_data.shape)

(10416, 19)
(2604, 19)


In [16]:
# Columns to one-hot encode
columns_to_encode = ['Incident Type', 'Incident year', 'Reported Month', 'Region of Origin', 'Region of Incident', 'Country of Origin', 'Cause of Death', 'Migration route', 'Location of death', 'Information Source', 'UNSD Geographical Grouping']

# Perform one-hot encoding on train and test data
train_encoded = pd.get_dummies(train_data, columns=columns_to_encode, drop_first=False)
test_encoded = pd.get_dummies(test_data, columns=columns_to_encode, drop_first=False)

In [17]:
# Assuming 'Coordinates' is the name of the column containing coordinates
# Split the coordinates into separate latitude and longitude columns
train_encoded[['Latitude', 'Longitude']] = train_encoded['Coordinates'].str.split(', ', expand=True).astype(float)
test_encoded[['Latitude', 'Longitude']] = test_encoded['Coordinates'].str.split(', ', expand=True).astype(float)

# Drop the original 'Coordinates' column if it's no longer needed
train_encoded = train_encoded.drop(columns=['Coordinates'])
test_encoded = test_encoded.drop(columns=['Coordinates'])

# Get the common columns between train_encoded and test_encoded
common_columns = train_encoded.columns.intersection(test_encoded.columns)

# Exclude 'Latitude' and 'Longitude' from common_columns
int_columns = common_columns.difference(['Latitude', 'Longitude'])

# Fill NaN values with 0
train_encoded = train_encoded.fillna(0)
test_encoded = test_encoded.fillna(0)

In [18]:
# Convert selected columns to integers (1 for True, 0 for False)
train_encoded[int_columns] = train_encoded[int_columns].astype(int)
test_encoded[int_columns] = test_encoded[int_columns].astype(int)

In [19]:
# Get the columns present in train_encoded but not in test_encoded
missing_columns_train = set(train_encoded.columns) - set(test_encoded.columns)

# Add missing columns to test_encoded and fill them with zeros
for column in missing_columns_train:
    test_encoded[column] = 0

# Get the columns present in test_encoded but not in train_encoded
missing_columns_test = set(test_encoded.columns) - set(train_encoded.columns)

# Add missing columns to train_encoded and fill them with zeros
for column in missing_columns_test:
    train_encoded[column] = 0

# Display the shape of the original and encoded data
print("Original Train Shape:", train_data.shape)
print("Encoded Train Shape:", train_encoded.shape)
print("Original Test Shape:", test_data.shape)
print("Encoded Test Shape:", test_encoded.shape)

Original Train Shape: (10416, 19)
Encoded Train Shape: (10416, 11741)
Original Test Shape: (2604, 19)
Encoded Test Shape: (2604, 11741)


In [20]:
# Display the first few rows of the encoded train data
train_encoded.head()

Unnamed: 0,Number of Dead,Minimum Estimated Number of Missing,Total Number of Dead and Missing,Number of Survivors,Number of Females,Number of Males,Number of Children,Incident Type_Cumulative Incident,Incident Type_Incident,"Incident Type_Incident,Split Incident",...,"Location of death_About 67 miles off the coast of Bireuen Regency, Aceh, Indonesia","Location of death_Kopla River, 800 meters from Grgelj, Slovenia","Location of death_Khôr 'Angar, Obock, Djibouti","Information Source_Vox Populi, Texas Public Radio, La Opinion, Breitbart","Information Source_AP via Miami Herald, El Pais, Politica Expansion, Fundación para la Justicia","Information Source_Vox Pópuli, Líder Web","Location of death_6100 Upper Valley Road, El Paso, Texas 79932, United States of America","Location of death_Nimruz Province, Afghanistan","Location of death_Three kilometers south of the Cerrito Prieto milestone in the Salar de Coipasa sector, Colchane, Tarapacá region, Chile","Information Source_Manager Online, Thairath Online, Siamrath, Amarin 34 HD"
469,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10548,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
11889,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9899,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5460,2,0,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Display the first few rows of the encoded test data
test_encoded.head()

Unnamed: 0,Number of Dead,Minimum Estimated Number of Missing,Total Number of Dead and Missing,Number of Survivors,Number of Females,Number of Males,Number of Children,Incident Type_Cumulative Incident,Incident Type_Incident,"Incident Type_Incident,Split Incident",...,"Location of death_Djiboutian desert around Guahere, Obock region, Republic of Djibouti","Location of death_Train tracks near Atotonilco de Tula, Hidalgo, Mexico","Location of death_Off the coast of Al Khums, Libya - Rescue done by fishermen, boat departed from Al Khums on 11 November at 21:00","Location of death_Tekeze River, near Himora, Ethiopia",Information Source_Noreste,Information Source_Agence de Presse Africaine (APA),"Information Source_InfoMigrants, ANSA","Location of death_Nis-Belgrade highway, near Drazevac, Serbia","Location of death_Off the coast of Lampedusa, Italy - boat departed from Tunisia","Location of death_Juchitán de Zaragoza Municipal Police Headquarters, Oaxaca, Mexico"
811,1,0,1,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5945,4,0,4,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
353,1,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3409,6,0,6,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4900,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
x_train = train_encoded.drop(['Number of Dead'], axis=1) 
y_train = train_encoded['Number of Dead']                         

x_test = test_encoded.drop(['Number of Dead'], axis=1)  
y_test = test_encoded['Number of Dead']        

In [23]:
#Ensure that different features are on the same scale
ct = ColumnTransformer([('se', StandardScaler(), ['Minimum Estimated Number of Missing', 'Total Number of Dead and Missing',
    'Number of Survivors', 'Number of Females', 'Number of Males', 'Number of Children'])], remainder='passthrough')

In [24]:
# Define the threshold for feature selection
threshold = 0.0055  # Adjust this threshold based on your feature importance scores

# Train a Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)

# Get feature importances
feature_importances = rf.feature_importances_

selected_features_idx = np.where(feature_importances >= threshold)[0]

# Sort selected features by importance
sorted_idx = np.argsort(feature_importances[selected_features_idx])[::-1]

# Print the top N most important features
top_n = min(len(selected_features_idx), 20)  # Choose the number of top features you want to display
for i in range(top_n):
    print(f"{x_train.columns[selected_features_idx[sorted_idx[i]]]}: {feature_importances[selected_features_idx[sorted_idx[i]]]}")

Total Number of Dead and Missing: 0.1746561924231366
Number of Males: 0.054656942668561935
Longitude: 0.04432724759185444
Latitude: 0.04071293408586714
Minimum Estimated Number of Missing: 0.03220640820703916
Number of Survivors: 0.024079209770801562
Number of Females: 0.023419170990654353
Number of Children: 0.017107334907172166
Cause of Death_Mixed or unknown: 0.008468290527743363
Cause of Death_Drowning: 0.008378902280545962
UNSD Geographical Grouping_Uncategorized: 0.00755575327480042
Region of Incident_Northern Africa: 0.007004506477278733
Reported Month_October: 0.0067349345993057575
Cause of Death_Vehicle accident / death linked to hazardous transport: 0.006230195494349394
Incident Type_Incident: 0.006133372760891451
Cause of Death_Sickness / lack of access to adequate healthcare: 0.0060385662688219165
Region of Origin_Unknown: 0.005846118132849715
Country of Origin_Unknown: 0.0057458928423766535
Migration route_Sahara Desert crossing: 0.005672607792460555
Reported Month_August:

In [25]:
# Create a common feature selection instance using SelectFromModel
feature_selector = SelectFromModel(RandomForestClassifier(random_state=42), threshold=threshold)

In [29]:
random_forest_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('RandomForest', RandomForestClassifier(random_state=42))])
adaboost_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('Adaboost', AdaBoostClassifier(random_state=42))])
ExtraTree_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('ExtraTreeClassifier', ExtraTreesClassifier(random_state=42))])
BaggingClassifier_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('BaggingClassifier', BaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=42))])
GradientBoost_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('GradientBoosting', GradientBoostingClassifier(random_state=42))])
dtree_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('DecisionTree', DecisionTreeClassifier(random_state=42))])
knn_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('KNN', KNeighborsClassifier())])
lr_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('LogisticRegression', LogisticRegression(random_state=42))])
sgd_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('StochasticGradient', SGDClassifier(random_state=42))])
mlp_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('MLPClassifier', MLPClassifier(random_state=42))])
naive_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('NaiveBayes', GaussianNB())])
svc_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('SVM', SVC(random_state=42))])
catboost_pipeline = Pipeline([('transformer', ct), ('feature_selector', feature_selector), ('CatBoost', CatBoostClassifier(random_state=42, silent=True))])

In [31]:
pipeline_list = [random_forest_pipeline, adaboost_pipeline, ExtraTree_pipeline, BaggingClassifier_pipeline, GradientBoost_pipeline,
                dtree_pipeline, knn_pipeline, lr_pipeline, sgd_pipeline, mlp_pipeline, naive_pipeline, svc_pipeline, catboost_pipeline]

In [32]:
pipe_dict = {0: "RandomForest", 1: "Adaboost", 2: "ExtraTree", 3: "BaggingClassifier", 4: "GradientBoosting", 5: "DecisionTree",
            6: "KNN", 7: "Logistic", 8: "SGD Classifier", 9: "MLPClassifier", 10: "NaiveBayes",
            11: "SVM", 12: "Catboost"}

accuracy_scores = {}

In [33]:
for idx, pipe in enumerate(pipeline_list):
    try:
        score = cross_val_score(pipe, x_train, y_train, cv=3, scoring='accuracy')
        accuracy_scores[idx] = score.mean()
        print(pipe_dict[idx], ":", score.mean())
    except Exception as e:
        accuracy_scores[idx] = None
        print("Error for", pipe_dict[idx], ":", e)

RandomForest : 0.8984254992319508
Adaboost : 0.7786098310291859
ExtraTree : 0.8602150537634409
BaggingClassifier : 0.953821044546851
GradientBoosting : 0.6286482334869432
DecisionTree : 0.9519969278033794
KNN : 0.6935483870967741
Logistic : 0.6976766513056836
SGD Classifier : 0.6059907834101382
MLPClassifier : 0.8000192012288786
NaiveBayes : 0.09110983102918586
SVM : 0.6859639016897082
Catboost : 0.9433563748079878


In [34]:
def evaluate_model(model, x_train, y_train, x_test, y_test):
    model = model.fit(x_train, y_train)
    predict_train_y = model.predict(x_train)
    predict_test_y = model.predict(x_test)
    
    print("**Accuracy Score**")
    train_accuracy = accuracy_score(y_train, predict_train_y)
    test_accuracy = accuracy_score(y_test, predict_test_y)
    print("Train Accuracy is: %s"%(train_accuracy))
    print("\nTest Accuracy is: %s"%(test_accuracy))
    print("---------------------------------------------------------")
    
    print("\n**Accuracy Error**")
    train_error = (1-train_accuracy)
    test_error = (1-test_accuracy)
    print("Train Error: %s"%(train_error))
    print("\nTest Error: %s"%(test_error))
    print("---------------------------------------------------------")
    
    print("\n**Classification Report**")
    train_cf_report = pd.DataFrame(classification_report(y_train, predict_train_y, output_dict = True))
    test_cf_report = pd.DataFrame(classification_report(y_test, predict_test_y, output_dict = True))
    print("Train Classification Report:")
    print(train_cf_report)
    print("\n Test Classification Report:")
    print(test_cf_report)
    print("---------------------------------------------------------")
    
    print("\n**Confusion Matrix**")
    train_conf = confusion_matrix(y_train, predict_train_y)
    test_conf = confusion_matrix(y_test, predict_test_y)
    print("Train Confusion Matrix Report:")
    print((train_conf))
    print("\n Test Confusion Matrix Report:")
    print((test_conf))

In [35]:
# Sort the accuracy scores dictionary by accuracy in descending order
sorted_accuracy_scores = {k: v for k, v in sorted(accuracy_scores.items(), key=lambda item: item[1], reverse=True)}

# Select the top 3 models based on accuracy
top_3_models_idx = list(sorted_accuracy_scores.keys())[:3]

# Set Pandas display options to show all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Loop through the top 3 models and print their classifier functions and evaluation results
for idx in top_3_models_idx:
    model_name = pipe_dict[idx]
    model = pipeline_list[idx]
    
    # Print an attention-grabbing title
    print("*" * 30)
    print(f"Model: {model_name}")
    print("*" * 30)
    
    # Print the model's accuracy score
    print(model_name, ":", accuracy_scores[idx])
    
    # Evaluate and print the model's performance
    evaluate_model(model, x_train, y_train, x_test, y_test)
    
    print("\n")

# Restore the default Pandas display options after your loop
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')

******************************
Model: BaggingClassifier
******************************
BaggingClassifier : 0.953821044546851
**Accuracy Score**
Train Accuracy is: 0.9983678955453149

Test Accuracy is: 0.9692780337941628
---------------------------------------------------------

**Accuracy Error**
Train Error: 0.0016321044546850905

Test Error: 0.030721966205837226
---------------------------------------------------------

**Classification Report**
Train Classification Report:
                    0           1            2          3           4  \
precision    0.980645     0.99972     1.000000    0.99812    1.000000   
recall       0.997812     0.99972     0.998203    0.99812    0.987448   
f1-score     0.989154     0.99972     0.999101    0.99812    0.993684   
support    457.000000  7135.00000  1113.000000  532.00000  239.000000   

                    5      6          7          8          9         10  \
precision    0.994012    1.0   1.000000   0.986842   0.980392   1.000000   
r