In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [3]:
# Load the raw demographic dataset from the project's Resources folder
DATA_PATH = 'Resources/demographic_ml_df.csv'
refugee_df = pd.read_csv(DATA_PATH)

In [4]:
# Preprocessing
# Keep a row when at least one of the male/female totals is non-zero
has_demographics = refugee_df['Male total'].ne(0) | refugee_df['Female total'].ne(0)

# Build the cleaned frame in a single chain (no in-place mutation):
# filter rows, drop the gender totals and the leftover CSV index column,
# renumber the index, and give the 'total' column a descriptive name.
r_df = (
    refugee_df.loc[has_demographics]
    .drop(columns=['Male total', 'Unnamed: 0', 'Female total'])
    .reset_index(drop=True)
    .rename(columns={'total': 'total_refugees'})
)

In [5]:
# Target: the asylum country; features: every remaining column of r_df
y = r_df['country_asylum']
X = r_df.drop(columns=['country_asylum'])

In [6]:
# Define the columns to encode
cat_features = ['country_origin']

# Create a ColumnTransformer object to apply the encoder.
# Only the columns listed in cat_features are passed through the encoder;
# with ColumnTransformer's default remainder='drop', all other columns of X
# are discarded from the model input.
# handle_unknown='ignore' makes an origin country that was not present at fit
# time encode as an all-zero row instead of raising during transform(), so the
# prediction step does not crash on unseen countries.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ])

# Fit the encoder on the feature frame and produce the encoded feature matrix
X_processed = preprocessor.fit_transform(X)

# Create a LabelEncoder for country_asylum
le = LabelEncoder()

# Fit and transform the country_asylum column in y into integer class labels;
# le.inverse_transform maps predictions back to country names later on
y_processed = le.fit_transform(y)

In [7]:
# Split the data into training and testing sets
# (30% held out for testing; fixed random_state so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, test_size=0.3, random_state=42
)

# Train the classifier on the training data
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = dt_classifier.predict(X_test)

# Evaluate the model
# zero_division=0: classes that the model never predicts have an undefined
# precision; report them as 0 explicitly rather than relying on the default,
# which yields the same numbers but emits a warning for each affected metric.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.00      0.00      0.00        35
           2       0.00      0.00      0.00        57
           3       0.00      0.00      0.00        22
           5       0.00      0.00      0.00       195
           6       0.06      0.09      0.07        47
           7       0.00      0.00      0.00        37
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00        18
          11       0.03      0.15      0.05        55
          12       0.00      0.00      0.00       251
          13       0.00      0.00      0.00         9
          14       0.00      0.00      0.00        92
          15       0.00      0.00      0.00        21
          16       0.00      0.00      0.00        34
          17       0.04      0.04      0.04        46
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Persist the trained Decision Tree Classifier and read it straight back.
# NOTE: only the classifier is written here; the fitted `preprocessor` and
# `le` objects used at prediction time are not saved by this cell.
MODEL_PATH = 'decision_tree_classifier_model.pkl'
joblib.dump(dt_classifier, MODEL_PATH)

# Reload the model from the file that was just written
loaded_classifier = joblib.load(MODEL_PATH)

In [12]:
# Example: predict the asylum country for refugees whose origin is 'Iraq'
country_origin = 'Iraq'

# Wrap the query in a one-row frame with the column the preprocessor expects,
# encode it with the fitted preprocessor, then classify it
query_df = pd.DataFrame({'country_origin': [country_origin]})
input_data = preprocessor.transform(query_df)
predicted_asylum = loaded_classifier.predict(input_data)

# Map the integer class label back to the original country name
predicted_asylum_country = le.inverse_transform(predicted_asylum)

print(f"Predicted asylum country for refugees from {country_origin}: {predicted_asylum_country[0]}")

Predicted asylum country for refugees from Iraq: Egypt
