In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
import matplotlib.pyplot as plt

# Load the dataset
refugee_df = pd.read_csv('Resources/demographic_ml_df.csv')

# Drop rows for countries that do not report demographic data
r_df = refugee_df.loc[(refugee_df['Male total'] != 0) | (refugee_df['Female total'] != 0)].reset_index()
r_df.drop(['index','Male total','Unnamed: 0', 'Female total'], axis=1, inplace=True)
r_df.reset_index(drop=True, inplace=True)
r_df = r_df.rename(columns={'total': 'total_refugees'})



In [11]:
r_df

Unnamed: 0,year,country_origin,country_asylum,share_borders,female_0to4,female_5to11,female_12to17,female_18to59,female_60,female_other,...,male_18to59,unknown_demographic,male_60,male_other,total_refugees,recognized_decisions,complementary_protection,rejected,otherwise_closed,total_decisions
0,2001,Afghanistan,Azerbaijan,0,8,12,10,38,0,0,...,118,0,0,0,243,226.0,0.0,45.0,0.0,271.0
1,2001,Afghanistan,Belarus,0,9,40,36,81,6,0,...,199,0,5,0,454,85.0,0.0,25.0,51.0,161.0
2,2001,Afghanistan,Belgium,0,0,0,0,0,0,144,...,0,0,0,213,357,100.0,0.0,42.0,0.0,142.0
3,2001,Afghanistan,Cambodia,0,0,0,0,5,0,0,...,7,0,0,0,17,14.0,0.0,0.0,19.0,33.0
4,2001,Afghanistan,Chile,0,0,0,0,0,0,0,...,5,0,0,0,5,5.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29138,2021,Zimbabwe,Namibia,0,0,0,11,12,0,0,...,17,0,0,0,48,0.0,0.0,0.0,16.0,16.0
29139,2021,Zimbabwe,Romania,0,5,0,0,0,0,0,...,0,0,0,0,5,,,,,
29140,2021,Zimbabwe,South Africa,1,0,43,146,1481,73,0,...,1975,0,70,0,3965,13.0,0.0,737.0,207.0,957.0
29141,2021,Zimbabwe,Switzerland,0,0,9,0,0,0,0,...,0,0,0,0,9,0.0,0.0,5.0,0.0,5.0


In [18]:
# Define the features and target variable
X = r_df.drop(['country_asylum'], axis=1)
y = r_df['country_asylum'].reset_index()



In [24]:
X

Unnamed: 0,year,country_origin,share_borders,female_0to4,female_5to11,female_12to17,female_18to59,female_60,female_other,male_0to4,...,male_18to59,unknown_demographic,male_60,male_other,total_refugees,recognized_decisions,complementary_protection,rejected,otherwise_closed,total_decisions
0,2001,Afghanistan,0,8,12,10,38,0,0,9,...,118,0,0,0,243,226.0,0.0,45.0,0.0,271.0
1,2001,Afghanistan,0,9,40,36,81,6,0,9,...,199,0,5,0,454,85.0,0.0,25.0,51.0,161.0
2,2001,Afghanistan,0,0,0,0,0,0,144,0,...,0,0,0,213,357,100.0,0.0,42.0,0.0,142.0
3,2001,Afghanistan,0,0,0,0,5,0,0,0,...,7,0,0,0,17,14.0,0.0,0.0,19.0,33.0
4,2001,Afghanistan,0,0,0,0,0,0,0,0,...,5,0,0,0,5,5.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29138,2021,Zimbabwe,0,0,0,11,12,0,0,0,...,17,0,0,0,48,0.0,0.0,0.0,16.0,16.0
29139,2021,Zimbabwe,0,5,0,0,0,0,0,0,...,0,0,0,0,5,,,,,
29140,2021,Zimbabwe,1,0,43,146,1481,73,0,0,...,1975,0,70,0,3965,13.0,0.0,737.0,207.0,957.0
29141,2021,Zimbabwe,0,0,9,0,0,0,0,0,...,0,0,0,0,9,0.0,0.0,5.0,0.0,5.0


In [25]:
# Define the columns to encode
cat_features = ['country_origin']

# Create a ColumnTransformer object to apply the encoder
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), cat_features)
    ])


In [29]:
# Apply the preprocessor to the data
X_processed = preprocessor.fit_transform(X)

In [30]:
# Create a LabelEncoder for country_asylum
le = LabelEncoder()

# Fit and transform the country_asylum column in y
y_processed = le.fit_transform(y['country_asylum'])

In [32]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.3, random_state=42)

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 6]
}

# Create the GridSearchCV object
rf_clf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the best accuracy score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best hyperparameters:", best_params)
print("Best accuracy score:", best_score)

# Create a random forest classifier with the best hyperparameters
rf_clf = RandomForestClassifier(n_estimators=best_params['n_estimators'], 
                                 max_depth=best_params['max_depth'], 
                                 min_samples_split=best_params['min_samples_split'])

# Train the classifier on the training data
rf_clf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rf_clf.predict(X_test)

print(y_pred)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Best hyperparameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Best accuracy score: 0.053970588235294124
[139 139 139 ... 139 139 139]
Accuracy: 0.05387166876358229
