In [1]:
# Data loading: pandas reads the demographic CSV from a path relative to the
# current working directory into `refugee_df`.
import pandas as pd
refugee_df = pd.read_csv('Resources/demographic_ml_df.csv')

# Preprocessing: categorical encoding and train/test splitting.
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Model building.
from sklearn.ensemble import RandomForestClassifier 

# Model evaluation / hyper-parameter grid search.
from sklearn.model_selection import GridSearchCV

# Plotting.
import matplotlib.pyplot as plt

In [2]:
# Report the size of the raw frame; `.shape` is a (rows, columns) pair, so it
# can be unpacked straight into the two placeholders.
rows_and_cols = refugee_df.shape
print('There are {} rows and {} columns.\n'.format(*rows_and_cols))

There are 61199 rows and 26 columns.



In [72]:
# Drop rows where both 'Male total' and 'Female total' are zero (treated here
# as "no demographic data reported"), then discard the helper columns, renumber
# the index from 0, and give the refugee count a descriptive name.
no_demographics = (refugee_df['Male total'] == 0) & (refugee_df['Female total'] == 0)
r_df = (
    refugee_df.loc[~no_demographics]
    .drop(columns=['Male total', 'Unnamed: 0', 'Female total'])
    .reset_index(drop=True)
    .rename(columns={'total': 'total_refugees'})
)

In [94]:
# Target is the asylum country; every other column is a candidate feature.
y = r_df['country_asylum']
X = r_df.drop(columns='country_asylum')

# Column groups: one categorical column to one-hot encode, the remaining
# listed columns are passed through as numeric features.
cat_features = ['country_origin']
num_features = [
    'share_borders', 'total_refugees', 'recognized_decisions',
    'complementary_protection', 'rejected', 'otherwise_closed',
    'total_decisions',
    'female_0to4', 'female_5to11', 'female_12to17', 'female_18to59',
    'female_60', 'female_other',
    'male_0to4', 'male_5to11', 'male_12to17', 'male_18to59',
    'male_60', 'male_other',
    'unknown_demographic',
]

# Fit the encoder and encode in one step, then wrap the dense result in a
# DataFrame labelled with the encoder's generated column names.
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X[cat_features])
X_encoded = pd.DataFrame(
    X_encoded.toarray(),
    columns=encoder.get_feature_names_out(cat_features),
)

# Rebuild the numeric block from raw values so both frames share a fresh
# 0..n-1 index, join them column-wise, and replace any missing values with 0.
X_num = X[num_features].values
X_processed = pd.concat(
    [pd.DataFrame(X_num, columns=num_features), X_encoded],
    axis=1,
).fillna(0)

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.3, random_state=42
)

In [96]:
# Build a seeded 100-tree random forest and fit it on the training split
# (fit returns the estimator itself, so construction and training chain).
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict a label for every row of the held-out split and show the result.
y_pred = rf_clf.predict(X_test)
print(y_pred)

['Sierra Leone' 'Sudan' 'Zimbabwe' ... 'Malaysia' 'Switzerland' 'Turkey']


In [97]:
# Score the fitted forest on the held-out split (mean accuracy) and report it.
accuracy = rf_clf.score(X=X_test, y=y_test)
print("Accuracy:", accuracy)

Accuracy: 0.5822944069541347
