In [124]:
#loading and transforming data: 
import pandas as pd
refugee_df = pd.read_csv('Resources/demographic_ml_df.csv')

#preprocessing data: 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

#building models:
from sklearn.ensemble import RandomForestClassifier 

#evaluate models: 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer

#displaying models:
import matplotlib.pyplot as plt

In [125]:
rows_and_cols = refugee_df.shape
print('There are {} rows and {} columns.\n'.format(
    rows_and_cols[0], rows_and_cols[1]))

There are 61199 rows and 26 columns.



In [126]:
#drop rows that for countries that do not report demographic data. 

r_df = refugee_df.loc[(refugee_df['Male total'] != 0) | (refugee_df['Female total'] != 0)].reset_index()
r_df.drop(['index','Male total','Unnamed: 0', 'Female total'], axis=1, inplace=True)
r_df.reset_index(drop=True, inplace=True)
r_df = r_df.rename(columns={'total': 'total_refugees'})

In [127]:
# Split the data into features (X) and target (y)
X = r_df.drop('country_asylum', axis=1)
y = r_df['country_asylum']

# One hot encode the categorical features
cat_features = ['country_origin']
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X[cat_features])
X_encoded = encoder.transform(X[cat_features])

# Concatenate the encoded features with the numerical features
num_features = ['share_borders', 'total_refugees','recognized_decisions', 'complementary_protection', 'rejected','otherwise_closed', 'total_decisions', 'female_0to4','female_5to11','female_12to17','female_18to59','female_60','female_other','male_0to4','male_5to11','male_12to17','male_18to59','male_60','male_other','unknown_demographic']
X_num = X[num_features].values
X_encoded = pd.DataFrame(X_encoded.toarray(), columns=encoder.get_feature_names_out(cat_features))
X_processed = pd.concat([pd.DataFrame(X_num, columns=num_features), X_encoded], axis=1)
X_processed.fillna(0, inplace=True)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)

In [137]:
#Determine which parameters are best for the Random Forest Classifier using Grid Search CV. 

rf_clf = RandomForestClassifier()

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 6]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the best accuracy score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best hyperparameters:", best_params)
print("Best accuracy score:", best_score)



Best hyperparameters: {'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 200}
Best accuracy score: 0.555686274509804


In [138]:
# Create a random forest classifier with 100 trees
rf_clf = RandomForestClassifier(n_estimators=200, max_depth = 30, min_samples_split = 2)

# Train the classifier on the training data
rf_clf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rf_clf.predict(X_test)

print(y_pred)

['Sierra Leone' 'Sudan' 'Brazil' ... 'Malaysia' 'Switzerland' 'Turkey']


In [139]:
# Evaluate the accuracy of the model
accuracy = rf_clf.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.583552556330779


In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Define the columns to encode
cat_features = ['country_origin']

# Create a ColumnTransformer object to apply the encoders
preprocessor = ColumnTransformer(
    transformers=[
        ('label', LabelEncoder(), ['country_asylum']),
        ('onehot', OneHotEncoder(), cat_features)
    ])

# Apply the preprocessor to the data
X_processed = preprocessor.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.3, random_state=42)

# Define the parameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 4, 6]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the best accuracy score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best hyperparameters:", best_params)
print("Best accuracy score:", best_score)

NameError: name 'X' is not defined