# Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor

In [2]:
%%HTML
    <style type="text/css">
        /* Give every header and body cell of tables carrying the `dataframe`
           class a solid border (presumably the class pandas puts on its
           rendered HTML tables; confirm against the notebook front end). */
        table.dataframe td, table.dataframe th {
            border-style: solid;
        }
</style>

# Read data

In [3]:
# Load the migration dataset from the CSV file kept under DATA/.
csv_path = 'DATA/migration_nz.csv'
data = pd.read_csv(csv_path)

# Report the (rows, columns) dimensions, then preview the first five rows.
print("shape of data: {}".format(data.shape))
data.head()

shape of data: (86526, 5)


Unnamed: 0,Measure,Country,Citizenship,Year,Value
0,Arrivals,Oceania,New Zealand Citizen,1979,11817.0
1,Arrivals,Oceania,Australian Citizen,1979,4436.0
2,Arrivals,Oceania,Total All Citizenships,1979,19965.0
3,Arrivals,Antarctica,New Zealand Citizen,1979,10.0
4,Arrivals,Antarctica,Australian Citizen,1979,0.0


# Data preprocessing

In [4]:
# List the distinct labels found in the 'Measure' column before encoding it.
data.Measure.unique()

array(['Arrivals', 'Departures', 'Net'], dtype=object)

In [5]:
# Encode the categorical 'Measure' labels as integer codes.
#
# The encoded column is assigned back to `data` instead of calling
# `data['Measure'].replace(..., inplace=True)`. An inplace method on a column
# selection is chained assignment: pandas warns about it, and with
# Copy-on-Write enabled it only modifies a temporary, leaving `data` unchanged.
measure_codes = {"Arrivals": 0, "Departures": 1, "Net": 2}

# Values absent from the mapping (including already-encoded integers when this
# cell is re-run) pass through unchanged, matching the semantics of `replace`.
data['Measure'] = data['Measure'].map(
    lambda label: measure_codes.get(label, label)
)

# Show the distinct values after encoding as a sanity check.
data['Measure'].unique()

array([0, 1, 2], dtype=int64)

In [6]:
# Swap the two text columns for integer codes. factorize() returns a
# (codes, uniques) pair; only the codes are kept.
for text_col, code_col in (('Country', 'CountryID'), ('Citizenship', 'CitID')):
    data[code_col] = data[text_col].factorize()[0]

# The original string columns are no longer needed once encoded.
data.drop(columns=['Country', 'Citizenship'], inplace=True)
data.head()

Unnamed: 0,Measure,Year,Value,CountryID,CitID
0,0,1979,11817.0,0,0
1,0,1979,4436.0,0,1
2,0,1979,19965.0,0,2
3,0,1979,10.0,1,0
4,0,1979,0.0,1,1


# Check missing values

In [7]:
# Count the missing entries in each column.
data.isna().sum()

Measure       0
Year          0
Value        72
CountryID     0
CitID         0
dtype: int64

In [8]:
# Fill the missing 'Value' entries with the column median.
#
# The filled column is assigned back to `data` instead of calling
# `data["Value"].fillna(..., inplace=True)`. An inplace method on a column
# selection is chained assignment: pandas warns about it, and with
# Copy-on-Write enabled the NaNs would remain in `data`.
#
# NOTE(review): 'Value' is later used as the regression target; consider
# dropping the rows with a missing target instead of imputing them.
data["Value"] = data["Value"].fillna(data["Value"].median())

# Split data into train and test sets

In [9]:
# Feature matrix (encoded country, measure, year and citizenship) and the
# regression target, both extracted as NumPy arrays.
feature_cols = ['CountryID', 'Measure', 'Year', 'CitID']
X = data[feature_cols].to_numpy()
Y = data['Value'].to_numpy()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=9
)

# Report the dimensions of each of the four resulting arrays.
for label, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"shape of {label}: {split.shape}")

shape of X_train: (60568, 4)
shape of X_test: (25958, 4)
shape of y_train: (60568,)
shape of y_test: (25958,)


# Create random forest model

In [10]:
# Fit a random forest regressor on the training split and evaluate it on the
# held-out split.
#
# random_state is fixed (same seed as the train/test split) because the
# forest's bootstrap sampling and per-split feature sub-sampling are random;
# without it the reported score changes on every run.
rf = RandomForestRegressor(
    n_estimators=70,   # number of trees in the forest
    max_features=3,    # features considered at each split (of the 4 available)
    max_depth=5,       # cap on the depth of each tree
    n_jobs=-1,         # use all available CPU cores
    random_state=9,
)
rf.fit(X_train, y_train)

# For a regressor, score() returns the coefficient of determination (R^2).
score = rf.score(X_test, y_test)
print(f"Score for RandomForestRegressor: {score}")

Score for RandomForestRegressor: 0.739986213089047
