In [105]:
# Import the necessary modules and libraries
# pip install sklearn.experimental

import numpy as np
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import random

In [27]:
# Load train and test data
df = pd.read_csv("communities.data", header=None);

In [28]:
# Remove unpredictive features
df = df.drop([0,1,2,3,4],axis=1);

# Remove goal from features
df = df.drop(127, axis=1);

In [29]:
df

Unnamed: 0,5,6,7,8,9,10,11,12,13,14,...,117,118,119,120,121,122,123,124,125,126
0,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,0.29,0.32,...,0.29,0.12,0.26,0.20,0.06,0.04,0.9,0.5,0.32,0.14
1,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,?,0.02,0.12,0.45,?,?,?,?,0.00,?
2,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,?,0.01,0.21,0.02,?,?,?,?,0.00,?
3,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,0.34,0.21,...,?,0.02,0.39,0.28,?,?,?,?,0.00,?
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,?,0.04,0.09,0.02,?,?,?,?,0.00,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,0.01,0.40,0.10,0.87,0.12,0.16,0.43,0.51,0.35,0.30,...,?,0.01,0.28,0.05,?,?,?,?,0.00,?
1990,0.05,0.96,0.46,0.28,0.83,0.32,0.69,0.86,0.73,0.14,...,?,0.02,0.37,0.20,?,?,?,?,0.00,?
1991,0.16,0.37,0.25,0.69,0.04,0.25,0.35,0.50,0.31,0.54,...,0.25,0.08,0.32,0.18,0.08,0.06,0.78,0,0.91,0.28
1992,0.08,0.51,0.06,0.87,0.22,0.10,0.58,0.74,0.63,0.41,...,0.19,0.03,0.38,0.33,0.02,0.02,0.79,0,0.22,0.18


In [30]:
# We can see that we have a lot of features with missing values
# There are multiple ways of treating such phenomenon
# One of which is replace with mean imputation however since our dataset is multivariate
# it is better to avoid mean imputation
# Another way is listwise imputation However our dataset has a lot of variable with missing data which
# could impose a problem of having very few entries in our cleaned dataset.
# As these variables are predictive in determining ViolentCrimesPerPop we cannot remove the columns completely
# So my suggestion is to try to use regression imputation to determine these values.

# We had a certain problem with str values so we make sure that for columns containing empty values the type of the column
# Stays float32;

df = df.replace("?", np.nan).astype("float64")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


column no 30 is of type: <class 'numpy.float32'>
column no 101 is of type: <class 'numpy.float32'>
column no 102 is of type: <class 'numpy.float32'>
column no 103 is of type: <class 'numpy.float32'>
column no 104 is of type: <class 'numpy.float32'>
column no 105 is of type: <class 'numpy.float32'>
column no 106 is of type: <class 'numpy.float32'>
column no 107 is of type: <class 'numpy.float32'>
column no 108 is of type: <class 'numpy.float32'>
column no 109 is of type: <class 'numpy.float32'>
column no 110 is of type: <class 'numpy.float32'>
column no 111 is of type: <class 'numpy.float32'>
column no 112 is of type: <class 'numpy.float32'>
column no 113 is of type: <class 'numpy.float32'>
column no 114 is of type: <class 'numpy.float32'>
column no 115 is of type: <class 'numpy.float32'>
column no 116 is of type: <class 'numpy.float32'>
column no 117 is of type: <class 'numpy.float32'>
column no 121 is of type: <class 'numpy.float32'>
column no 122 is of type: <class 'numpy.float32'>
c

In [31]:
# Use impute regression for missing variables
imputer = IterativeImputer(BayesianRidge())
impute_data = pd.DataFrame(imputer.fit_transform(df))



In [36]:
impute_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,112,113,114,115,116,117,118,119,120,121
0,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,0.29,0.32,...,0.290000,0.12,0.26,0.20,0.060000,0.040000,0.900000,0.500000,0.32,0.140000
1,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.321703,0.02,0.12,0.45,0.041574,0.028157,0.866306,0.639758,0.00,0.212728
2,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.280888,0.01,0.21,0.02,0.050528,-0.008062,1.137582,0.499376,0.00,0.255733
3,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,0.34,0.21,...,0.235669,0.02,0.39,0.28,0.080584,0.033792,0.796642,0.461159,0.00,0.225014
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.230210,0.04,0.09,0.02,0.031025,0.017185,0.793476,0.307233,0.00,0.179882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,0.01,0.40,0.10,0.87,0.12,0.16,0.43,0.51,0.35,0.30,...,0.250469,0.01,0.28,0.05,0.057594,0.025910,0.578187,0.505908,0.00,0.183910
1990,0.05,0.96,0.46,0.28,0.83,0.32,0.69,0.86,0.73,0.14,...,0.351105,0.02,0.37,0.20,0.074346,0.047210,0.656974,0.584959,0.00,0.227828
1991,0.16,0.37,0.25,0.69,0.04,0.25,0.35,0.50,0.31,0.54,...,0.250000,0.08,0.32,0.18,0.080000,0.060000,0.780000,0.000000,0.91,0.280000
1992,0.08,0.51,0.06,0.87,0.22,0.10,0.58,0.74,0.63,0.41,...,0.190000,0.03,0.38,0.33,0.020000,0.020000,0.790000,0.000000,0.22,0.180000


In [51]:
# Load X and y
X = impute_data;
y = X[121];

# Drop column class from X
X = X.drop(121,axis=1);

# Separate the dataset into a training and a test set and keep the class included in the set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1);

In [92]:
RandomForestRegressor?

In [108]:
# The best way to find the most optimal parameters for a decision tree would be to use gridsearch
model = RandomForestRegressor()
param_grid = {
    "n_estimators": [10, 100, 300],
    "criterion": ['mse'],
    "max_depth": [5, 10],
}

grid = GridSearchCV(estimator=model, cv=3, param_grid=param_grid, verbose=2)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] criterion=mse, max_depth=5, n_estimators=10 .....................
[CV] ...... criterion=mse, max_depth=5, n_estimators=10, total=   0.1s
[CV] criterion=mse, max_depth=5, n_estimators=10 .....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ...... criterion=mse, max_depth=5, n_estimators=10, total=   0.1s
[CV] criterion=mse, max_depth=5, n_estimators=10 .....................
[CV] ...... criterion=mse, max_depth=5, n_estimators=10, total=   0.1s
[CV] criterion=mse, max_depth=5, n_estimators=100 ....................
[CV] ..... criterion=mse, max_depth=5, n_estimators=100, total=   1.3s
[CV] criterion=mse, max_depth=5, n_estimators=100 ....................
[CV] ..... criterion=mse, max_depth=5, n_estimators=100, total=   1.3s
[CV] criterion=mse, max_depth=5, n_estimators=100 ....................
[CV] ..... criterion=mse, max_depth=5, n_estimators=100, total=   1.3s
[CV] criterion=mse, max_depth=5, n_estimators=300 ....................
[CV] ..... criterion=mse, max_depth=5, n_estimators=300, total=   3.6s
[CV] criterion=mse, max_depth=5, n_estimators=300 ....................
[CV] ..... criterion=mse, max_depth=5, n_estimators=300, total=   3.5s
[CV] criterion=mse, max_depth=5, n_estimators=300 ....................
[CV] .

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   43.2s finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'criterion': ['mse'], 'max_depth': [5, 10],
                         'n_estimators': [10, 100, 300]},
             verbose=2)

In [109]:
print (grid.best_params_)

{'criterion': 'mse', 'max_depth': 10, 'n_estimators': 300}


In [110]:
model = RandomForestRegressor(**grid.best_params_)
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, n_estimators=300)

In [111]:
from sklearn.metrics import mean_squared_error
print("train mse:", mean_squared_error(y_train, model.predict(X_train)))
print("test mse:", mean_squared_error(y_test, model.predict(X_test)))

train mse: 0.0005569368726979071
test mse: 0.0021034261550762602


In [106]:
# Baseline for comparison
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)
print("train mse:", mean_squared_error(y_train, baseline_model.predict(X_train)))
print("test mse:", mean_squared_error(y_test, baseline_model.predict(X_test)))

train mse: 0.0003520935600085602
test mse: 0.00048604136164582374
