# Africa Soil Property Prediction Challenge 

**SOC, pH, Ca, P, Sand** are the five target variables for predictions. The data have been monotonically transformed from the original measurements and thus include negative values.

In [1]:
# Competition page: https://www.kaggle.com/c/afsis-soil-properties

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error,auc,explained_variance_score




In [3]:
# Read the competition training set (path is relative to the working
# directory) and preview its first five rows.
filename = 'training.csv'
train_frame = pd.read_csv(filepath_or_buffer=filename)
train_frame.head()

Unnamed: 0,PIDN,m7497.96,m7496.04,m7494.11,m7492.18,m7490.25,m7488.32,m7486.39,m7484.46,m7482.54,...,REF7,RELI,TMAP,TMFI,Depth,Ca,P,pH,SOC,Sand
0,XNhoFZW5,0.302553,0.301137,0.299748,0.300354,0.302679,0.303799,0.301702,0.298936,0.298126,...,-0.646673,1.687734,0.190708,0.056843,Topsoil,-0.295749,-0.041336,-1.129366,0.353258,1.269748
1,9XNspFTd,0.270192,0.268555,0.266964,0.267938,0.271013,0.272346,0.26987,0.266976,0.266544,...,-0.646673,1.687734,0.190708,0.056843,Subsoil,-0.387442,-0.231552,-1.531538,-0.264023,1.692209
2,WDId41qG,0.317433,0.316265,0.314948,0.315224,0.316942,0.317764,0.316067,0.313874,0.313301,...,-0.814516,1.80666,0.190708,0.056843,Topsoil,-0.248601,-0.224635,-0.259551,0.064152,2.091835
3,JrrJf1mN,0.261116,0.259767,0.258384,0.259001,0.26131,0.262417,0.260534,0.258039,0.257246,...,-0.814516,1.80666,0.190708,0.056843,Subsoil,-0.332195,-0.318014,-0.577548,-0.318719,2.118477
4,ZoIitegA,0.260038,0.258425,0.256544,0.25703,0.259602,0.260786,0.258717,0.256352,0.255902,...,-0.780242,0.430513,0.190708,0.056843,Topsoil,-0.43835,-0.01021,-0.699135,-0.310905,2.164148



**Check number of missing values**

In [4]:
# Dimensions of the loaded frame as (rows, columns).
train_frame.shape

(1157, 3600)

In [5]:
# Summarise the index range, column count, dtypes and memory usage.
train_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1157 entries, 0 to 1156
Columns: 3600 entries, PIDN to Sand
dtypes: float64(3598), object(2)
memory usage: 31.8+ MB


In [6]:
# Count the missing values in each column.
train_frame.isnull().sum()

PIDN        0
m7497.96    0
m7496.04    0
m7494.11    0
m7492.18    0
m7490.25    0
m7488.32    0
m7486.39    0
m7484.46    0
m7482.54    0
m7480.61    0
m7478.68    0
m7476.75    0
m7474.82    0
m7472.89    0
m7470.97    0
m7469.04    0
m7467.11    0
m7465.18    0
m7463.25    0
m7461.32    0
m7459.39    0
m7457.47    0
m7455.54    0
m7453.61    0
m7451.68    0
m7449.75    0
m7447.82    0
m7445.89    0
m7443.97    0
           ..
m615.188    0
m613.259    0
m611.331    0
m609.402    0
m607.474    0
m605.545    0
m603.617    0
m601.688    0
m599.76     0
BSAN        0
BSAS        0
BSAV        0
CTI         0
ELEV        0
EVI         0
LSTD        0
LSTN        0
REF1        0
REF2        0
REF3        0
REF7        0
RELI        0
TMAP        0
TMFI        0
Depth       0
Ca          0
P           0
pH          0
SOC         0
Sand        0
Length: 3600, dtype: int64

In [7]:
# Separate the five regression targets from the predictors, discard the
# sample identifier, and integer-encode any remaining text columns.
labelEncoder = LabelEncoder()
output_columns = ['Ca', 'P', 'pH', 'SOC', 'Sand']
output_labels = train_frame[output_columns].values

# NOTE(review): the frame is modified in place, so this cell cannot be
# re-run without reloading the data first.
train_frame.drop(output_columns + ['PIDN'], axis=1, inplace=True)

column_dtypes = train_frame.dtypes
cat_columns = column_dtypes[column_dtypes == 'object'].index
for col in cat_columns:
    # The single encoder instance is refitted for each column it is applied to.
    train_frame[col] = labelEncoder.fit_transform(train_frame[col])
   

**To reduce the prediction error and avoid overfitting, we use PCA to reduce the dimensionality of the feature space. It also speeds up training.**

In [8]:
# Fit a PCA that keeps every component to see how much variance each one
# explains, then display the cumulative explained-variance ratio.
pca = PCA(whiten=True).fit(train_frame)
explained_ratio = pca.explained_variance_ratio_
variance = pd.DataFrame(explained_ratio)
explained_ratio.cumsum()

array([0.70373743, 0.79193861, 0.85345842, ..., 1.        , 1.        ,
       1.        ])

In [9]:
# Project the features onto the first 5 principal components (whitened).
# With so few components requested from a matrix this large, scikit-learn's
# default svd_solver='auto' can select the randomized solver, so random_state
# is pinned to keep the projection (and every downstream score) reproducible.
# NOTE(review): the PCA is fitted on every row before the train/test split
# that follows, so the held-out rows influence the projection; fit on the
# training rows only if an unbiased hold-out estimate is required.
pca = PCA(n_components=5, whiten=True, random_state=42)
pca = pca.fit(train_frame)
dataPCA = pca.transform(train_frame)

In [10]:
# Dimensions of the PCA-projected feature matrix as (rows, components).
dataPCA.shape

(1157, 5)

In [11]:
 
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataPCA,
    output_labels,
    test_size=0.1,
    random_state=42,
)

In [12]:
# Show the dimensions of each split, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(1041, 5)
(1041, 5)
(116, 5)
(116, 5)


### First we try the Lasso and Ridge linear models, which add an L1 and an L2 *regularization penalty*, respectively, to linear regression to avoid overfitting

In [13]:
# Fit a ridge regression with default settings on the training split and
# predict every target column for the held-out rows.
ridge_model = Ridge().fit(X_train, y_train)
ridge_predict = ridge_model.predict(X_test)

In [14]:
# Report hold-out error for the ridge model; both metrics are computed over
# the multi-column target array.
ridge_mse = mean_squared_error(y_test, ridge_predict)
ridge_r2 = ridge_model.score(X_test, y_test)
print("Mean squared error: %.2f" % ridge_mse)
print("R2 Score : ", ridge_r2)


Mean squared error: 0.81
R2 Score :  0.39596206551960667


In [15]:
# Fit a lasso regression with default settings on the training split and
# predict every target column for the held-out rows.
# NOTE(review): the near-zero R2 reported for this model suggests the default
# penalty strength may be shrinking all coefficients to zero; consider
# tuning alpha.
lasso_model = Lasso().fit(X_train, y_train)
lasso_predict = lasso_model.predict(X_test)

In [16]:
# Report hold-out error for the lasso model; both metrics are computed over
# the multi-column target array.
lasso_mse = mean_squared_error(y_test, lasso_predict)
lasso_r2 = lasso_model.score(X_test, y_test)
print("Mean squared error: %.2f" % lasso_mse)
print("R2 Score : ", lasso_r2)


Mean squared error: 1.37
R2 Score :  -0.014371932311444074


### Let's try a Random Forest Regressor to see whether it performs better

In [17]:
# Train a random forest on the PCA features; the seed is fixed so the
# ensemble is reproducible.
rf_params = dict(
    n_estimators=51,
    min_samples_leaf=5,
    min_samples_split=3,
    random_state=42,
)
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=5, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=51, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [18]:
# Predict the target columns for the held-out rows with the fitted forest.
rf_predict = rf.predict(X_test)

In [19]:
# Report hold-out error for the random forest; both metrics are computed
# over the multi-column target array.
rf_mse = mean_squared_error(y_test, rf_predict)
rf_r2 = rf.score(X_test, y_test)
print("Mean squared error: %.2f" % rf_mse)
print("R2 Score : ", rf_r2)


Mean squared error: 0.53
R2 Score :  0.6032224592615675
