<a href="https://colab.research.google.com/github/NikuDubenco/DS-Unit-2-Classification-1/blob/master/Nicolae_Dubenco_random_forests_ordinal_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries

In [63]:
!pip install category_encoders



In [0]:
%matplotlib inline
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeClassifier
pd.set_option('display.float_format', '{:.2f}'.format)


## Import and read CSV files

In [0]:
# Merge train_feartures.csv & train_labels.csv
train = pd.merge(pd.read_csv('https://drive.google.com/uc?export=download&id=14ULvX0uOgftTB2s97uS8lIx1nHGQIB0P'),
                pd.read_csv('https://drive.google.com/uc?export=download&id=1r441wLr7gKGHGLyPpKauvCuUOU556S2f'))

# Read test_features.csv & sample_admission.csv
test = pd.read_csv('https://drive.google.com/uc?export=download&id=1wvsYl9hbRbZuIuoaLWCsW_kbcxCdocHz')
sample_submission = pd.read_csv('https://drive.google.com/uc?export=download&id=1kfJewnmhowpUo381oSn3XqsQ6Eto23XV')

# Split train into train & val
train, val = train_test_split(train, train_size=.80, test_size=.20, 
                             stratify=train['status_group'], random_state=42)

In [40]:
# Identify the dimension of the 'train' data set
train.shape

(47520, 40)

## Exploring categorical features

In [0]:
# function that will give a vision on what columns double data:
def identify_double_columns_data(X):
  """Identify the columns that contain the same data in the data set,
  type of data categorical"""
  for column in X.select_dtypes(exclude='number').columns:
    if X[column].nunique() <= 30:
      print('--------' * 10)
      print('feature displayed: >>>', column, ' <<<')
      print('                      ', len(column) * '-')
      print('value     &    times present in column')
      print(X[column].value_counts())

In [42]:
identify_double_columns_data(train)

--------------------------------------------------------------------------------
feature displayed: >>> basin  <<<
                       -----
value     &    times present in column
Lake Victoria              8137
Pangani                    7173
Rufiji                     6412
Internal                   6255
Lake Tanganyika            5194
Wami / Ruvu                4773
Lake Nyasa                 4075
Ruvuma / Southern Coast    3551
Lake Rukwa                 1950
Name: basin, dtype: int64
--------------------------------------------------------------------------------
feature displayed: >>> region  <<<
                       ------
value     &    times present in column
Iringa           4250
Shinyanga        3972
Mbeya            3703
Kilimanjaro      3512
Morogoro         3238
Arusha           2686
Kagera           2615
Mwanza           2485
Kigoma           2276
Pwani            2110
Ruvuma           2090
Tanga            2029
Dodoma           1768
Singida          1678
Tabora    

#### Analysis of columns: source, source_type, source_class

In [13]:
train[['source', 'source_type', 'source_class']].isin(['unknown']).sum()

source           59
source_type       0
source_class    234
dtype: int64

In [14]:
train[['source', 'source_type', 'source_class']].isin(['other']).sum()

source          175
source_type     234
source_class      0
dtype: int64

#### Analyze columns 'scheme_management', 'management', 'management_group'

In [15]:
train[['scheme_management', 'management', 'management_group']].isin(['unknown']).sum()

scheme_management      0
management           453
management_group     453
dtype: int64

In [16]:
train[['scheme_management', 'management', 'management_group']].isin(['other']).sum()

scheme_management      0
management           669
management_group     749
dtype: int64

In [18]:
train[['scheme_management', 'management', 'management_group']].isna().sum()

scheme_management    3128
management              0
management_group        0
dtype: int64

In [17]:
train[['scheme_management', 'management', 'management_group']]

Unnamed: 0,scheme_management,management,management_group
43360,VWC,vwc,user-group
7263,WUA,wua,user-group
2486,VWC,vwc,user-group
313,,vwc,user-group
52726,VWC,vwc,user-group
8558,VWC,vwc,user-group
2559,VWC,vwc,user-group
54735,VWC,vwc,user-group
25763,VWC,vwc,user-group
44540,VWC,vwc,user-group


#### Analyze columns 'extraction_type', 'extraction_type_group', 'extraction_type_class'

In [19]:
train[['extraction_type', 'extraction_type_group', 'extraction_type_class']].isna().sum()

extraction_type          0
extraction_type_group    0
extraction_type_class    0
dtype: int64

In [23]:
train[['extraction_type', 'extraction_type_group', 'extraction_type_class']].nunique()

extraction_type          18
extraction_type_group    13
extraction_type_class     7
dtype: int64

In [24]:
train[['extraction_type', 'extraction_type_group', 'extraction_type_class']].isin(['unknown']).sum()

extraction_type          0
extraction_type_group    0
extraction_type_class    0
dtype: int64

In [25]:
train[['extraction_type', 'extraction_type_group', 'extraction_type_class']].isin(['other']).sum()

extraction_type          5193
extraction_type_group    5193
extraction_type_class    5193
dtype: int64

#### Analyzing the results above, I decided the next actions:
1. Delete column  'recorded_by'. It's contains the same value in every cell.

2. Delete column 'quality_group'. Have same data as 'water_quality',  but the last have 2 values divided in 4 values 

3. Doubled data in the columns 'quantity_group' and 'quantity'. Drop 'quantity_group' column

4. Delete column 'waterpoint_type_group'. Have same data as 'waterpoint_type',  but the last have 1 category divided in 2 categories

5. Columns 'payment' and 'payment_type' double the same values and I will drop the 'payment column'

6. Columns  'source', 'source_type', 'source_class' contain the same values. Delete 'source_type', 'source_class'

7. Delete column 'scheme_management' it's look like it double the 'management', and the last more or less is it doubled by 'management_group' column what will be deleted too

8. Delete columns 'extraction_type_group', 'extraction_type_class' that double the column 'extraction_type' (the last have more type of value, and could be more useful)

#### Conclusion
Delete columns:  recorded_by, quality_group, quantity_group, waterpoint_type_group, payment column, source_type, source_class, scheme_management, management_group, extraction_type_group, extraction_type_class

In [0]:
def wrangle(X):
  """Wrangle train, validate and test sets in the same way"""
  X = X.copy()
  
  # Convert data_recorded to datetime
  X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
  
  # Extract components from date_recorded, then drop the original column
  X['year_recorded'] = X['date_recorded'].dt.year
  X['month_recorded'] = X['date_recorded'].dt.month
  X['day_recorded'] = X['date_recorded'].dt.day
  X = X.drop(columns='date_recorded')
  
  # Delete the columns that double the data
  X = X.drop(columns=['recorded_by', 'quality_group', 'quantity_group', 
              'waterpoint_type_group', 'payment', 'source_type', 
              'source_class', 'scheme_management', 'management_group', 
              'extraction_type_group', 'extraction_type_class'])
  
  # Delete 'id' column doesn't contain information as feature
  X.drop(columns='id', inplace=True)
  
  # About 3% of the time, latitude has small values near zero,
  # outside Tanzania, so we'll treat these like null values
  X['latitude'] = X['latitude'].replace(-2e-08, np.nan)

  # When columns have zeros and shouldn't, they are like null values
  for col in ['construction_year', 'longitude', 'latitude', 'gps_height', 'population']:
    X[col] = X[col].replace(0, np.nan)
  
  # Calculate the age of the pump at the moment of recording
  X['age'] = X['year_recorded'] - X['construction_year']
  
  X.drop(columns='construction_year', inplace=True)
  
  return X

#### Wrangle datasets

In [0]:
train = wrangle(train)
val = wrangle(val)
test = wrangle(test)

In [50]:
train.shape, val.shape, test.shape

((47520, 32), (11880, 32), (14358, 31))

#### Define features and target

In [0]:
# The status_group column is the target
target = 'status_group'

# Get a dataframe with all train columns except the target
train_features = train.drop(columns=[target])

# Get a list of the numeric features
numeric_features = train_features.select_dtypes(include='number').columns.tolist()

# Get a series with the cardinality of the nonnumeric features
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Get a list of all categorical features with cardinality <= 50
categorical_features = cardinality[cardinality <= 50].index.tolist()

# Combine the lists 
features = numeric_features + categorical_features

In [0]:
# Arrange data into X features matrix and y target vector 
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

## Random Forest Classifier

In [73]:
%%time
from sklearn.ensemble import RandomForestClassifier

pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    SimpleImputer(strategy='median'), 
    RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
)

# Fit on train, score on val
model = pipeline.fit(X_train, y_train)
print('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.8085858585858586
CPU times: user 39.5 s, sys: 417 ms, total: 39.9 s
Wall time: 21.6 s


In [0]:
y_pred = model.predict(X_test)

In [22]:
X_train.shape

(47520, 24)

In [23]:
X_test.shape

(14358, 24)

## Ordinal encoding

In [0]:
# Arrange data into X features matrix and y target vector
X_train = train.drop(columns=target)
y_train = train[target]
X_val = val.drop(columns=target)
y_val = val[target]

In [48]:
X_train.shape, X_val.shape, X_test.shape

((47520, 31), (11880, 31), (14358, 24))

In [95]:
%%time

pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='median'), 
    RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
)

# Fit on train, score on val
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(test)
print('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.8111952861952862
CPU times: user 39.9 s, sys: 144 ms, total: 40.1 s
Wall time: 20.9 s


In [96]:
%%time

pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='mean'), 
    RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
)

# Fit on train, score on val
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(test)
print('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.8092592592592592
CPU times: user 41.7 s, sys: 146 ms, total: 41.8 s
Wall time: 21.7 s


In [98]:
%%time

pipeline = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='most_frequent'), 
    RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
)

# Fit on train, score on val
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(test)
print('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.8083333333333333
CPU times: user 39.7 s, sys: 145 ms, total: 39.8 s
Wall time: 20.7 s


In [26]:
X_val.shape

(11880, 31)

In [0]:
submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('submission-06.csv', index=False)

In [0]:
from google.colab import files
files.download('submission-06.csv')

## XGBoost Classifier

In [78]:
!pip install xgboost



In [0]:
from xgboost import XGBClassifier

In [0]:
encoder = pipeline.named_steps['ordinalencoder']
imputer = pipeline.named_steps['simpleimputer']
X_train_encoded = encoder.transform(X_train)
X_train_imputed = imputer.transform(X_train_encoded)

In [81]:
model = XGBClassifier()
model.fit(X_train_encoded, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
X_val_encoded = encoder.transform(X_val)
X_val_imputed = imputer.transform(X_val_encoded)

In [83]:
model.score(X_val_encoded, y_val)

0.7428451178451179