In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training dataset from the CSV file next to this notebook
df_train = pd.read_csv(filepath_or_buffer="./properties_colombia_train.csv")

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Render floats in full, fixed to 2 decimal places
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# (rows, columns) of the freshly loaded training data
df_train.shape

## Henry's requirement
First, add the 'target' column derived from the 'price' feature.<br>
Null values will be replaced by their mean value.<br>
Then, set it up.

In [None]:
# Label each property relative to the average price:
# 'expensive' when price >= mean price, otherwise 'cheap'.
# NOTE(review): a missing price compares False against the mean, so such rows
# are labelled 'cheap' -- confirm this is the intended handling of null prices.
price_avg = df_train['price'].mean()
df_train['target'] = np.where(df_train['price'] >= price_avg, 'expensive', 'cheap')

In [None]:
# Check the dataset dimensions after adding the 'target' column
df_train.shape

In [None]:
# Convert the new column to numeric labels: 1 for 'expensive', 0 otherwise
df_train['target'] = df_train['target'].eq('expensive').astype(int)

In [None]:
# Count how many rows fall into each 'target' value
df_train.target.value_counts()

Drop 'price' column

In [None]:
# 'price' was only needed to derive 'target'; remove it from the dataset
df_train = df_train.drop(['price'], axis=1)

In [None]:
# List the remaining column names ('price' should no longer be present)
df_train.columns

### Pre-processing
Visual analysis

In [None]:
# Summary statistics of the numerical columns
df_train.describe()

In [None]:
# General dataset info: column dtypes and non-null counts
df_train.info()

In [None]:
# Correlation between 'target' and the numerical columns
# ('price' itself has already been dropped at this point).
df_train = df_train[['Unnamed: 0','id', 'ad_type', 'start_date', 'end_date', 'created_on',
       'lat', 'lon', 'l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'rooms', 'bedrooms',
       'bathrooms', 'surface_total', 'surface_covered', 'currency',
       'price_period', 'title', 'description', 'property_type',
       'operation_type', 'geometry','target']]

# The frame still contains text columns; DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so compute the matrix on numeric
# columns only.
corr = df_train.select_dtypes(include='number').corr()

# Annotated heatmap of the correlation matrix, drawn on the axes created here
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(corr,
        xticklabels=corr.columns,
        yticklabels=corr.columns,
        annot=True,
        ax=ax);

Based on the Pearson correlation figure above, I've temporarily chosen to keep the numerical features 'lat', 'lon' and 'bathrooms'.

In [None]:
# Remove the columns that were not kept after the correlation analysis
# (an ordered list is used so the drop reads deterministically)
df_train = df_train.drop(
    ['Unnamed: 0', 'id', 'ad_type', 'start_date', 'end_date', 'created_on',
     'rooms', 'bedrooms', 'surface_total', 'surface_covered'],
    axis=1,
)

df_train.head(3)

#### After the visual analysis, I've decided to drop the following features:<br>
a. 'l1', because all the properties are located in Colombia.<br>
b. 'currency' and 'price_period', because they are irrelevant for classifying property prices.<br>
c. 'title' and 'description', because I won't implement any Natural Language model.<br>
d. 'operation_type', because every record in the dataset is a property sale.<br>
e. 'geometry', because I am going to use the 'lat' and 'lon' features instead.

In [None]:
# Remove the columns judged irrelevant for the price classification
df_train = df_train.drop(
    ['l1', 'currency', 'title', 'price_period', 'description', 'operation_type', 'geometry'],
    axis=1,
)

df_train.head(3)

### Graphically visualize missing feature values

In [None]:
# Python library to visualise missing values
import missingno as msno

# Per-column bar chart of data completeness
msno.bar(df_train, figsize=(10,5), color='lightblue')
# plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
# Percentage of missing values in the 'l4' column
df_train['l4'].isna().mean() * 100

Based on the previous graphical analysis, and supported by the percentage of missing values, I decided to drop the columns 'l4', 'l5' and 'l6'.

In [None]:
# Remove the location levels that are mostly missing
df_train = df_train.drop(['l4', 'l5', 'l6'], axis=1)

### Features chosen to feed the ML model

In [None]:
# Give the kept location features descriptive names
df_train = df_train.rename(
    columns=dict(l2='department', l3='city', lat='latitude', lon='longitude')
)

In [None]:
# Preview the first rows with the renamed columns
df_train.head()

## Dealing with NaN values

### Analysis stage

In [None]:
# Number of null values per column
df_train.isna().sum()

In [None]:
# Same completeness check, shown graphically for the selected features
msno.bar(df_train, figsize=(8,4), color='pink')
# plt.show has to be called; the bare name only echoes the function object
plt.show()

In [None]:
# Null values per column, shown again right before imputing them
df_train.isna().sum()

In [None]:
# Impute missing coordinates with the mean of their own column
df_train = df_train.fillna({
    'latitude': df_train['latitude'].mean(),
    'longitude': df_train['longitude'].mean(),
})

In [None]:
# Impute missing 'bathrooms' values with the column median
df_train = df_train.fillna({'bathrooms': df_train['bathrooms'].median()})

In [None]:
# Drop 'city': some of its values differ from those found in the test dataset
df_train = df_train.drop(columns=['city'])

In [None]:
# Count rows per missing-value pattern across the whole dataset
# (each distinct True/False combination of the null mask is tallied)
df_train.isna().value_counts()

In [None]:
# Column dtypes and non-null counts after handling the missing values
df_train.info()

In [None]:
# Preview the first rows after handling the missing values
df_train.head()

### Dealing with categorical features

I'm going to use OneHotEncoder to handle the categorical feature values,<br>
applying it to each categorical feature in the dataset.

In [None]:
# Import encoder
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the 'department' feature
ohe = OneHotEncoder()
codif = ohe.fit_transform(df_train[['department']])

# Build the indicator columns (one per category) on df_train's own index:
# pd.concat(axis=1) aligns rows by index label, so a fresh 0..n-1 index would
# misalign or NaN-pad the rows whenever df_train's index is not the default one.
new_cols = pd.DataFrame(codif.toarray(), columns=ohe.categories_[0], index=df_train.index)

# Append the indicator columns to the dataset
df_train = pd.concat([df_train, new_cols], axis=1)

df_train.shape

In [None]:
# One-hot encode the 'property_type' feature
ohe = OneHotEncoder()
codif = ohe.fit_transform(df_train[['property_type']])

# Build the indicator columns (one per category) on df_train's own index:
# pd.concat(axis=1) aligns rows by index label, so a fresh 0..n-1 index would
# misalign or NaN-pad the rows whenever df_train's index is not the default one.
new_cols = pd.DataFrame(codif.toarray(), columns=ohe.categories_[0], index=df_train.index)

# Append the indicator columns to the dataset
df_train = pd.concat([df_train, new_cols], axis=1)

df_train.shape

In [None]:
# Display the dataframe after appending the one-hot encoded columns
df_train

In [None]:
# Extract the 'target' column; pop() also removes it from df_train in place,
# so df_train holds only features from here on
target = df_train.pop('target')
target

In [None]:
# Remove the original categorical columns now that they are one-hot encoded
df_train = df_train.drop(['department', 'property_type'], axis=1)

In [None]:
# Dimensions of the final feature matrix
df_train.shape

In [None]:
# Feature matrix (X) and labels (y) used to train the ML model
X, y = df_train, target

## ML Model

I´ve chosen Decision Tree Classifier to make predictions.

In [None]:
# Import the classifier and train it on the whole training set
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(
    max_depth=25,      # upper bound on tree depth
    random_state=42,   # fixed seed for reproducible fits
)
# Fit on plain arrays (column labels are not passed to the estimator)
clf.fit(X.to_numpy(), y.to_numpy())

In [None]:
# Inspect the fitted model: class labels, number of classes,
# max_features_ and the per-feature importances (one item per line)
print(
    clf.classes_,
    clf.n_classes_,
    clf.max_features_,
    clf.feature_importances_,
    sep='\n',
)

In [None]:
# Predict on the same training features the model was fitted on
y_pred = clf.predict(X.to_numpy())

In [None]:
# Accuracy of the predictions against the true labels.
# y_pred was produced from the training features, so this is the
# training-set accuracy rather than an estimate on unseen data.
from sklearn.metrics import accuracy_score

print(accuracy_score(y_true=y, y_pred=y_pred))

In [None]:
# Apply cross validation
from sklearn.model_selection import cross_validate
from IPython.display import clear_output