# Predicting Price @ Haensel AMS

This data project has been used as a take-home assignment in the recruitment process for the data science positions at Haensel AMS.

## Data Description
The data are provided in the `sample.csv` file. The target variable is `price`. There are 7 attributes and a total of 10,000 records.

Steps:
1. Perform EDA and split the data into training, validation, and test sets;
2. Fit a base model;
3. Fit multiple ML models to compare the results;
4. Tune the selected model.

#### conda activate ml_env

In [None]:
# Load the packages
import math
import matplotlib.pyplot as plt
import seaborn as sns
import shap as shap
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings

# Libraries for missing values:
import missingno as msno   
# !pip install fancyimpute --target=/kaggle/working/
#import fancyimpute
warnings.filterwarnings("ignore")





  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# 2. Create interactive widgets in Jupyter notebooks
# Use Jupyter Widges Package - https://ipywidgets.readthedocs.io/en/stable/index.html 
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
style = {'description_width': 'initial'}

# Configure pandas
pd.set_option('display.max_colwidth', None) # do not truncate column values
pd.set_option('display.max_rows', 200)      # max number of rows to display 
pd.set_option('display.max_columns', 300)   # max number of columns to display 

In [7]:
# Load the data
hansel_data = pd.read_csv("datasets/sample.csv")

# Collect the column names by dtype.
# The target column is removed from the numerical features by name rather than
# with a positional `[:-1]` slice, so the list stays correct even if `price`
# is not the last numeric column of the file.
categorical_features = list(hansel_data.select_dtypes(include=['object', 'category']).columns)
numerical_features = [
    column
    for column in hansel_data.select_dtypes(include=['float64', 'int64']).columns
    if column != 'price'
]

# Last expression of the cell: rich display of the loaded frame.
hansel_data


Unnamed: 0,loc1,loc2,para1,dow,para2,para3,para4,price
0,0,01,1,Mon,662,3000.0,3.8,73.49
1,9,99,1,Thu,340,2760.0,9.2,300.00
2,0,04,0,Mon,16,2700.0,3.0,130.00
3,4,40,1,Mon,17,12320.0,6.4,365.00
4,5,50,1,Thu,610,2117.0,10.8,357.50
...,...,...,...,...,...,...,...,...
9995,9,98,3,Fri,386,5000.0,12.0,460.00
9996,7,74,1,Thu,386,3250.0,8.0,325.00
9997,0,06,0,Tue,190,8856.0,5.6,133.33
9998,7,74,3,Fri,717,5000.0,13.6,820.00


In [8]:
# EDA: count fully duplicated rows and remove them from `hansel_data` in place.
duplicate_mask = hansel_data.duplicated()
num_duplicates = duplicate_mask.sum()

if num_duplicates == 0:
    # Nothing to remove; just report the count.
    print(f'{num_duplicates} duplicate rows detected.')
else:
    print(f'There are {num_duplicates} duplicate rows, removing them from the data.')
    # In-place drop: the same DataFrame object is modified (index is not reset).
    hansel_data.drop_duplicates(inplace=True)

There are 107 duplicate rows, removing them from the data.


In [14]:
# Descriptive statistics: one row per column of `hansel_data`.
# Per-column metadata first (dtype, non-null count, distinct values, nulls) ...
desc = pd.DataFrame(
    {
        'type': hansel_data.dtypes,
        'count': hansel_data.count(),
        'nunique': hansel_data.nunique(),
        'null': hansel_data.isnull().sum(),
    },
    index=list(hansel_data),
)
desc['%null'] = desc['null'] / len(hansel_data) * 100

# ... then the numeric summary from describe(), transposed so columns become rows.
# Its own 'count' is dropped because the table already carries one.
numeric_summary = hansel_data.describe().T.drop(columns='count')
desc = pd.concat([desc, numeric_summary], axis=1)

# Displayed result: sorted by dtype, then null count, with a per-column colour gradient.
desc.sort_values(by=['type', 'null']).style.background_gradient(axis=0)

Unnamed: 0,type,count,nunique,null,%null,mean,std,min,25%,50%,75%,max
para1,int64,9893,13,0,0.0,1.384009,3.518367,0.0,1.0,1.0,1.0,337.0
para2,int64,9893,1016,0,0.0,447.895987,221.453277,16.0,302.0,434.0,582.0,2554.0
para3,float64,9893,4359,0,0.0,9452.867704,7988.784764,200.0,2857.0,6340.0,15000.0,34782.0
para4,float64,9893,243,0,0.0,8.413242,4.607946,1.0,4.0,7.2,13.6,27.2
price,float64,9893,932,0,0.0,433.178061,278.156427,50.73,250.0,370.0,550.0,5700.0
loc1,object,9893,12,0,0.0,,,,,,,
loc2,object,9893,107,0,0.0,,,,,,,
dow,object,9893,7,0,0.0,,,,,,,


In [17]:
# Plotting the categorical variables
def count_plot(var, dataframe1, dataframe2=None):
    """Draw horizontal count plots of a categorical column.

    One panel is drawn per supplied DataFrame, so no empty panel is left when
    only ``dataframe1`` is given. When ``dataframe2`` is supplied, both panels
    list the categories in the same order so they can be compared side by side.

    Parameters
    ----------
    var : str
        Name of the column to count.
    dataframe1 : pandas.DataFrame
        Data for the first panel, titled "Training Set - <var>".
    dataframe2 : pandas.DataFrame, optional
        Data for a second panel, titled "Testing Set - <var>". Omitted by
        default, in which case a single panel is drawn.

    Returns
    -------
    None
        The figure is rendered through matplotlib as a side effect.
    """
    frames = [dataframe1] if dataframe2 is None else [dataframe1, dataframe2]
    panel_labels = ['Training Set', 'Testing Set']

    # Union of the non-null categories across all supplied frames, in order of
    # first appearance; used as the shared bar order for every panel.
    all_categories = pd.Series(pd.concat([frame[var] for frame in frames]).unique()).dropna()

    # 8 x 6 inches per panel (same panel size as the original 16 x 6 two-panel
    # grid); squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(1, len(frames), figsize=(8 * len(frames), 6), squeeze=False)
    for ax, frame, label in zip(axes[0], frames, panel_labels):
        sns.countplot(y=var, data=frame, ax=ax, order=all_categories)
        ax.set_title(f'{label} - {var}')
    plt.tight_layout()

def inter_cat_plot(x):
    """Callback for the interactive dropdown: count-plot column ``x`` of ``hansel_data``."""
    return count_plot(var=x, dataframe1=hansel_data)

# Dropdown listing the categorical columns; the selected name is passed to
# `inter_cat_plot` as `x` each time the selection changes.
dropdown_settings = {
    'options': categorical_features,
    'description': "Categorical Variable:",
    'style': style,
}
widget_cat_plot = widgets.Dropdown(**dropdown_settings)

# Wire the dropdown to the plotting callback and render the interactive control.
interact(inter_cat_plot, x=widget_cat_plot)

In [None]:
# Split the data
# The original call referenced an undefined `data` object; the features and the
# target are taken from `hansel_data` instead.
target = 'price'
RANDOM_STATE = 42  # fixed seed so the split is identical on every re-run

features = hansel_data.drop(columns=[target])  # every column except the target
labels = hansel_data[target]

# Default train/test proportions of train_test_split are kept.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, random_state=RANDOM_STATE
)


In [None]:
# Fit the base model

Source:

https://platform.stratascratch.com/data-projects/predicting-price

https://scikit-learn.org/stable/supervised_learning.html