# Data Pre-Processing

#### Import Packages and CSV

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

pd.pandas.set_option("display.max_columns", None)
# Create Dataframe
df = pd.read_csv(r"Mall_Customers.csv")
# Print shape of dataset
print(df.shape)

(200, 5)


## Data Cleaning

### Handling Missing values

* Handling Missing values 
* Handling Duplicates
* Check data type
* Understand the dataset

#### Check Null Values

In [2]:
##these are the features with nan value
features_with_na=[features for features in df.columns if df[features].isnull().sum()>=1]
for feature in features_with_na:
    print(feature,np.round(df[feature].isnull().mean()*100,5), '% missing values')

In [None]:
features_with_na

* **There are no null values in the dataset**

### 3.2 Other Data Cleaning steps

**Handling Duplicates**

In [3]:
df.duplicated().sum()

0

* **No Duplicates in the dataset**

**Remove CustomerID from the dataset as it cannot used in Model Training**

In [4]:
df.drop('CustomerID', inplace=True, axis=1)

# Feature Engineering

## Feature Extraction

In [5]:
df.head()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


### Type of Features

**Numeric Features**

In [6]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))

Num of Numerical Features : 3


**Categorical Features**

In [7]:
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))

Num of Categorical Features : 1


**Discrete features**

In [8]:
discrete_features=[feature for feature in num_features if len(df[feature].unique())<=25]
print('Num of Discrete Features :',len(discrete_features))

Num of Discrete Features : 0


**Continues Features**

In [None]:
continuous_features=[feature for feature in num_features if feature not in discrete_features]
print('Num of Continuous Features :',len(continuous_features))

### Split X and Y

## Feature Encoding and Scaling

 **One Hot Encoding for Columns which had lesser unique values and not ordinal**
* One hot encoding is a process by which categorical variables are converted into a form that could be provided to ML algorithms to do a better job in prediction.

**Ordinal Encoding for Columns which has many unique categories** 
* Ordinal encoding is used here as label encoder is supported for column transformer.
* Ordinal encoding is used for Ordinal Variable. Variable comprises a finite set of discrete values with a ranked ordering between values.

**Label encoding Gender column**

In [None]:
# If the gender column has Male it is encoded as 1 others as 0
from sklearn import preprocessing
le1 = preprocessing.LabelEncoder()
df['Gender'] =le1.fit_transform(df['Gender'])

# Regression

## Train Test Split
- The train-test split procedure is used to estimate the performance of machine learning algorithms when they are used to make predictions on data not used to train the model.

- It is a fast and easy procedure to perform, the results of which allow you to compare the performance of machine learning algorithms.

In [11]:
from sklearn.model_selection import  train_test_split
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((160, 3), (40, 3))

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error 

In [13]:
def evaluate_clf(true, predicted):
    mae = mean_absolute_error(true, predicted) # Calculate Accuracy
    return mae

In [14]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Support Vector Regressor": SVR(),
}

In [15]:
# Create a function which can evaluate models and return a report 
def evaluate_models(X, y, models):
    '''
    This function takes in X and y and models dictionary as input
    It splits the data into Train Test split
    Iterates through the given model dictionary and evaluates the metrics
    Returns: Dataframe which contains report of all models metrics with cost
    '''
    # separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    
    models_list = []
    accuracy_list = []
    
    
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) # Train model

        # Make predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Training set performance
        mae_train_error=evaluate_clf(y_train ,y_train_pred)


        # Test set performance
        mae_test_error=evaluate_clf(y_test, y_test_pred)

        print(list(models.keys())[i])
        models_list.append(list(models.keys())[i])

        print('Model performance for Training set')
        print("- MAE: {:.4f}".format(mae_train_error))
       
        print('----------------------------------')

        print('Model performance for Test set')
        print('- MAE: {:.4f}'.format(mae_test_error))
        accuracy_list.append(mae_test_error)
        print('='*35)
        print('\n')
        
    report=pd.DataFrame(list(zip(models_list, accuracy_list)), columns=['Model Name', 'Accuracy']).sort_values(by=['Accuracy'], ascending=False)
        
    return report

## Model Training 

In [16]:
base_model_report =evaluate_models(X=X, y=y, models=models)

Linear Regression
Model performance for Training set
- MAE: 20.8651
----------------------------------
Model performance for Test set
- MAE: 18.1514


Decision Tree Regressor
Model performance for Training set
- MAE: 0.5875
----------------------------------
Model performance for Test set
- MAE: 17.0000


Support Vector Regressor
Model performance for Training set
- MAE: 20.8281
----------------------------------
Model performance for Test set
- MAE: 18.3155




**Results of All Models**

In [17]:
base_model_report

Unnamed: 0,Model Name,Accuracy
2,Support Vector Regressor,18.315538
0,Linear Regression,18.151395
1,Decision Tree Regressor,17.0


## Best Model is Support Vector Regressor (SVR) with Accuracy 18.31%