# Data Science 
This notebook apply different Machine Learning algorithms to learn and generate prediction data from the original dataset, help us to predict the future revenue from what we know from our customer information

## Library 

In [19]:
import pandas as pd
import numpy as np 
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error as mea
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor as XGBR

import warnings 
#Ignore all warnings
warnings.filterwarnings('ignore')


In [4]:
#change to corrected the directory 
os.chdir('..')

# Import the dataset
data = pd.read_csv('./Data_Engineer/data/marketing.csv')
data.drop(columns = ['Unnamed: 0'], inplace= True)

In [5]:
data.head()

Unnamed: 0,gender,age,occupation numeric,city_category,stay_in_current_city_years,marital_status,product_category_1,product_category_2,product_category_3,purchase_amount
0,F,0-17,10,A,2,0,1,6,14,15200
1,M,46-50,7,B,2,1,1,8,17,19215
2,M,26-35,20,A,1,1,1,2,5,15665
3,F,51-55,9,A,1,0,5,8,14,5378
4,F,51-55,9,A,1,0,2,3,4,13055


In [6]:
#There is no missing Value in this dataset 
data.isnull().sum()

gender                        0
age                           0
occupation numeric            0
city_category                 0
stay_in_current_city_years    0
marital_status                0
product_category_1            0
product_category_2            0
product_category_3            0
purchase_amount               0
dtype: int64

Apply Label encoding for the categorical columns , turn them into numeric values

In [7]:
#Apply label encoding the categorical columns 
cols = ['gender','age','city_category','stay_in_current_city_years']
label = LabelEncoder()
for col in cols:
    data[col] = label.fit_transform(data[col])

In [8]:
data.head()

Unnamed: 0,gender,age,occupation numeric,city_category,stay_in_current_city_years,marital_status,product_category_1,product_category_2,product_category_3,purchase_amount
0,0,0,10,0,2,0,1,6,14,15200
1,1,4,7,1,2,1,1,8,17,19215
2,1,2,20,0,1,1,1,2,5,15665
3,0,5,9,0,1,0,5,8,14,5378
4,0,5,9,0,1,0,2,3,4,13055


In [9]:
#Assign y = purchase_amount, because y is a targetd variable, the rest of the columns assign in X
X = data.drop(columns =['purchase_amount'])
y = data['purchase_amount']

In [10]:
#Split the dataset into training and testing set 
X_train, X_test, y_train, y_test = train_test_split(X,y ,random_state= 42, test_size= 0.2)


# Base Model

We choose Linear Regression As base model, because it simplicity and easy to understand.

In [26]:

base_model = LinearRegression()
base_model.fit(X_train,y_train)
base_model_prediction = base_model.predict(X_test)

#Cross Validation 
cv_score = cross_val_score(base_model, X, y, scoring = 'neg_mean_squared_error', cv = 3)
cv_score = np.abs(np.mean(cv_score))
base_RMSE = np.sqrt(mean_squared_error(y_test,base_model_prediction))

print('Results')
print('RMSE', base_RMSE )
print('CV Score: ', np.sqrt(cv_score))

Results
RMSE 4641.144105485624
CV Score:  4632.738083476819


## XGBoost

The Section below initiate the XGBoost model, we are predicting the continous value, so we use XGBoost Regression, instead of XGBoost Classification
- n_estimator = 1000. This value specifies the number of boosting round or tree to be built in the model
- learning_rate = 0.05. This parameter control the step size at each itation while moving towards a minimum of the loss function 

In [28]:
XGB_model = XGBR(n_estimator = 1000, learning_rate= 0.05)
XGB_model.fit(X_train,y_train, eval_set = [(X_test,y_test)], verbose = False)
XGB_prediction = XGB_model.predict(X_test)
#Cross Validation 
cv_score_XGB = cross_val_score(XGB_model, X, y, scoring = 'neg_mean_squared_error', cv = 3)
cv_score_XGB = np.abs(np.mean(cv_score))

XGB_RMSE = np.sqrt(mean_squared_error(y_test,XGB_prediction))

print('XGB Results')
print('RMSE', XGB_RMSE)
print('CV Score: ', np.sqrt(cv_score_XGB))

XGB Results
RMSE 3526.036886755855
CV Score:  4632.738083476819


# Random Forest 
The section below fit the Random forest machine learning model. Similar to XGBoost, random forest also an ensemble learning, which combine multiple decision trees. but random forest build trees independently and aggregates their result

In [23]:
random_forest_model = RandomForestRegressor(n_jobs = 1)
random_forest_model.fit(X_train,y_train)
random_forest_prediction = random_forest_model.predict(X_test)
#Cross Validation 
cv_score_random_forest = cross_val_score(random_forest_model, X, y, scoring = 'neg_mean_squared_error', cv = 3)
cv_score_random_forest = np.abs(np.mean(cv_score))

random_forest_RMSE = np.sqrt(mean_squared_error(y_test,random_forest_prediction))

print('Random Forest Result')
print('RMSE', random_forest_RMSE)
print('CV Score: ', np.sqrt(cv_score_random_forest))

Random Forest Result
RMSE 3738.107628099742
CV Score:  4632.738083476819


In [29]:
evaluation = {
    'Model' : ['Base Model', 'XGBoost' , 'Random Forest'],
    'RMSE Score': [base_RMSE, XGB_RMSE, random_forest_RMSE],
    'CV Score': [cv_score, cv_score_XGB, cv_score_random_forest]
}

evaluation = pd.DataFrame(evaluation)
print(evaluation)

           Model   RMSE Score      CV Score
0     Base Model  4641.144105  2.146226e+07
1        XGBoost  3526.036887  2.146226e+07
2  Random Forest  3738.107628  2.146226e+07


# we can see the XGBoost provide the lowest , so it can become the best model among 3

In [27]:
output.to_csv("XGB_output.csv")