# Data Science Challenge

In [33]:
# If you'd like to install packages that aren't installed by default, uncomment the last two lines of this cell and replace <package list> with a list of your packages.
# This will ensure your notebook has all the dependencies and works everywhere

#import sys
#!{sys.executable} -m pip install <package list>

In [34]:
#Libraries
import pandas as pd, numpy as np
pd.set_option("display.max_columns", 101)

## Data Description

Column | Description
:---|:---
`id` | Unique id
`Daily Time Spent on Site` | Consumer time spent on site (in minutes)
`Age` | Consumer age (in years)
`Area Income` | Avg. Income of geographical area of consumer (in US $$)
`Daily Internet Usage` | Avg. minutes a day consumer is on the internet
`Ad Topic Line` | Headline of the advertisement
`gender` | Gender of the consumer
`Country` | Country of consumer
`Timestamp` | Timestamp at which consumer clicked on Ad or closed window (YYYY-MM-DD HH:MM:SS)
`Clicked` | Whether a consumer clicked on the advert or not (0: No ,1: Yes)

## Data Wrangling & Visualization

In [35]:
# Dataset is already loaded below
data = pd.read_csv("train.csv")

In [36]:
data.head()

Unnamed: 0,id,Timestamp,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,gender,Country,Clicked
0,1200,2016-01-01 02:52:10,80.67,34,58909.36,239.76,Seamless impactful info-mediaries,0,Portugal,0
1,1201,2016-01-01 03:35:35,68.01,25,68357.96,188.32,Ameliorated actuating workforce,1,Afghanistan,0
2,1202,2016-01-01 05:31:22,80.94,36,60803.0,239.94,Self-enabling local strategy,0,Bermuda,0
3,1203,2016-01-01 08:27:06,78.77,28,63497.62,211.83,Public-key intangible Graphical User Interface,0,Guam,0
4,1204,2016-01-01 15:14:24,36.56,29,42838.29,195.89,Team-oriented bi-directional secured line,0,Uganda,1


In [37]:
#Explore columns
data.columns

Index(['id', 'Timestamp', 'Daily Time Spent on Site', 'Age', 'Area Income',
       'Daily Internet Usage', 'Ad Topic Line', 'gender', 'Country',
       'Clicked'],
      dtype='object')

In [38]:
#Description
data.describe()

Unnamed: 0,id,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,gender,Clicked
count,1000.0,1000.0,1000.0,775.0,1000.0,1000.0,1000.0
mean,1699.5,65.12065,35.816,55343.014555,179.59231,0.474,0.503
std,288.819436,15.781456,8.782669,13182.578667,43.735785,0.499573,0.500241
min,1200.0,32.6,19.0,14548.06,105.0,0.0,0.0
25%,1449.75,51.45,29.0,48270.62,137.885,0.0,0.0
50%,1699.5,68.39,35.0,57737.51,182.425,0.0,1.0
75%,1949.25,78.5725,41.25,65207.185,217.7075,1.0,1.0
max,2199.0,91.37,61.0,79484.8,269.96,1.0,1.0


## Preprocessing

In [41]:
# Explore values types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
id                          1000 non-null int64
Timestamp                   1000 non-null object
Daily Time Spent on Site    1000 non-null float64
Age                         1000 non-null int64
Area Income                 775 non-null float64
Daily Internet Usage        1000 non-null float64
Ad Topic Line               1000 non-null object
gender                      1000 non-null int64
Country                     1000 non-null object
Clicked                     1000 non-null int64
dtypes: float64(3), int64(4), object(3)
memory usage: 78.2+ KB


In [42]:
# Convert timestamps to unix format

# Create function to convert the format 
def timestamp_to_unix(time_stamp):
    import time
    import datetime
    
    reformat = datetime.datetime.strptime(time_stamp,'%Y-%m-%d %H:%M:%S')
    unix_format = int(datetime.datetime.timestamp(reformat))
    
    return(unix_format)

In [43]:
data['Timestamp'] = data['Timestamp'].apply(timestamp_to_unix)
data.head()

Unnamed: 0,id,Timestamp,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,gender,Country,Clicked
0,1200,1451616730,80.67,34,58909.36,239.76,Seamless impactful info-mediaries,0,Portugal,0
1,1201,1451619335,68.01,25,68357.96,188.32,Ameliorated actuating workforce,1,Afghanistan,0
2,1202,1451626282,80.94,36,60803.0,239.94,Self-enabling local strategy,0,Bermuda,0
3,1203,1451636826,78.77,28,63497.62,211.83,Public-key intangible Graphical User Interface,0,Guam,0
4,1204,1451661264,36.56,29,42838.29,195.89,Team-oriented bi-directional secured line,0,Uganda,1


## Visualization, Modeling, Machine Learning

Predict which users might click a particular advertisement.
Please explain the findings effectively to technical and non-technical audiences using comments and visualizations, if appropriate.
- **Build an optimized model that effectively solves the business problem.**
- **The model's performance will be evaluated on the basis of accuracy.**
- **Read the test.csv file and prepare features for testing.**

In [67]:
# Preparing the data for training.

X = data.iloc[:, 0:9]
y = data.iloc[:, 9]

In [68]:
X.head()

Unnamed: 0,id,Timestamp,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,gender,Country
0,1200,1451616730,80.67,34,58909.36,239.76,Seamless impactful info-mediaries,0,Portugal
1,1201,1451619335,68.01,25,68357.96,188.32,Ameliorated actuating workforce,1,Afghanistan
2,1202,1451626282,80.94,36,60803.0,239.94,Self-enabling local strategy,0,Bermuda
3,1203,1451636826,78.77,28,63497.62,211.83,Public-key intangible Graphical User Interface,0,Guam
4,1204,1451661264,36.56,29,42838.29,195.89,Team-oriented bi-directional secured line,0,Uganda


In [69]:
y.head()

Unnamed: 0,Clicked
0,0
1,0
2,0
3,0
4,1


In [73]:
X = X.values.tolist()

In [76]:
# Training the decision tree classifier

from sklearn import tree
classifier = tree.DecisionTreeClassifier()
classifier = classifier.fit(X, y)

ValueError: could not convert string to float: 'Seamless impactful info-mediaries'

In [47]:
#Loading Test data
test_data=pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,id,Timestamp,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,gender,Country
0,2200,2016-06-04 17:24:07,43.88,54,31523.09,166.85,Enhanced methodical database,1,Somalia
1,2201,2016-06-05 00:29:13,66.4,40,77567.85,214.42,Digitized heuristic solution,0,Antigua and Barbuda
2,2202,2016-06-05 00:29:13,79.52,34,,141.58,Customer-focused 24/7 concept,0,United States of America
3,2203,2016-06-05 07:54:30,52.35,25,33293.78,147.61,Mandatory 4thgeneration structure,1,Mali
4,2204,2016-06-05 07:54:30,81.51,36,,195.93,Monitored local Internet solution,0,Croatia


In [48]:
# Convert the timestamps in test data.

test_data['Timestamp'] = test_data['Timestamp'].apply(timestamp_to_unix)
test_data.head()

Unnamed: 0,id,Timestamp,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,gender,Country
0,2200,1465061047,43.88,54,31523.09,166.85,Enhanced methodical database,1,Somalia
1,2201,1465086553,66.4,40,77567.85,214.42,Digitized heuristic solution,0,Antigua and Barbuda
2,2202,1465086553,79.52,34,,141.58,Customer-focused 24/7 concept,0,United States of America
3,2203,1465113270,52.35,25,33293.78,147.61,Mandatory 4thgeneration structure,1,Mali
4,2204,1465113270,81.51,36,,195.93,Monitored local Internet solution,0,Croatia




**Describe the most important features in the model to a management audience.**

> #### Task:
- **Visualize the top 10 features and their feature importance.**


> #### Task:
- **Submit the predictions on the test dataset using your optimized model** <br/>
    For each record in the test set (`test.csv`), predict whether a consumer clicked on an ad or not. Submit a CSV file with a header row and one row per test entry. 
    
The file (`submissions.csv`) should have exactly 2 columns:
   - **id**
   - **Clicked**

In [None]:
#Submission
submission_df.to_csv('submissions.csv',index=False)

---