In [12]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline

# Load the Census dataset
data1 = pd.read_csv("test_results.csv")
data2 = pd.read_csv("user_table.csv")
data =pd.merge(data1, data2, on='user_id', how='left')
data = data.drop(['Unnamed: 0'], 1)
# Success - Display the first record
print (data.shape)
display(data.head(n=10))

(316800, 12)


Unnamed: 0,user_id,timestamp,source,device,operative_system,test,price,converted,city,country,lat,long
0,604839,2015-05-08 03:38:34,ads_facebook,mobile,iOS,0,39.0,0,Buffalo,USA,42.89,-78.86
1,624057,2015-05-10 21:08:46,seo-google,mobile,android,0,39.0,0,Lakeville,USA,44.68,-93.24
2,317970,2015-04-04 15:01:23,ads-bing,mobile,android,0,39.0,0,Parma,USA,41.38,-81.73
3,685636,2015-05-07 07:26:01,direct_traffic,mobile,iOS,1,59.0,0,Fayetteville,USA,35.07,-78.9
4,820854,2015-05-24 11:04:40,ads_facebook,web,mac,0,39.0,0,Fishers,USA,39.95,-86.02
5,169971,2015-04-13 12:07:08,ads-google,mobile,iOS,0,39.0,0,New York,USA,40.67,-73.94
6,600150,2015-03-04 14:45:44,seo_facebook,web,windows,0,39.0,0,,,,
7,798371,2015-03-15 08:19:29,ads-bing,mobile,android,1,59.0,1,East Orange,USA,40.77,-74.21
8,447194,2015-03-28 12:28:10,ads_facebook,web,windows,1,59.0,0,Dayton,USA,39.78,-84.2
9,431639,2015-04-24 12:42:18,ads_facebook,web,windows,1,59.0,0,Richmond,USA,37.53,-77.47


In [13]:
data.describe()

Unnamed: 0,user_id,test,price,converted,lat,long
count,316800.0,316800.0,316800.0,316800.0,275616.0,275616.0
mean,499281.34184,0.360079,6457590.0,0.018333,37.11168,-93.981772
std,288591.154044,0.480024,67973070.0,0.134154,5.209627,18.086486
min,3.0,0.0,39.0,0.0,19.7,-157.8
25%,249525.75,0.0,39.0,0.0,33.66,-112.2
50%,499021.5,0.0,39.0,0.0,37.74,-88.93
75%,749025.5,1.0,59.0,0.0,40.7,-78.91
max,1000000.0,1.0,999884300.0,1.0,61.18,30.31


### Changing lat,long data types for EDA¶

### Creating two separate dataframes for test & control¶

We have collected ~0.3MM data and control / test split is ~66% / 33%

In [15]:
data_test = data[data['test']==1]
data_control = data[data['test']==0]
print (' Test length: ', len(data_test),'\n','Control length: ', len(data_control)) # split between test & control

 Test length:  114073 
 Control length:  202727


### Performing t-test for conversion between test & training data sets

In [16]:
from scipy import stats
stats.ttest_ind(data_test['converted'], data_control['converted'])

Ttest_indResult(statistic=-8.78381266742153, pvalue=1.5878799805343215e-18)

In [17]:
print ('Test set coverted mean:', data_test['converted'].mean())
print ('Convert set converted mean:', data_control['converted'].mean())

Test set coverted mean: 0.015542678810936857
Convert set converted mean: 0.019903614220108817



The control set (control with $39 priced software) are converting at 1.99% while users in the test just at 1.55%. That’s a 22% drop. Which is be dramatic if it were true. The most likely reason for weird A/B test results are:

We didn’t collect enough data.
Some bias has been introduced in the experiment so that test/control people are not really random. In data science, whenever results appear too bad or too good to be true, they are not true. 

In [18]:
def data_cleaner(data):
    df_clean= data.copy()
    
    #print number of null cells and shape before removing NaN
    print('Number of null cells: ',data.isnull().sum().sum())
    print('Shape before removing NaN: ', data.shape)

    #eliminate any rows with Nan values
    df_clean = df_clean.dropna(axis=0)
    print('Shape after removing NaN: ', df_clean.shape)

    # removing any rows with filt_x=False
    #df_clean = df_clean[df_clean['filt_x']==True]

    #Feature engineering: drop unrelated features 
    df_clean = df_clean.drop(['user_id', 'timestamp', 'price'],axis=1)
    print('Shape after feature engineering: ', df_clean.shape)
    return df_clean

In [19]:
data_clean=data_cleaner(data)
display(data_clean.head(n=10))

Number of null cells:  185721
Shape before removing NaN:  (316800, 12)
Shape after removing NaN:  (257298, 12)
Shape after feature engineering:  (257298, 9)


Unnamed: 0,source,device,operative_system,test,converted,city,country,lat,long
0,ads_facebook,mobile,iOS,0,0,Buffalo,USA,42.89,-78.86
1,seo-google,mobile,android,0,0,Lakeville,USA,44.68,-93.24
2,ads-bing,mobile,android,0,0,Parma,USA,41.38,-81.73
3,direct_traffic,mobile,iOS,1,0,Fayetteville,USA,35.07,-78.9
4,ads_facebook,web,mac,0,0,Fishers,USA,39.95,-86.02
5,ads-google,mobile,iOS,0,0,New York,USA,40.67,-73.94
7,ads-bing,mobile,android,1,1,East Orange,USA,40.77,-74.21
8,ads_facebook,web,windows,1,0,Dayton,USA,39.78,-84.2
9,ads_facebook,web,windows,1,0,Richmond,USA,37.53,-77.47
10,ads-google,web,windows,0,0,Bloomfield Township,USA,42.58,-83.27


In [20]:
features = data_clean.copy()
features = features.drop(['test'], axis=1)

    #Extract labels list
labels = data_clean ['test'].astype(int)
print(features.shape)
print(labels.shape)

(257298, 8)
(257298,)


In [23]:
features_dummy = pd.get_dummies(features)
#features_dummy = features_dummy.astype(np.float32)
print(features_dummy.shape)

(257298, 948)


In [28]:
from sklearn import cluster
from collections import Counter

def fit_dbscan(features_data):
    dbscan = cluster.DBSCAN(eps=5, min_samples=15)
    dbscan_labels = dbscan.fit_predict(features_data)
    return dbscan_labels

dbscan_labels = fit_dbscan (features_dummy.iloc[:10000])
print('dbscan_labels_ classes: ', Counter(dbscan_labels))

dbscan_labels_ classes:  Counter({0: 9955, -1: 28, 1: 17})
