In [277]:
# Importing necessary packages
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
# Machine learning libraries:# Machi 
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_predict

%matplotlib inline

In [209]:
# Reading takehome_users.csv data into dataframe
filename = 'D:\\Springboard_Capstone2\\relax_challenge\\takehome_users.csv'
takehome_users=pd.read_csv(filename, encoding='latin-1')
takehome_users = takehome_users.dropna()
takehome_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [210]:
#Checking for missing values
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(takehome_users.isnull().sum())

object_id                     0
creation_time                 0
name                          0
email                         0
creation_source               0
last_session_creation_time    0
opted_in_to_mailing_list      0
enabled_for_marketing_drip    0
org_id                        0
invited_by_user_id            0
dtype: int64


In [211]:
# Reading takehome_user_engagement.csv data into dataframe
filename = 'D:\\Springboard_Capstone2\\relax_challenge\\takehome_user_engagement.csv'
takehome_user_engagement=pd.read_csv(filename, encoding='latin-1', parse_dates=['time_stamp'])
takehome_user_engagement.dropna()
takehome_user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [212]:
# Parsing by Week
takehome_user_engagement['wk'] = takehome_user_engagement['time_stamp'].dt.week

In [213]:
#Calculating the Frequency of logging every user 
takehome_user_engagement = takehome_user_engagement.groupby(['wk','user_id'])['time_stamp'].count().reset_index(name="freq")

In [214]:
# Setting user frequency criterion to 3
result = takehome_user_engagement[takehome_user_engagement.freq >= 3]

In [215]:
# only user ids matching criteria
user_ids = result.user_id.tolist()
user_ids = list(set(user_ids))

In [216]:
# Checking the length of the users who logged in on three separate
# days in at least one sevenday period
len(user_ids)

1445

In [217]:
#Converting the list into a DataFrame
df = pd.DataFrame(user_ids,columns=['object_id'])
df.head()

Unnamed: 0,object_id
0,8192
1,2
2,8196
3,10
4,20


##### Introducing Label column based on the criterion

In [266]:
takehome_users['Label']= df.isin(takehome_users.object_id)
takehome_users = takehome_users.dropna()
takehome_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,Label
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,True
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,False
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


In [287]:
#Preparing the Training and Testing data
Features = ['creation_source','enabled_for_marketing_drip','opted_in_to_mailing_list']
Label = ['Label']

X = takehome_users[Features]
y = takehome_users[Label]

In [278]:
X = pd.get_dummies(X)

### Applyting Logistic Regression Classification to my Training Data 

In [270]:
# Splitting the data into Training and Testing set
from sklearn.model_selection import train_test_split

In [271]:
# Splitting the Dataset into Training and Testing set
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [272]:
# train a logistic regression model on the training set
from sklearn.ensemble import RandomForestClassifier

# instantiate model
rf = RandomForestClassifier()


In [273]:
# fit model
rf.fit(X_train, Y_train)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [274]:
# Predict the labels: pred
pred = rf.predict(X_test)

In [279]:
# Machine learning pipelines
pipe_RF = make_pipeline(RandomForestClassifier())

In [280]:
# Compute predicted y's (y_hat)
predicted_RF = cross_val_predict(pipe_RF, X, y, cv=3, n_jobs=-1)



In [281]:
# Classification tables

print("Random Forest:") 
print(classification_report(y, predicted_RF)) 
print('\n')

Random Forest:
             precision    recall  f1-score   support

      False       1.00      1.00      1.00       554
       True       0.00      0.00      0.00         1

avg / total       1.00      1.00      1.00       555





  'precision', 'predicted', average, warn_for)


In [None]:
# Print the name and gini importance of each feature
for feature in zip(feat_labels, rf.feature_importances_):
    print(feature)

In [282]:
from sklearn.feature_selection import SelectFromModel

In [283]:
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.15
sfm = SelectFromModel(rf, threshold=0.15)

# Train the selector
sfm.fit(X_train, Y_train)


  self.estimator_.fit(X, y, **fit_params)


SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        norm_order=1, prefit=False, threshold=0.15)

In [288]:
Features = takehome_users[Features]

In [290]:
# Print the name and gini importance of each feature
for feature in zip(Features, rf.feature_importances_):
    print(feature)

('creation_source', 0.020305590414704)
('enabled_for_marketing_drip', 0.15807768187555624)
('opted_in_to_mailing_list', 0.08357138433003244)


#### As per the results from my classifier , I feel the users who are on Regular marketing Dip are most likely to be underthe category of Adopted user.