In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_columns', None)
%config IPCompleter.greedy=True

In [None]:
# Obtain and combine database

leads = pd.read_csv('/Users/steven.w/Downloads/Final_Projects/leads.csv')
opps = pd.read_csv('/Users/steven.w/Downloads/Final_Projects/opps.csv')
opps['Opportunity'].fillna(value=1, inplace=True)
data = pd.concat([opps, leads])
data.drop_duplicates(subset=['Id'])
df = data.iloc[:10000]
df.to_csv('machine_learning_data.csv')

### Machine Learning Part

In [69]:
data = pd.read_csv('machine_learning_data.csv')

In [71]:
# Convert the SFDC Campaigns to yes or no
data['SFDC Campaigns'].fillna(value=0, inplace=True)
data['From SFDC Campaigns'] = np.where(data['SFDC Campaigns'] == 0, 0, 1)

In [72]:
# Convert 'City of Event' to yes or no
data['City of Event'].fillna(value=0, inplace=True)
data['Attended Event'] = np.where(data['City of Event'] == 0, 0, 1)

In [73]:
# convert birth date to age
time_value = pd.to_datetime(data['Birth Date'], format='%Y-%m-%d')
time_value = pd.DatetimeIndex(time_value)
data['Age'] = 2020 - time_value.year
data['Age'].fillna(value=data['Age'].mean(), inplace=True)

In [93]:
# clearn all the features we need for machine learening
data['Unsubscribed'].fillna(value=0, inplace=True)
data['Person Score'].fillna(value=0, inplace=True)
data['Behavior Score'].fillna(value=0, inplace=True)
data['Media SubGroup'].fillna(value=0, inplace=True)
data['Address Country'].fillna(value=0, inplace=True)
data['Primary Program'].fillna(value=0, inplace=True)
data['Engagement'].fillna(value=0, inplace=True)
data['Opportunity'].fillna(value=0, inplace=True)

In [94]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                10000 non-null  int64  
 1   Id                        10000 non-null  int64  
 2   Person Status             6745 non-null   object 
 3   Person Score              10000 non-null  float64
 4   SFDC Type                 10000 non-null  object 
 5   Application Status        244 non-null    object 
 6   Behavior Score            10000 non-null  float64
 7   Birth Date                3626 non-null   object 
 8   Citizenship Status        310 non-null    object 
 9   City of Event             10000 non-null  object 
 10  Source                    232 non-null    object 
 11  Media Group               10000 non-null  object 
 12  Media SubGroup            10000 non-null  object 
 13  Opportunity               10000 non-null  float64
 14  Address

In [243]:
# Prepare the dataframe for machine learning (pick up different features)
df = data[['Media SubGroup', 'Primary Program', 'Unsubscribed', 'Attended Event', 'Opportunity']]

In [244]:
y = df['Opportunity']

x = df.drop(axis=1, columns=['Opportunity'])

In [245]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [246]:
# # Standardization
# std = StandardScaler()
# x_train[['Person Score', 'Behavior Score', 'Engagement']] = std.fit_transform(x_train[['Person Score', 'Behavior Score', 'Engagement']])
# x_test[['Person Score', 'Behavior Score', 'Engagement']] = std.transform(x_test[['Person Score', 'Behavior Score', 'Engagement']])

In [247]:
# # Scaler
# mm = MinMaxScaler()
# x_train[['Person Score', 'Behavior Score', 'Engagement']] = mm.fit_transform(x_train[['Person Score', 'Behavior Score', 'Engagement']])
# x_test[['Person Score', 'Behavior Score', 'Engagement']] = mm.transform(x_test[['Person Score', 'Behavior Score', 'Engagement']])

In [248]:
# x_train['Age'] = x_train['Age'].values.reshape(-1, 1)
# x_test['Age'] = x_test['Age'].values.reshape(-1, 1)

In [249]:
# # Using PCA decrease the features
# pca = PCA(n_components=0.9)
# x_train = pca.fit_transform(x_train)
# x_test = pca.fit_transform(x_test)

In [250]:
dict = DictVectorizer(sparse=False)
x_train = dict.fit_transform(x_train.to_dict(orient='records'))
x_test = dict.transform(x_test.to_dict(orient='records'))

### Using KNeighbors 

In [251]:
# use knn model
knn = KNeighborsClassifier()

knn.fit(x_train, y_train)
knn.score(x_test, y_test)

# y_predict = knn.predict(x_test)
# y_predict

0.8948

In [252]:
# using GridSearchCV 
param = {'n_neighbors': [5, 10, 50, 100, 500]}
gc = GridSearchCV(knn, param_grid=param, cv=2)
gc.fit(x_train, y_train)
gc.score(x_test, y_test)

0.8896

In [253]:
gc.best_params_
# gc.best_score_
# gc.best_estimator_
# gc.cv_results_

{'n_neighbors': 10}

### Using naive bayes classifier

In [254]:
# tf = TfidfVectorizer()
# mlt = MultinomialNB(alpha=1.0)
# mlt.fit(x_train, y_train)

# Not a good idea

### Using Decision Tree

In [255]:
print(dict.get_feature_names())
x_train

['Attended Event', 'Media SubGroup', 'Media SubGroup=680 News', 'Media SubGroup=Academy Canada', 'Media SubGroup=Academy of Design', 'Media SubGroup=Academy of Design Sign', 'Media SubGroup=Admissions Advisor', 'Media SubGroup=All Star Directories', 'Media SubGroup=Alumni', 'Media SubGroup=American Life News', 'Media SubGroup=Ashley', 'Media SubGroup=At Work Immigration', 'Media SubGroup=Bee Line Web', 'Media SubGroup=Bing', 'Media SubGroup=Bing Ad', 'Media SubGroup=Bing Ad Brand', 'Media SubGroup=Blog', 'Media SubGroup=Brazil Tour', 'Media SubGroup=Brochure', 'Media SubGroup=CCO', 'Media SubGroup=CHFI', 'Media SubGroup=CHOM', 'Media SubGroup=CHUM', 'Media SubGroup=Canadian Military Magazine', 'Media SubGroup=Capital English Solutions', 'Media SubGroup=Career Fair', 'Media SubGroup=China Tour 2015', 'Media SubGroup=China Tour Beijing CEE Fair', 'Media SubGroup=Counselor.', 'Media SubGroup=E! Talk', 'Media SubGroup=Edge', 'Media SubGroup=Edge Interactive', 'Media SubGroup=Edge102.1', 'M

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [256]:
dec = DecisionTreeClassifier()

In [257]:
dec.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [258]:
dec.score(x_test, y_test)

0.898

In [259]:
# export a tree to have a look
# convert the dot to png : dot -Tpng tree.dot -o tree.png
export_graphviz(dec, out_file='tree.dot')

### Using Random Forest

In [260]:
rf = RandomForestClassifier()

In [261]:
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

0.9028

In [262]:
# using GridSearchCV to evalue the result
param = {"n_estimators": [120, 200, 300, 500, 800, 1200], 'max_depth': [5, 8, 15, 25, 30]}
GC = GridSearchCV(rf, param_grid=param, cv=2)

In [263]:
GC.fit(x_train, y_train)
GC.score(x_test, y_test)

0.9056

In [265]:
GC.best_params_

{'max_depth': 30, 'n_estimators': 1200}