# Table of contents

[1. Introduction](#Introduction)

[2. Obtaining the data](#Obtaining-the-data)

[3. Creating and saving models](#Creating-and-saving-models)

In [1]:
# Importing the required libraries
import pandas as pd
pd.set_option('display.max_columns', 50) # Display up to 50 columns at a time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm
plt.style.use('seaborn')
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12,5
import glob # To read all csv files in the directory
import seaborn as sns
import calendar
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support
import itertools
import time
import xgboost as xgb
import pickle as pk
from sklearn.externals import joblib

In [10]:
# Load the three pre-processed splits: observations (X*) and their labels (y*).
# index_col=0 makes the first CSV column the row index of each frame.
X1, y1 = (
    pd.read_csv('processed_datasets/observations1.csv', index_col=0),
    pd.read_csv('processed_datasets/labels1.csv', index_col=0),
)
X2, y2 = (
    pd.read_csv('processed_datasets/observations2.csv', index_col=0),
    pd.read_csv('processed_datasets/labels2.csv', index_col=0),
)
X3, y3 = (
    pd.read_csv('processed_datasets/observations3.csv', index_col=0),
    pd.read_csv('processed_datasets/labels3.csv', index_col=0),
)

In [3]:
# Function to train and save models
def train_and_save_model(model, X, y, model_name):
    """Fit ``model`` on ``(X, y)`` and pickle it into the ``zero_level_models`` folder.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)``. It is fitted in place.
    X : array-like
        Training observations, handed to ``model.fit`` unchanged.
    y : array-like
        Training labels. A 2-D input with a single column (for example a
        one-column DataFrame) is flattened to 1-D before fitting, which is the
        shape scikit-learn classifiers expect; any other shape is passed
        through untouched.
    model_name : str
        File stem. The fitted model is written to
        ``zero_level_models/<model_name>.txt``. Despite the ``.txt`` suffix the
        file is a binary pickle.

    Returns
    -------
    None
    """
    import os  # local import: keeps this cell self-contained

    # Flatten column-vector labels so estimators do not warn on every fit.
    labels = np.asarray(y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        y = labels.ravel()

    filename = 'zero_level_models/{}.txt'.format(model_name)
    # Create the output folder before the (potentially long) training run so a
    # missing directory cannot waste a finished fit.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    model.fit(X, y)

    # Context manager guarantees the handle is flushed and closed.
    with open(filename, 'wb') as model_file:
        pk.dump(model, model_file)

In [5]:
# Function to create all models
def create_models(models_to_create, datasets):
    """Train and save every requested model on every dataset.

    ``models_to_create`` is an iterable of ``(model, number_of_copies)`` pairs
    and ``datasets`` an iterable of ``(X, y)`` pairs. Each model is handed to
    ``train_and_save_model`` ``number_of_copies`` times per dataset, under the
    name ``model<dataset>_<counter>``. Both numbers are 1-based and the counter
    restarts for each dataset.
    """
    for dataset_id, (X, y) in enumerate(datasets, start=1):
        saved_so_far = 0
        for model, copies in models_to_create:
            for _ in range(copies):
                saved_so_far += 1
                train_and_save_model(model, X, y, 'model{}_{}'.format(dataset_id, saved_so_far))

## Classifiers

In [6]:
# Logistic Regression
# The L1 penalty needs a solver that supports it. 'liblinear' does; naming it
# explicitly keeps this line working on scikit-learn releases whose default
# solver ('lbfgs') rejects penalty='l1'.
logreg = LogisticRegression(C=0.01, fit_intercept=True, penalty='l1', solver='liblinear')

# Random Forest
rf = RandomForestClassifier(min_samples_split=0.001, verbose=2, n_estimators=200, max_depth=35)

# XGBoost
# The estimator is stored under the name `xgb`, which shadows the `xgb` module
# alias. Importing the class directly keeps this cell re-runnable: going through
# `xgb.XGBClassifier` would fail the second time because `xgb` is then the
# estimator, not the module.
from xgboost import XGBClassifier
xgb = XGBClassifier(learning_rate=0.1, max_depth=35, min_child_weight=100, n_estimators=200, subsample=0.7)

# AdaBoost
adaboost = AdaBoostClassifier(learning_rate=0.3, n_estimators=300)

In [8]:
# Preview the first five rows of the first observation table
X1.head(n=5)

Unnamed: 0,blurb_length,usd_goal,name_length,creation_to_launch_days,campaign_days,category_art,category_comics,category_crafts,category_dance,category_design,category_fashion,category_film & video,category_food,category_games,category_journalism,category_music,category_photography,category_publishing,category_technology,category_theater,country_AT,country_AU,country_BE,country_CA,country_CH,...,deadline_month_September,launch_time_10am-12pm,launch_time_10pm-12am,launch_time_12am-2am,launch_time_12pm-2pm,launch_time_2am-4am,launch_time_2pm-4pm,launch_time_4am-6am,launch_time_4pm-6pm,launch_time_6am-8am,launch_time_6pm-8pm,launch_time_8am-10am,launch_time_8pm-10pm,deadline_time_10am-12pm,deadline_time_10pm-12am,deadline_time_12am-2am,deadline_time_12pm-2pm,deadline_time_2am-4am,deadline_time_2pm-4pm,deadline_time_4am-6am,deadline_time_4pm-6pm,deadline_time_6am-8am,deadline_time_6pm-8pm,deadline_time_8am-10am,deadline_time_8pm-10pm
0,0.599954,-0.863064,-0.36913,0.046185,-0.224741,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,3.839393,-0.14929,-0.412689,-0.186654,-0.334418,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,3.338081,-0.208348,-0.338527,-0.298293,-0.275938,-0.259576,-0.367145,-0.235875,2.445936,-0.203413,-0.387776,-0.191523,-0.368327,-0.196596,-0.343587,-0.307077,-0.245277,-0.317588,-0.324195,-0.31305,2.66237,-0.24725,-0.36068,-0.180051,-0.352531
1,-0.804977,0.694414,0.596395,1.329281,-0.224741,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,-0.260458,-0.14929,2.423134,-0.186654,-0.334418,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,-0.299573,-0.208348,-0.338527,-0.298293,-0.275938,-0.259576,-0.367145,4.23953,-0.408841,-0.203413,-0.387776,-0.191523,-0.368327,-0.196596,-0.343587,-0.307077,-0.245277,-0.317588,-0.324195,3.194373,-0.375605,-0.24725,-0.36068,-0.180051,-0.352531
2,0.800659,0.573727,0.826781,-0.134553,2.325957,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,3.839393,-0.14929,-0.412689,-0.186654,-0.334418,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,-0.299573,-0.208348,-0.338527,-0.298293,-0.275938,-0.259576,-0.367145,-0.235875,-0.408841,-0.203413,2.57881,-0.191523,-0.368327,-0.196596,-0.343587,-0.307077,-0.245277,-0.317588,-0.324195,-0.31305,2.66237,-0.24725,-0.36068,-0.180051,-0.352531
3,1.001363,-0.2709,0.596395,-0.220362,-0.224741,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,-0.260458,-0.14929,-0.412689,-0.186654,2.990267,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,-0.299573,-0.208348,-0.338527,-0.298293,-0.275938,-0.259576,-0.367145,-0.235875,-0.408841,-0.203413,-0.387776,-0.191523,2.714977,-0.196596,-0.343587,-0.307077,-0.245277,-0.317588,-0.324195,-0.31305,-0.375605,-0.24725,-0.36068,-0.180051,2.83663
4,-0.202864,-0.142564,-0.36913,1.195768,-0.224741,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,-0.260458,-0.14929,2.423134,-0.186654,-0.334418,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,-0.299573,-0.208348,-0.338527,-0.298293,-0.275938,3.852439,-0.367145,-0.235875,-0.408841,-0.203413,-0.387776,-0.191523,-0.368327,-0.196596,-0.343587,-0.307077,-0.245277,3.148737,-0.324195,-0.31305,-0.375605,-0.24725,-0.36068,-0.180051,-0.352531


In [9]:
# Preview the first five rows of the matching label table
y1.head(n=5)

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,1


In [10]:
# Sanity check: fit AdaBoost on the first split and report metrics on the second.
# The labels are loaded as a one-column DataFrame; flatten them to 1-D so
# scikit-learn does not emit a DataConversionWarning when fitting.
adaboost.fit(X1, y1.values.ravel())
y_pred = adaboost.predict(X2)
print(classification_report(y2, y_pred))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.76      0.60      0.67     24881
           1       0.73      0.85      0.78     31445

   micro avg       0.74      0.74      0.74     56326
   macro avg       0.74      0.72      0.73     56326
weighted avg       0.74      0.74      0.73     56326



In [13]:
# Load a saved model from disk and score it on the second split.
# The models are written with pickle (pk.dump), so they are read back with
# pickle as well; this also removes the dependency on the deprecated
# sklearn.externals.joblib shim.
# NOTE: unpickling executes arbitrary code - only load files this notebook produced.
with open('zero_level_models/model1_1.txt', 'rb') as model_file:
    loaded_model = pk.load(model_file)
result = loaded_model.score(X2, y2)
print(result)

0.7305862301601392


In [11]:
# (estimator, number of copies to train) for each zero-level model family
models_to_create = [
    (logreg, 5),
    (rf, 1),
    (xgb, 5),
    (adaboost, 5),
]
# (observations, labels) pairs the models are trained on
datasets = [
    (X1, y1),
    (X2, y2),
]

In [8]:
# Train and pickle every configured model on every dataset
create_models(models_to_create=models_to_create, datasets=datasets)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  after removing the cwd from sys.path.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 1 of 200
building tree 2 of 200
building tree 3 of 200
building tree 4 of 200
building tree 5 of 200
building tree 6 of 200
building tree 7 of 200
building tree 8 of 200
building tree 9 of 200
building tree 10 of 200
building tree 11 of 200
building tree 12 of 200
building tree 13 of 200
building tree 14 of 200
building tree 15 of 200
building tree 16 of 200
building tree 17 of 200
building tree 18 of 200
building tree 19 of 200
building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200
building tree 24 of 200
building tree 25 of 200
building tree 26 of 200
building tree 27 of 200
building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200
building tree 33 of 200
building tree 34 of 200
building tree 35 of 200
building tree 36 of 200
building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
b

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   14.3s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  after removing the cwd from sys.path.
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 1 of 200
building tree 2 of 200
building tree 3 of 200
building tree 4 of 200
building tree 5 of 200
building tree 6 of 200
building tree 7 of 200
building tree 8 of 200
building tree 9 of 200
building tree 10 of 200
building tree 11 of 200
building tree 12 of 200
building tree 13 of 200
building tree 14 of 200
building tree 15 of 200
building tree 16 of 200
building tree 17 of 200
building tree 18 of 200
building tree 19 of 200
building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200
building tree 24 of 200
building tree 25 of 200
building tree 26 of 200
building tree 27 of 200
building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200
building tree 33 of 200
building tree 34 of 200
building tree 35 of 200
building tree 36 of 200
building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
b

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   13.6s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [5]:
# Here are the needed resources
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import load_model

In [27]:
def train_and_save_neural_net_model(X, y, model_name, epochs=10, batch_size=64):
    """Train a dense softmax classifier on ``(X, y)`` and save it to disk.

    Parameters
    ----------
    X : 2-D array-like
        Training observations; the number of columns sets the input width.
    y : 2-D array-like
        Class labels with one column. They are one-hot encoded here, and the
        number of distinct classes sets the width of the softmax output layer.
    model_name : str
        File stem inside ``zero_level_models``. ``.h5`` is appended when the
        name carries no recognised model extension, so the saved file matches
        the ``<name>.h5`` path the evaluation cell loads.
    epochs : int, optional
        Number of training epochs (default 10).
    batch_size : int, optional
        Mini-batch size (default 64).

    Returns
    -------
    None
    """
    import os  # local import: keeps this cell self-contained

    # Resolve the output path first so a bad location fails before training.
    filename = 'zero_level_models/{}'.format(model_name)
    if not filename.endswith(('.h5', '.hdf5', '.keras')):
        filename += '.h5'
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # One-hot encode the labels. The encoder returns a SciPy sparse matrix by
    # default, so densify it before handing it to Keras.
    ohe = OneHotEncoder(categories='auto')
    y_onehot = ohe.fit_transform(y).toarray()

    # Layer widths derived from the data instead of hard-coded values.
    n_features = X.shape[1]
    n_classes = y_onehot.shape[1]

    # Neural network: input dropout, four hidden ReLU layers, softmax output.
    model = Sequential()
    model.add(Dropout(0.2, input_shape=(n_features,)))
    model.add(Dense(1000, activation='relu'))
    model.add(Dense(1000, activation='relu'))
    model.add(Dense(1000, activation='relu'))
    model.add(Dense(500, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Training
    model.fit(X, y_onehot, epochs=epochs, batch_size=batch_size)
    model.save(filename)

In [28]:
# Function to create Neural Networks
def create_neural_networks_models(number_of_models_to_create, datasets):
    """Train and save ``number_of_models_to_create`` networks per dataset.

    ``datasets`` is an iterable of ``(X, y)`` pairs. Every network is handed to
    ``train_and_save_neural_net_model`` under the name
    ``neural_net_model<dataset>_<counter>``, both numbers being 1-based.
    """
    for dataset_id, (X, y) in enumerate(datasets, start=1):
        for model_id in range(1, number_of_models_to_create + 1):
            model_name = 'neural_net_model{}_{}'.format(dataset_id, model_id)
            train_and_save_neural_net_model(X, y, model_name)

In [29]:
# Train five neural networks on each of the first two splits
datasets = [
    (X1, y1),
    (X2, y2),
]
create_neural_networks_models(number_of_models_to_create=5, datasets=datasets)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Load the first saved network and evaluate it on the second split.
model2 = load_model('zero_level_models/neural_net_model1_1.h5')

# One-hot encode the labels into their own variable. Overwriting y2 would make
# this cell fail on a second run (it would re-encode already encoded labels)
# and would hide the raw labels from every other cell that uses y2.
ohe = OneHotEncoder(categories='auto')
y_test = ohe.fit_transform(y2).toarray()

pred_test = model2.predict(X2)

model2.evaluate(X2, y_test)

# Converting predicted probabilities and one-hot test labels to class indices
pred = list(np.argmax(pred_test, axis=1))
test = list(np.argmax(y_test, axis=1))

# Accuracy and macro-averaged F1 (scikit-learn metrics take y_true first)
a = accuracy_score(test, pred)
print('Accuracy is:', a*100)
print(f1_score(test, pred, average="macro"))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Accuracy is: 72.99826012853744
0.7137349411536646
