# Table of contents

[1. Introduction](#Introduction)

[2. Obtaining the data](#Obtaining-the-data)

[3. Creating and saving models](#Creating-and-saving-models)

In [10]:
# Importing the required libraries
import pandas as pd
pd.set_option('display.max_columns', 50) # Display up to 50 columns at a time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm
plt.style.use('seaborn')
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12,5
import glob # To read all csv files in the directory
import seaborn as sns
import calendar
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support
import itertools
import time
import xgboost as xgb

In [14]:
# Read the first observations/labels pair from the processed datasets.
# index_col=0 makes the first column of each CSV the DataFrame index
# instead of a regular data column.
X = pd.read_csv(
    'processed_datasets/observations1.csv',
    index_col=0,
)
y = pd.read_csv(
    'processed_datasets/labels1.csv',
    index_col=0,
)

In [15]:
# Show the first five rows of the observations as the cell output
X.head(n=5)

Unnamed: 0,blurb_length,usd_goal,name_length,creation_to_launch_days,campaign_days,category_art,category_comics,category_crafts,category_dance,category_design,category_fashion,category_film & video,category_food,category_games,category_journalism,category_music,category_photography,category_publishing,category_technology,category_theater,country_AT,country_AU,country_BE,country_CA,country_CH,...,deadline_month_September,launch_time_10am-12pm,launch_time_10pm-12am,launch_time_12am-2am,launch_time_12pm-2pm,launch_time_2am-4am,launch_time_2pm-4pm,launch_time_4am-6am,launch_time_4pm-6pm,launch_time_6am-8am,launch_time_6pm-8pm,launch_time_8am-10am,launch_time_8pm-10pm,deadline_time_10am-12pm,deadline_time_10pm-12am,deadline_time_12am-2am,deadline_time_12pm-2pm,deadline_time_2am-4am,deadline_time_2pm-4pm,deadline_time_4am-6am,deadline_time_4pm-6pm,deadline_time_6am-8am,deadline_time_6pm-8pm,deadline_time_8am-10am,deadline_time_8pm-10pm
0,0.599954,-0.863064,-0.36913,0.046185,-0.224741,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,3.839393,-0.14929,-0.412689,-0.186654,-0.334418,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,3.338081,-0.208348,-0.338527,-0.298293,-0.275938,-0.259576,-0.367145,-0.235875,2.445936,-0.203413,-0.387776,-0.191523,-0.368327,-0.196596,-0.343587,-0.307077,-0.245277,-0.317588,-0.324195,-0.31305,2.66237,-0.24725,-0.36068,-0.180051,-0.352531
1,-0.804977,0.694414,0.596395,1.329281,-0.224741,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,-0.260458,-0.14929,2.423134,-0.186654,-0.334418,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,-0.299573,-0.208348,-0.338527,-0.298293,-0.275938,-0.259576,-0.367145,4.23953,-0.408841,-0.203413,-0.387776,-0.191523,-0.368327,-0.196596,-0.343587,-0.307077,-0.245277,-0.317588,-0.324195,3.194373,-0.375605,-0.24725,-0.36068,-0.180051,-0.352531
2,0.800659,0.573727,0.826781,-0.134553,2.325957,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,3.839393,-0.14929,-0.412689,-0.186654,-0.334418,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,-0.299573,-0.208348,-0.338527,-0.298293,-0.275938,-0.259576,-0.367145,-0.235875,-0.408841,-0.203413,2.57881,-0.191523,-0.368327,-0.196596,-0.343587,-0.307077,-0.245277,-0.317588,-0.324195,-0.31305,2.66237,-0.24725,-0.36068,-0.180051,-0.352531
3,1.001363,-0.2709,0.596395,-0.220362,-0.224741,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,-0.260458,-0.14929,-0.412689,-0.186654,2.990267,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,-0.299573,-0.208348,-0.338527,-0.298293,-0.275938,-0.259576,-0.367145,-0.235875,-0.408841,-0.203413,-0.387776,-0.191523,2.714977,-0.196596,-0.343587,-0.307077,-0.245277,-0.317588,-0.324195,-0.31305,-0.375605,-0.24725,-0.36068,-0.180051,2.83663
4,-0.202864,-0.142564,-0.36913,1.195768,-0.224741,-0.34594,-0.19773,-0.182523,-0.134647,-0.189923,-0.235358,-0.40577,-0.292264,-0.260458,-0.14929,2.423134,-0.186654,-0.334418,-0.34294,-0.168885,-0.047035,-0.152409,-0.050568,-0.217684,-0.055075,...,-0.299573,-0.208348,-0.338527,-0.298293,-0.275938,3.852439,-0.367145,-0.235875,-0.408841,-0.203413,-0.387776,-0.191523,-0.368327,-0.196596,-0.343587,-0.307077,-0.245277,3.148737,-0.324195,-0.31305,-0.375605,-0.24725,-0.36068,-0.180051,-0.352531


In [16]:
# Show the first five rows of the labels as the cell output
y.head(n=5)

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,1


In [17]:
# Loading the datasets
def load_dataset(number, data_dir='processed_datasets'):
    """Read one numbered observations/labels pair from CSV.

    Parameters
    ----------
    number : int or str
        Suffix of the files to read, i.e. ``observations<number>.csv`` and
        ``labels<number>.csv``.
    data_dir : str, default 'processed_datasets'
        Directory that contains the CSV files.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The observations and the labels, in that order. The first column of
        each CSV is used as the index (``index_col=0``).
    """
    observations = pd.read_csv('{}/observations{}.csv'.format(data_dir, number), index_col=0)
    labels = pd.read_csv('{}/labels{}.csv'.format(data_dir, number), index_col=0)
    return observations, labels


# One call per dataset instead of three copy-pasted pairs of read_csv calls
X1, y1 = load_dataset(1)
X2, y2 = load_dataset(2)
X3, y3 = load_dataset(3)