##### importing libraries

In [54]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import math
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from collections import Counter
import scipy.stats as ss
from sklearn import preprocessing,model_selection
from tensorflow import keras
import tensorflow as tf
from keras.utils import np_utils
from sklearn.svm import LinearSVC
import datetime

In [55]:
df = pd.read_csv("ml_assignment_data_train.csv")

#### preprocessing the data 
changing categorical variables to numerics

In [56]:
le = preprocessing.LabelEncoder()
df['category_id'] = le.fit_transform(df.category.astype(str))
df['main_category_id'] = le.fit_transform(df.main_category.astype(str))
df['currency_id'] = le.fit_transform(df.currency.astype(str))
df['country_id'] = le.fit_transform(df.country.astype(str))
df['new_state_id'] = le.fit_transform(df.new_state.astype(str))

df['log_usd_goal_real'] = np.log(df['usd_goal_real']+1)
df['log_goal'] = np.log(df['goal']+1)

#dealing with date type
df['launched_date'], df['launched_hour'] = df['launched'].str.split(' ',2).str
df['launched_date'] = pd.to_datetime(df['launched_date'])
df['deadline'] = pd.to_datetime(df['deadline'])
# difference between deadline and lunched
df['time_interval'] = df['launched_date'] - df['deadline']
# changing that to int
df['time_interval'] = df['time_interval'].dt.days
#extracting yead and month and day from deadline to see if there are seasonal effect 
df['deadline_year'] = df['deadline'].dt.year
df['deadline_month'] = df['deadline'].dt.month
df['deadline_day'] = df['deadline'].dt.day

### Linear SVM

In [52]:
# preparing features and labels for SVM. The difference is that the feature shouldn't change to a dummy variable  
X = df[['main_category_id', 'category_id', 'backers','country_id','deadline_day', 'deadline_month','time_interval']]
X = np.array(X)
y = df['new_state_id']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size = 0.2)
clfsvm = LinearSVC().fit(X_train, y_train)

In [53]:
print('SVM Accuracy',clfsvm.score(X_test, y_test))

SVM Accuracy 0.5756121165198878


### Decision Tree

In [57]:
# preparing features and labels for SVM. The difference is that the feature shouldn't change to a dummy variable  
X = df[['main_category_id', 'category_id', 'backers','country_id', 'currency_id', 'deadline_day','deadline_month','deadline_day' ,'time_interval', 'log_usd_goal_real' ]]
X = np.array(X)
y = df['new_state_id']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size= 0.2)


In [58]:
from sklearn import tree
clf_tree = tree.DecisionTreeClassifier()
clf_tree = clf_tree.fit(X_train, y_train)

In [59]:
clf_tree.score(X_test, y_test)

0.7905932449433852

### Neural Network

In [6]:
#changing the lable to a dummy variable. 
new_state_dummy = np_utils.to_categorical(df['new_state_id'])
new_state_dummy

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], dtype=float32)

#### spliting date to features and label, also to test and train
we don't need all variables to be included in the model. 
For instance, features such as dealine_year, log_usd_goal_real drop the accuracy of the model. Also, since we have the time interval and dealine, adding launched date is a duplicate. currency and country represent same thing. Therefore, on is enough. 


In [184]:
X = df[['main_category_id', 'category_id', 'backers','country_id','deadline_day', 'deadline_month', 'time_interval']]
X = np.array(X)
y = new_state_dummy

X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size= 0.2)

In [86]:
# It is usually common to rescale all variables to 0,1 in neural networks. Although in this case, it doesn't improve
# network accuracy. Since most of variables' scale are similar, I ignored this part. 

# min_max_scaler = preprocessing.MinMaxScaler()
# np_scaled = min_max_scaler.fit_transform(X_train.astype(float))
# X_train_normalized = pd.DataFrame(np_scaled)
# np_scaled = min_max_scaler.fit_transform(X_test.astype(float))
# X_test_normalized = pd.DataFrame(np_scaled)

In [210]:
from keras.layers import Dropout

model01 = keras.Sequential([
    keras.layers.Dense(70, input_dim = 7, activation=tf.nn.relu),
    keras.layers.Dense(40, activation = tf.nn.tanh),
    #keras.layers.Dropout(0.1),
    keras.layers.Dense(30, activation = tf.nn.tanh),
    keras.layers.Dense(3, activation = tf.nn.softmax),
    ])
model01.compile(optimizer = tf.train.AdamOptimizer(),
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])
model01.fit(X_train, y_train, validation_data=(X_test,y_test), batch_size= 300, epochs=20)

Train on 206660 samples, validate on 51665 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras._impl.keras.callbacks.History at 0x1a43fcdf98>

In [166]:
# prediction on the part of data that the network hasn't seen yet.
results = model01.predict(X_test)

In [168]:
#converting the prediction from array to a data frame
results = pd.DataFrame({'column1':results[:,0],'column2':results[:,1],'column3':results[:,2]})

In [169]:
# each label is predicted a number between 0, 1. max of 3 columns shows the prediction.
results['max'] = results.idxmax(axis=1)

In [170]:
results

Unnamed: 0,column1,column2,column3,max
0,0.027168,5.262653e-01,4.465669e-01,column2
1,0.748091,2.476361e-01,4.272917e-03,column1
2,0.040971,7.518923e-01,2.071370e-01,column2
3,0.195272,6.492521e-01,1.554759e-01,column2
4,0.997744,2.208908e-03,4.747870e-05,column1
5,0.131149,7.407973e-01,1.280532e-01,column2
6,0.024869,6.843339e-01,2.907973e-01,column2
7,0.054301,7.641478e-01,1.815509e-01,column2
8,0.209492,5.975983e-01,1.929092e-01,column2
9,0.999986,1.404071e-05,2.747967e-08,column1


In [171]:
y_test = pd.DataFrame({'column1':y_test[:,0],'column2':y_test[:,1],'column3':y_test[:,2]})

In [175]:
# exporting to excel files
results.to_excel('prediction_results.xlsx')
y_test.to_excel('test.xlsx')