In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from mean_evaluation import roman_mean
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from scipy.spatial import distance



In [2]:
# Read the train/test splits from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the label, the project ids and the backer counts as standalone Series
# before the training frame is trimmed.
final_status = train['final_status']
projest_id = train['project_id']
backers_count = train['backers_count']

# Row count of the training split; it is passed to roman_mean together with
# the stacked frame.
ltr = len(train)
train = train.drop(['final_status', 'backers_count'], axis=1)

# Stack train on top of test and renumber the rows 0..N-1.
data = pd.concat([train, test], axis=0, ignore_index=True)

In [6]:
# Build the project-local helper that the later cells call for mean features,
# feature persistence and prediction.
# NOTE(review): 'path' looks like a placeholder, while the recorded outputs
# further down show an absolute directory -- confirm the intended value.
roman_model = roman_mean(
    directory='path',
    n_folds_gen=10,
    n_folds_sub=5,
    seed=322,
    sub_seed=228,
    ltr=ltr,
    data=data,
    target=final_status,
)

In [7]:
# Parse everything after the first four characters of each project id as an
# integer and store it as a numeric column.
int_project_id = [int(pid[4:]) for pid in data.project_id.tolist()]
data['int_project_id'] = int_project_id

In [8]:
# 0/1 encoding of the flag: values comparing equal to False become 0, anything
# else becomes 1.
int_disable_communication = [
    0 if flag == False else 1
    for flag in data.disable_communication.tolist()
]
data['disable_communication_int'] = int_disable_communication

In [10]:
# Pairwise gaps between the four project timestamps. Every column is named
# '<later>-<earlier>' and must hold exactly that difference.
# The previous version computed `deadline - created_at` for the
# 'launched_at-created_at' and 'state_changed_at-created_at' columns as well,
# which produced three identical features; each column now uses the operands
# its name advertises.
data['deadline-created_at'] = data.deadline - data.created_at
data['launched_at-created_at'] = data.launched_at - data.created_at
data['state_changed_at-created_at'] = data.state_changed_at - data.created_at
data['state_changed_at-deadline'] = data.state_changed_at - data.deadline
data['deadline-launched_at'] = data.deadline - data.launched_at
data['state_changed_at-launched_at'] = data.state_changed_at - data.launched_at

In [11]:
# Character length of the string form of each text field. Values go through
# str() first, so non-string entries are measured by their textual repr.
for text_col, length_col in (('name', 'len_name'),
                             ('desc', 'len_desc'),
                             ('keywords', 'len_keywords')):
    data[length_col] = [len(str(value)) for value in data[text_col].tolist()]

# Number of '-'-separated pieces in the keywords string.
data['numb_keywords'] = [len(str(value).split('-')) for value in data['keywords'].tolist()]

In [13]:
# Count the double-quote characters in each description.
len_cov = [str(text).count('"') for text in data.desc.tolist()]
data['len_cov'] = len_cov

# Share of the description's characters that are double quotes.
data['bad_znak'] = data['len_cov'] / data['len_desc']

In [14]:
# Turn the hyphen-joined keyword slugs into space-separated text, then build a
# TF-IDF matrix limited to 3500 terms with English stop words removed.
keywords = [str(slug).replace('-', ' ') for slug in data.keywords.tolist()]
vectorizer = TfidfVectorizer(
    max_features=3500,
    stop_words='english',
)
keywords_vect = vectorizer.fit_transform(keywords)

In [17]:
# TF-IDF matrix of the project names (at most 3500 terms, English stop words
# removed). Names are stringified first so every entry is text.
names = list(map(str, data.name.tolist()))
vectorizer = TfidfVectorizer(
    max_features=3500,
    stop_words='english',
)
names_vect = vectorizer.fit_transform(names)

In [18]:
# TF-IDF matrix of the project descriptions (at most 3500 terms, English stop
# words removed). Descriptions are stringified first so every entry is text.
desc = list(map(str, data.desc.tolist()))
vectorizer = TfidfVectorizer(
    max_features=3500,
    stop_words='english',
)
desc_vect = vectorizer.fit_transform(desc)

In [19]:
# Drop the last fitted TF-IDF vectorizer; the cells visible below only use the
# already-transformed matrices (keywords_vect, names_vect, desc_vect).
del vectorizer

In [20]:
# Place the three TF-IDF matrices side by side (keywords, names, descriptions)
# and keep the result in CSR format.
sp_data = hstack(
    [keywords_vect, names_vect, desc_vect],
    format='csr',
)

In [22]:
# Calendar parts (weekday, hour, day of month) for each timestamp column.
# NOTE(review): datetime.fromtimestamp() without a tz argument converts to the
# machine's local timezone, so the hour/weekday/day values depend on where the
# notebook runs -- confirm whether that is intended.
time_feat = ['deadline', 'created_at', 'launched_at',  'state_changed_at']
for time in time_feat:
    # Convert every epoch value once and read all three parts from the result.
    moments = [datetime.datetime.fromtimestamp(stamp)
               for stamp in data.loc[:, time].tolist()]
    data[time + '_' + 'weekday'] = [moment.weekday() for moment in moments]
    data[time + '_' + 'hour'] = [moment.hour for moment in moments]
    data[time + '_' + 'day'] = [moment.day for moment in moments]

In [24]:
# Categorical combination keys per timestamp column: hour x weekday, and each
# of hour / weekday / day combined with the country, joined by '_'.
for time in time_feat:
    print(time + '_' + 'hour_weekday')
    hour_text = data[time + '_' + 'hour'].astype(str)
    weekday_text = data[time + '_' + 'weekday'].astype(str)
    day_text = data[time + '_' + 'day'].astype(str)
    country_text = data['country'].astype(str)

    data[time + '_' + 'hour_weekday'] = hour_text + '_' + weekday_text
    data[time + '_' + 'hour_country'] = hour_text + '_' + country_text
    data[time + '_' + 'weekday_country'] = weekday_text + '_' + country_text
    data[time + '_' + 'day_country'] = day_text + '_' + country_text

deadline_hour_weekday
created_at_hour_weekday
launched_at_hour_weekday
state_changed_at_hour_weekday


In [25]:
# Multiplier applied to `goal` for each currency so that all goals share one
# scale (USD and CHF are left unchanged).
# NOTE(review): the factors are fixed, hand-entered values -- presumably rough
# exchange rates to USD; confirm their source and date.
goal_multiplier = {
    'USD': 1,
    'GBP': 1.5,
    'EUR': 1.2,
    'CAD': 0.85,
    'AUD': 0.85,
    'SEK': 0.14,
    'NZD': 0.70,
    'DKK': 0.17,
    'NOK': 0.15,
    'CHF': 1,
    'MXN': 0.07,
    'SGD': 0.73,
    'HKD': 0.13,
}

currency_codes = data.currency.tolist()

# The previous chain of independent `if` statements appended nothing for a
# currency it did not list, leaving the list shorter than the frame and the
# column assignment failing with an unrelated-looking length error. Fail
# early instead and name the offending codes.
unknown_currencies = sorted({str(code) for code in currency_codes
                             if code not in goal_multiplier})
if unknown_currencies:
    raise ValueError('No goal multiplier defined for currencies: '
                     + ', '.join(unknown_currencies))

normal_goal = [goal_multiplier[code] * goal
               for code, goal in zip(currency_codes, data.goal.tolist())]
data['normal_goal'] = normal_goal

# Each timestamp gap divided by the normalised goal, stored as
# '<gap>_normal_goal'.
for gap_col in ['deadline-created_at',
                'launched_at-created_at',
                'state_changed_at-created_at',
                'state_changed_at-deadline',
                'deadline-launched_at',
                'state_changed_at-launched_at']:
    data[gap_col + '_normal_goal'] = data.loc[:, gap_col] / data.normal_goal

In [27]:
# Ask the roman_mean helper to process these categorical columns.
# NOTE(review): presumably this produces the '<column>_mean' features that
# feature_list refers to -- confirm in mean_evaluation.
roman_model.cols_mean([
    'country',
    'currency',
    'deadline_hour_weekday',
    'created_at_hour_weekday',
    'launched_at_hour_weekday',
    'state_changed_at_hour_weekday',
])

country
currency
deadline_hour_weekday
created_at_hour_weekday
launched_at_hour_weekday
state_changed_at_hour_weekday


In [28]:
# Pass the engineered frame to the roman_mean helper for persistence.
# NOTE(review): the recorded output lists one path per column under a
# `features` directory, so this presumably writes each column separately --
# confirm in mean_evaluation.
roman_model.save_in_file(data)

/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\features\project_id
/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\features\name
/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\features\desc
/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\features\goal
/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\features\keywords
/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\features\disable_communication
/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter

In [29]:
# Parameter dictionary handed to roman_model.predictSparse below; the keys
# look like XGBoost booster parameters.
dic_par = dict(
    eta=0.1,
    silent=1,
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=5,
)

In [30]:
# Timestamp gap columns (shared by the raw and goal-scaled groups below).
time_diff_cols = [
    'deadline-created_at',
    'launched_at-created_at',
    'state_changed_at-created_at',
    'deadline-launched_at',
    'state_changed_at-deadline',
    'state_changed_at-launched_at',
]

# Dense feature names passed to roman_model.predictSparse, assembled group by
# group.
# NOTE(review): 'canceled' is not created by any cell visible in this
# notebook -- confirm where that feature comes from.
feature_list = (
    ['country_mean', 'currency_mean', 'disable_communication_int', 'normal_goal']
    + time_diff_cols
    + [gap + '_normal_goal' for gap in time_diff_cols]
    + ['len_name', 'len_desc', 'len_keywords']
    + [stamp + '_' + part
       for stamp in ('created_at', 'deadline', 'launched_at', 'state_changed_at')
       for part in ('hour', 'weekday', 'day')]
    + ['canceled']
    + [stamp + '_hour_weekday_mean'
       for stamp in ('deadline', 'created_at', 'launched_at', 'state_changed_at')]
)

In [32]:
# Run the helper's sparse prediction routine with the parameter dict, the
# stacked TF-IDF matrix and the dense feature names.
# NOTE(review): the meaning of the positional 5000 / True / False arguments is
# defined in mean_evaluation and is not visible here -- confirm before changing.
# NOTE(review): the recorded run fails with FileNotFoundError on
# '...pomah\features\country_mean\country_mean.csv'; that path mixes '/' and
# '\' separators, so the helper's path joining is worth checking.
roman_model.predictSparse(dic_par, sp_data, feature_list, 5000, True, False)
# Alternative dense-only call kept from the original notebook; `stack_feat` is
# not defined in any visible cell.
# roman_model.predict(dic_par, stack_feat, 5000, True, False)

Calculate 1/10
1/36


FileNotFoundError: File b'/home/manish/Desktop/ML_Challenge_Creatives/Challenge #2 Data/kickstarter/comp_data/final_data/download_data/3149def2-5-datafiles/pomah\\features\\country_mean\\country_mean.csv' does not exist