In [1]:
!pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [2]:
!pip install transformers requests beautifulsoup4 pandas numpy



In [3]:
!pip install category_encoders



In [4]:
!pip install tensorflow



In [5]:
import pandas as pd
import os, sys
import datetime
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
          ### CONCAT FILES IN CSVs FOLDER ###

# os.listdir() returns entries in arbitrary order; sort so the combined row
# order is the same on every run and every machine.
files = sorted(file_path for file_path in os.listdir('CSVs') if file_path.endswith('.csv'))

if not files:
    raise FileNotFoundError("No .csv files found in the 'CSVs' folder")

# Read every part first and concatenate once: calling pd.concat inside a loop
# re-copies the accumulated frame on each iteration.
# ignore_index=True yields the same 0..n-1 index as reset_index(drop=True).
df = pd.concat(
    [pd.read_csv(os.path.join('CSVs', csv_path)) for csv_path in files],
    ignore_index=True,
)

df.to_csv('Kick(2021-2022).csv')

In [8]:
COLs = ['country','location','category', 'blurb', 'slug', 'backers_count', 
       'created_at','goal','launched_at','deadline','converted_pledged_amount',
        'state', 'currency', 'staff_pick', 'static_usd_rate','usd_type']


def wrangle(CSV_PATH):
    """Load the combined Kickstarter export and build the modelling table.

    Parameters
    ----------
    CSV_PATH : str or path-like
        CSV file containing at least the columns listed in ``COLs``.

    Returns
    -------
    pandas.DataFrame
        One row per campaign whose ``state`` is 'successful' or 'failed', with
        the columns: country, city_name, category, description, project_name,
        goal, state, currency, launch_month_number_of_the_year,
        campaign_duration_in_days. The index is the row position in the
        source file (it is not reset).

    Notes
    -----
    Rows with a missing value in ANY column of ``COLs`` are dropped, including
    columns that do not appear in the returned frame.
    """
    raw = pd.read_csv(CSV_PATH, usecols=COLs).dropna()

    # Keep finished campaigns only ('live', 'canceled', ... are discarded).
    # .copy() makes the result an independent frame so the column assignments
    # below do not write into a slice of `raw` (SettingWithCopyWarning).
    df = raw[raw['state'].isin(['successful', 'failed'])].copy()

    # 'category' and 'location' hold JSON objects serialised as strings;
    # only the "name" field of each is kept.
    for json_col in ('category', 'location'):
        df[json_col] = df[json_col].apply(lambda text: json.loads(text)['name'])

    # Epoch seconds -> timestamps. pd.to_datetime(unit='s') interprets the
    # values as UTC, so the derived month/duration no longer depend on the
    # timezone of the machine running the notebook (datetime.fromtimestamp
    # converts to local time).
    launched = pd.to_datetime(df['launched_at'], unit='s')
    deadline = pd.to_datetime(df['deadline'], unit='s')

    # New features
    df['launch_month_number_of_the_year'] = launched.dt.month
    df['campaign_duration_in_days'] = (deadline - launched).dt.days

    # Slugs are dash-separated; turn them into space-separated project names.
    df['slug'] = df['slug'].str.replace('-', ' ', regex=False)

    # Rename by label rather than by position so a change in column order
    # cannot silently mislabel data.
    df = df.rename(columns={
        'location': 'city_name',
        'blurb': 'description',
        'slug': 'project_name',
    })

    # Select the output columns explicitly; everything else (timestamps,
    # backers, pledged amount, staff pick, usd rate/type) is left out.
    return df[['country', 'city_name', 'category', 'description', 'project_name',
               'goal', 'state', 'currency',
               'launch_month_number_of_the_year', 'campaign_duration_in_days']]

# Clean the combined export, then persist the cleaned table (index included).
df = wrangle(CSV_PATH='Kick(2021-2022).csv')
df.to_csv(path_or_buf='Kick(2021-2022)_wrangled.csv')

In [14]:
# The wrangling cell writes 'Kick(2021-2022)_wrangled.csv'; nothing in this
# notebook creates the .zip. Prefer the CSV so a fresh Restart & Run All works,
# and fall back to the zip (presumably a compressed copy of the same table —
# TODO confirm) when only that file is present.
wrangled_path = 'Kick(2021-2022)_wrangled.csv'
if not os.path.exists(wrangled_path):
    wrangled_path = 'Kick(2021-2022)_wrangled.zip'

# read_csv infers compression from the file extension. 'Unnamed: 0' is the
# index column that to_csv wrote alongside the data.
df = pd.read_csv(wrangled_path).drop(columns=['Unnamed: 0'])

# Replace the two text columns with their string length
df['description'] = df['description'].str.len()
df['project_name'] = df['project_name'].str.len()

# Binary target: 1 for 'successful', 0 for anything else
df['state'] = df['state'].eq('successful').astype('int64')

In [15]:
# Put the target ('state') first, followed by the nine feature columns.
df = df.loc[:, [
    'state',
    'country',
    'city_name',
    'category',
    'currency',
    'launch_month_number_of_the_year',
    'goal',
    'campaign_duration_in_days',
    'description',
    'project_name',
]]

In [16]:
# Snapshot of the frame with the original (string) category labels.
not_encoded_df = df.copy(deep=True)

In [17]:
# Preview the first rows of the copy taken before label encoding.
not_encoded_df.head()

Unnamed: 0,state,country,city_name,category,currency,launch_month_number_of_the_year,goal,campaign_duration_in_days,description,project_name
0,0,US,Burlington,Glass,USD,8,2000.0,30,96,50
1,1,US,Vineyard,Events,USD,7,3000.0,16,133,26
2,1,BE,Antwerp,Comedy,EUR,10,300.0,60,134,26
3,1,US,Oxnard,Children's Books,USD,12,4500.0,21,133,43
4,1,US,New York,Musical,USD,7,10000.0,31,92,41


In [18]:
# Fit one LabelEncoder per categorical column and keep each of them.
# Re-fitting a single shared encoder overwrites its classes_ every time, so
# only the last column's mapping would remain available for inverse_transform
# or for encoding new input.
encoders = {}
for col in ['country', 'city_name', 'category', 'currency']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Kept for backward compatibility: `encoder` used to end up fitted on 'currency'.
encoder = encoders['currency']


In [19]:
# Preview the first rows of df, whose categorical columns now hold integer codes.
df.head()

Unnamed: 0,state,country,city_name,category,currency,launch_month_number_of_the_year,goal,campaign_duration_in_days,description,project_name
0,0,24,1529,64,14,8,2000.0,30,96,50
1,1,24,10038,44,14,7,3000.0,16,133,26
2,1,2,548,24,4,10,300.0,60,134,26
3,1,24,7174,19,14,12,4500.0,21,133,43
4,1,24,6641,92,14,7,10000.0,31,92,41


In [24]:
# Lookup tables from original label -> integer code, built by pairing each row
# of the un-encoded copy with the same row of the encoded frame.
# dict(zip(...)) produces the same mapping as the explicit row loops.
dict_country = dict(zip(not_encoded_df['country'], df['country']))
dict_city = dict(zip(not_encoded_df['city_name'], df['city_name']))
dict_category = dict(zip(not_encoded_df['category'], df['category']))
dict_currency = dict(zip(not_encoded_df['currency'], df['currency']))

In [37]:
# One Series entry per categorical column, each holding that column's
# label -> code lookup dict.
Encode_user_input = pd.Series({
    'country': dict_country,
    'city_name': dict_city,
    'category': dict_category,
    'currency': dict_currency,
})
Encode_user_input

country      {'US': 24, 'BE': 2, 'CA': 3, 'ES': 7, 'MX': 16...
city_name    {'Burlington': 1529, 'Vineyard': 10038, 'Antwe...
category     {'Glass': 64, 'Events': 44, 'Comedy': 24, 'Chi...
currency     {'USD': 14, 'EUR': 4, 'CAD': 1, 'MXN': 8, 'HKD...
dtype: object

In [833]:
# DataFrame.head() takes a row count. The previous argument was a dict literal
# keyed by a Series, which raises TypeError because a Series is unhashable.
df.head()

Unnamed: 0,state,country,city_name,category,currency,launch_month_number_of_the_year,goal,campaign_duration_in_days,description,project_name
0,0,24,1529,64,14,8,2000.0,30,96,50
1,1,24,10038,44,14,7,3000.0,16,133,26
2,1,2,548,24,4,10,300.0,60,134,26
3,1,24,7174,19,14,12,4500.0,21,133,43
4,1,24,6641,92,14,7,10000.0,31,92,41


In [773]:
# 75/25 train/test split; the fixed random_state keeps the split repeatable.
df_train, df_test = train_test_split(
    df,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

In [774]:
# Column 0 is the target ('state'); the remaining columns are the features.
# Slicing [:, :1] keeps the target two-dimensional, shape (n_rows, 1).
X_train, y_train = df_train.to_numpy()[:, 1:], df_train.to_numpy()[:, :1]
X_test, y_test = df_test.to_numpy()[:, 1:], df_test.to_numpy()[:, :1]

In [775]:
# Show the first five rows of the training feature matrix.
X_train[:5]

array([[2.400e+01, 1.221e+03, 9.600e+01, 1.400e+01, 9.000e+00, 3.500e+03,
        3.000e+01, 8.900e+01, 3.200e+01],
       [2.400e+01, 3.650e+02, 1.160e+02, 1.400e+01, 5.000e+00, 1.500e+03,
        3.000e+01, 1.300e+02, 5.000e+01],
       [7.000e+00, 8.230e+02, 1.530e+02, 4.000e+00, 8.000e+00, 1.500e+03,
        3.000e+01, 9.200e+01, 2.700e+01],
       [9.000e+00, 5.494e+03, 5.400e+01, 5.000e+00, 5.000e+00, 7.000e+02,
        3.000e+01, 1.220e+02, 4.400e+01],
       [2.400e+01, 7.384e+03, 6.000e+00, 1.400e+01, 6.000e+00, 1.000e+04,
        4.500e+01, 1.340e+02, 5.000e+01]])

In [776]:
# (rows, feature columns) of the training feature matrix.
X_train.shape

(156161, 9)

In [834]:
def make_model(input_dim=9):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Dense(20, relu) -> Dense(100, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric.

    Parameters
    ----------
    input_dim : int, default 9
        Number of feature columns per sample. The default matches the value
        that was previously hard-coded, so ``make_model()`` builds the same
        network as before.

    Returns
    -------
    A compiled ``Sequential`` model.
    """
    model = Sequential([
        # An explicit Input layer replaces the legacy `input_dim=` keyword on
        # the first Dense layer; layer sizes and parameter counts are unchanged.
        tf.keras.Input(shape=(input_dim,)),
        Dense(20, activation='relu'),
        Dense(100, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [835]:
# from keras.wrappers.scikit_learn import KerasClassifier
# estimator = KerasClassifier(build_fn=make_model, epochs=5, verbose=1)

In [836]:
# estimator.fit(x=X_train,
#           y=y_train,
#           epochs=5,
#           validation_data=(X_test, y_test)
#           )

In [837]:
# Seed TensorFlow's global random generator before the layers are created so
# the initial weights are the same on every run.
tf.random.set_seed(42)
model = make_model()

In [838]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_67 (Dense)            (None, 20)                200       
                                                                 
 dense_68 (Dense)            (None, 100)               2100      
                                                                 
 dense_69 (Dense)            (None, 1)                 101       
                                                                 
Total params: 2,401
Trainable params: 2,401
Non-trainable params: 0
_________________________________________________________________


In [839]:
# Keep the returned History object (per-epoch loss/metrics live in
# history.history) instead of leaving its repr, which contains a memory
# address, as the cell output.
# Note: the held-out test split is what is passed as validation data here.
history = model.fit(x=X_train,
                    y=y_train,
                    epochs=5,
                    validation_data=(X_test, y_test)
                    )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x29767208be0>