## Imports

In [1]:
import numpy as np 
import pandas as pd 
import os 
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import json
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.impute import SimpleImputer
# import eli5 # permutation importance
# from eli5.sklearn import PermutationImportance
# from xgboost import XGBClassifier
import nltk # Natural language
nltk.download('stopwords')
nltk.download('wordnet')
from bs4 import BeautifulSoup #to strip url
import string # for a list of punctuation
from nltk.corpus import stopwords 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
%matplotlib inline
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hakuj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hakuj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Functions

In [None]:
def get_some(year):
    """Load a small sample of one year's CSV files into a single DataFrame.

    Reads at most the first two files of the first monthly folder found under
    ``Data/<year>`` (relative to the current working directory).

    Parameters
    ----------
    year : int or str
        Name of the year folder inside ``Data``.

    Returns
    -------
    pandas.DataFrame
        Rows of the files that were read, with a fresh 0..n-1 index. The
        expected columns listed below always come first (NaN-filled when a
        file lacks them), followed by any extra columns found in the files.
        An empty frame with the expected columns is returned when there is
        nothing to read.
    """
    expected_columns = [
        'backers_count', 'blurb', 'category', 'converted_pledged_amount',
        'country', 'created_at', 'creator', 'currency', 'currency_symbol',
        'currency_trailing_code', 'current_currency', 'deadline',
        'disable_communication', 'fx_rate', 'goal', 'id', 'is_starrable',
        'launched_at', 'name', 'photo', 'pledged', 'profile', 'slug',
        'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
        'static_usd_rate', 'urls', 'usd_pledged', 'usd_type', 'location',
        'friends', 'is_backing', 'is_starred', 'permissions',
    ]
    # os.path.join instead of hard-coded backslashes so the loader is not Windows-only.
    year_dir = os.path.join('Data', str(year))

    frames = []
    # os.listdir returns entries in arbitrary order; sort so "the first" folder/files
    # picked by the slices below are the same on every run.
    for folder in sorted(os.listdir(year_dir))[:1]:  # Monthly folders inside the year
        month_dir = os.path.join(year_dir, folder)
        for file in sorted(os.listdir(month_dir))[:2]:  # Not getting a whole year for now
            frames.append(pd.read_csv(os.path.join(month_dir, file)))

    if not frames:
        return pd.DataFrame(columns=expected_columns)

    # A single concat replaces DataFrame.append (removed in pandas 2.0) and the
    # per-iteration index reset.
    combined = pd.concat(frames, ignore_index=True, sort=False)
    extra_columns = [c for c in combined.columns if c not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

In [None]:
def get_a_year(year):
    """Load every CSV file of one year into a single DataFrame.

    Walks all monthly folders under ``Data/<year>`` (relative to the current
    working directory) and reads every file inside them.

    Parameters
    ----------
    year : int or str
        Name of the year folder inside ``Data``.

    Returns
    -------
    pandas.DataFrame
        Rows of all files, with a fresh 0..n-1 index. The expected columns
        listed below always come first (NaN-filled when a file lacks them),
        followed by any extra columns found in the files. An empty frame with
        the expected columns is returned when there is nothing to read.
    """
    expected_columns = [
        'backers_count', 'blurb', 'category', 'converted_pledged_amount',
        'country', 'created_at', 'creator', 'currency', 'currency_symbol',
        'currency_trailing_code', 'current_currency', 'deadline',
        'disable_communication', 'fx_rate', 'goal', 'id', 'is_starrable',
        'launched_at', 'name', 'photo', 'pledged', 'profile', 'slug',
        'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
        'static_usd_rate', 'urls', 'usd_pledged', 'usd_type', 'location',
        'friends', 'is_backing', 'is_starred', 'permissions',
    ]
    # os.path.join instead of hard-coded backslashes so the loader is not Windows-only.
    year_dir = os.path.join('Data', str(year))

    frames = []
    # Sorted so rows come out in a reproducible folder/file order
    # (os.listdir makes no ordering guarantee).
    for folder in sorted(os.listdir(year_dir)):  # Monthly folders inside the year
        month_dir = os.path.join(year_dir, folder)
        for file in sorted(os.listdir(month_dir)):  # Every csv file in the month
            frames.append(pd.read_csv(os.path.join(month_dir, file)))

    if not frames:
        return pd.DataFrame(columns=expected_columns)

    # A single concat replaces DataFrame.append (removed in pandas 2.0); appending
    # inside the loop also re-copied the growing frame once per month.
    combined = pd.concat(frames, ignore_index=True, sort=False)
    extra_columns = [c for c in combined.columns if c not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

In [None]:
def get_a_few(year):
    """Load a single CSV file of one year as a quick sample.

    Reads the first file of the first monthly folder found under
    ``Data/<year>`` (relative to the current working directory).

    Parameters
    ----------
    year : int or str
        Name of the year folder inside ``Data``.

    Returns
    -------
    pandas.DataFrame
        Rows of the file that was read, with a fresh 0..n-1 index. The
        expected columns listed below always come first (NaN-filled when the
        file lacks them), followed by any extra columns found in the file.
        An empty frame with the expected columns is returned when there is
        nothing to read.
    """
    expected_columns = [
        'backers_count', 'blurb', 'category', 'converted_pledged_amount',
        'country', 'created_at', 'creator', 'currency', 'currency_symbol',
        'currency_trailing_code', 'current_currency', 'deadline',
        'disable_communication', 'fx_rate', 'goal', 'id', 'is_starrable',
        'launched_at', 'name', 'photo', 'pledged', 'profile', 'slug',
        'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
        'static_usd_rate', 'urls', 'usd_pledged', 'usd_type', 'location',
        'friends', 'is_backing', 'is_starred', 'permissions',
    ]
    # os.path.join instead of hard-coded backslashes so the loader is not Windows-only.
    year_dir = os.path.join('Data', str(year))

    frames = []
    # os.listdir returns entries in arbitrary order; sort so "the first" folder/file
    # picked by the slices below is the same on every run.
    for folder in sorted(os.listdir(year_dir))[:1]:  # Grab a folder from that year
        month_dir = os.path.join(year_dir, folder)
        for file in sorted(os.listdir(month_dir))[:1]:  # Grab a file
            frames.append(pd.read_csv(os.path.join(month_dir, file)))

    if not frames:
        return pd.DataFrame(columns=expected_columns)

    # A single concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat(frames, ignore_index=True, sort=False)
    extra_columns = [c for c in combined.columns if c not in expected_columns]
    return combined.reindex(columns=expected_columns + extra_columns)

In [None]:
def datetime_convert(df):
    """Convert the epoch-second timestamp columns to datetimes and derive date features.

    ``created_at``, ``deadline`` and ``launched_at`` are converted from seconds
    since the Unix epoch to pandas datetimes. ``state_changed_at`` is left
    untouched on purpose (leakage for the current project goals).

    Added columns:

    * ``month_started`` / ``day_started`` / ``year_started`` from ``created_at``
    * ``month_launched`` / ``day_launched`` / ``year_launched`` from ``launched_at``
    * ``deadline_month`` / ``deadline_day`` / ``deadline_year`` from ``deadline``
    * ``days_to_launch``: whole days between creation and launch
    * ``campaign_length``: whole days between launch and deadline

    The ``day_*`` columns hold the weekday number (Monday == 0).

    The frame is modified in place and also returned.
    """
    # Time is in seconds (epoch)
    for column in ('created_at', 'deadline', 'launched_at'):
        df[column] = pd.to_datetime(df[column], unit='s')

    # Break time up into month / weekday / year columns.
    # 'deadline_day' was previously written as 'deadline day' (with a space),
    # inconsistent with every other derived column name.
    calendar_features = (
        ('created_at', 'month_started', 'day_started', 'year_started'),
        ('launched_at', 'month_launched', 'day_launched', 'year_launched'),
        ('deadline', 'deadline_month', 'deadline_day', 'deadline_year'),
    )
    for source, month_col, weekday_col, year_col in calendar_features:
        stamps = df[source].dt
        df[month_col] = stamps.month
        df[weekday_col] = stamps.weekday
        df[year_col] = stamps.year

    # Feature engineering
    df['days_to_launch'] = (df['launched_at'] - df['created_at']).dt.days
    df['campaign_length'] = (df['deadline'] - df['launched_at']).dt.days  # campaign length in days

    return df

def time_to_string(df):
    """Turn the four timestamp columns into strings so they can be passed to a model.

    Handles ``created_at``, ``deadline``, ``launched_at`` and
    ``state_changed_at``. A column that is still numeric is interpreted as
    seconds since the Unix epoch (``datetime_convert`` leaves
    ``state_changed_at`` in that form); any other column is passed through
    ``pd.to_datetime`` as is.

    The frame is modified in place and also returned.
    """
    for column in ('created_at', 'deadline', 'launched_at', 'state_changed_at'):
        values = df[column]
        if pd.api.types.is_numeric_dtype(values):
            # Raw epoch seconds: the former format='%m%d%Y' parse could not read these.
            stamps = pd.to_datetime(values, unit='s')
        else:
            # Already datetimes (or date strings); the old format argument was
            # ignored for datetime input, so it is dropped here.
            stamps = pd.to_datetime(values)
        df[column] = stamps.astype(str)

    return df

In [None]:
def drop_dupes(df):
    """Drop rows whose ``id`` repeats an earlier row and renumber the index.

    The first occurrence of each ``id`` is kept. Returns a new DataFrame with
    a fresh 0..n-1 index; the input frame is not modified.
    """
    # reset_index(drop=True) discards the old index outright. The former
    # reset_index().drop(columns='index') raised KeyError when the index had a
    # name and removed a real column called 'index' if the frame had one.
    return df.drop_duplicates(subset='id').reset_index(drop=True)

In [None]:
def completed_campaigns(df):
    """Return only the rows whose ``state`` is 'failed' or 'successful'.

    The original index labels of the kept rows are preserved.
    """
    finished_states = ['failed', 'successful']
    is_finished = df['state'].isin(finished_states)
    return df[is_finished]

In [None]:
# # X and y
# X = df.drop(columns=['state','pledged', 'usd_pledged', 'state_changed_at', 'spotlight',
#                      'converted_pledged_amount', 'source_url', 'backers_count', 'state',
#                      'is_backing',	'is_starrable', 'is_starred'])
# y = df['state']

# # X_train, X_val,y_train, y_val = train_test_split(X, y, random_state=42)

## Fetch and look

#### Fetch

In [None]:
cd c:\Users\Hakuj\Documents\DataSets\Kickstarter

In [None]:
df = get_a_few(2018)

In [None]:
# df = get_a_year(2019)

#### Look

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.info()

## Datetime research

In [None]:
#Time is in seconds from epoch time

# https://www.epochconverter.com/

In [None]:
# df.head()


#### Date manipulation

In [None]:
# df = datetime_convert(df)

In [None]:
# df['launched_at'][0]

In [None]:
# df['created_at'][0]

In [None]:
# (df['launched_at'] - df['created_at']).dt.days

In [None]:
# (df['deadline'] - df['launched_at']).dt.days

## Duplicate research

In [None]:
# df['id'].nunique(), df.shape

In [None]:
# df[df.duplicated('id')].sort_values('id')

## Break up dictionaries
- Some columns hold JSON-encoded dictionaries; let's unpack them into separate columns.

In [None]:
df.head(1)

In [None]:
cols_with_dics = ['category', 'creator', 'location', 'photo', 'profile', 'urls']

##### Category
- Old method using map(eval)

In [2]:
def break_category(df):
    """Expand the dict-like ``category`` column into ``category_<key>`` columns.

    Each value of ``category`` may be a JSON string, a Python dict literal
    string, or an already-parsed dict. Missing (non-string, non-dict) values
    produce NaN in every new column.

    Returns a new DataFrame without ``category`` and with one
    ``category_<key>`` column per dictionary key. The input frame is not
    modified.
    """
    import ast  # local import: only needed for the Python-literal fallback

    col = 'category'  # previously referenced without ever being defined (NameError)

    def _to_dict(value):
        """Parse one cell into a dict."""
        if isinstance(value, dict):
            return value
        if isinstance(value, str):
            try:
                return json.loads(value)
            except ValueError:
                # Not valid JSON (e.g. a single-quoted Python repr). literal_eval
                # only accepts literals, unlike eval, which would execute
                # arbitrary code found in the data file.
                return ast.literal_eval(value)
        return {}

    records = [_to_dict(value) for value in df[col]]
    df_of_column = pd.DataFrame(records, index=df.index)  # Breaks dicts up into columns
    df_of_column.columns = [f'{col}_{col_name}' for col_name in df_of_column.columns]  # Rename cols
    # Return the combined frame: the result of df.join(...) used to be thrown away.
    # Both pieces share the same index object, so concat aligns them row for row.
    return pd.concat([df.drop(columns=col), df_of_column], axis=1)

In [None]:
# df['category'] = df['category'].map(eval) # converts row values to dict

In [None]:
# df['category'][1]

In [None]:
# test = df['category'].apply(pd.Series) #Breaks dict up into columns into

In [None]:
# test.columns #Col names for copypaste

In [None]:
# test = test.rename(columns={'id': 'cat_id', 'name': 'cat_name',
#                     'slug': 'cat_slug', 'position': 'cat_position', 'parent_id': 'cat_parent_id',
#                     'color': 'cat_color'}) #Rename cols

In [None]:
# test = test.drop(columns='urls') #Drop the urls

In [None]:
# test #Ready to concat

##### Function design

In [None]:
test = df.copy()

In [None]:
# test['category'].apply(json.loads)

In [None]:
def col_dict(df, col):
    """Unpack JSON 'dictionary' columns into one new column per key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column(s) to unpack.
    col : str or list of str
        A single column name, or a list of column names. Each cell may be a
        JSON string or an already-parsed dict; any other value (e.g. NaN) is
        treated as an empty dictionary.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every unpacked column is replaced by
        ``<column>_<key>`` columns. The input frame is not modified, so the
        call can be repeated on the same input.
    """
    cols = [col] if isinstance(col, str) else list(col)

    def _to_dict(value):
        """Parse one cell into a dict."""
        if isinstance(value, str):
            return json.loads(value)
        if isinstance(value, dict):
            return value
        return {}

    result = df
    for name in cols:
        records = [_to_dict(value) for value in result[name]]
        expanded = pd.DataFrame(records, index=result.index)
        expanded.columns = [f'{name}_{key}' for key in expanded.columns]
        # Keep the combined frame: the result of df.join(...) used to be discarded,
        # so the new columns never reached the caller. Both pieces share the same
        # index object, so concat aligns them row for row.
        result = pd.concat([result.drop(columns=name), expanded], axis=1)
    return result

In [None]:
# (test['location'][0])

In [None]:
# {"id":2543897,"project_id":2543897,"state":"inactive","state_changed_at":1464927602,
#  "name":null,"blurb":null,"background_color":null,"text_color":null,
#  "link_background_color":null,"link_text_color":null,"link_text":null,
#  "link_url":null,"show_feature_image":false,
#  "background_image_opacity":0.8,
#  "feature_image_attributes":{"image_urls":
#                              {"default":"https://ksr-ugc.imgix.net/assets/012/621/061/7baad730e3071f23e83b8f419fc27768_original.JPG?ixlib=rb-1.1.0&crop=faces&w=1552&h=873&fit=crop&v=1464927868&auto=format&frame=1&q=92&s=269af376aaa7d2d88d577ad2477fa309",
#                               "baseball_card":"https://ksr-ugc.imgix.net/assets/012/621/061/7baad730e3071f23e83b8f419fc27768_original.JPG?ixlib=rb-1.1.0&crop=faces&w=560&h=315&fit=crop&v=1464927868&auto=format&frame=1&q=92&s=0c83ea1c807978fa63977e6fdadd7445"}}}

In [None]:
# pd.io.json.json_normalize(df['creator'])

In [None]:
# dict(jsond.decode(df['creator'][0]))

## Natural Language
- Let's use some natural language processing on the blurb and slug.

### Blurb

#### Tokenize 
- Lowercase each blurb, strip the punctuation, and split it into a list of word tokens

In [None]:
# Fill missing blurbs with '' before casting: astype(str) alone turns NaN into
# the literal text 'nan', which would then be tokenized as a real word.
df['blurb'] = df['blurb'].fillna('').astype(str)

In [None]:
df['blurb'][0]

In [None]:
tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Lowercase every blurb, then let the tokenizer split it into a list of tokens.
df['blurb'] = df['blurb'].str.lower().map(tokenizer.tokenize)

In [None]:
df['blurb'][0]

#### Remove stop words

In [None]:
def remove_stopwords(text, stop_words=None):
    """Remove stop words (e.g. 'i', 'me', 'you', 'he') from a list of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stop_words : container of str, optional
        Words to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    list of str
        The tokens of ``text`` that are not stop words, in their original order.
    """
    if stop_words is None:
        # Build the default set once and keep it on the function object: the
        # previous version re-read the NLTK list for every single word and
        # searched it as a list.
        cached = getattr(remove_stopwords, '_english_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            remove_stopwords._english_stop_words = cached
        stop_words = cached
    return [word for word in text if word not in stop_words]

In [None]:
# Filter the stop words out of every tokenized blurb.
df['blurb'] = df['blurb'].apply(remove_stopwords)

#### Lemmatize 
- Reduces words down to root words

In [None]:
lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    """Return a list with the lemmatized form of every token in ``text``."""
    return list(map(lemmatizer.lemmatize, text))

In [None]:
# Lemmatize the tokens of every blurb.
df['blurb'] = df['blurb'].apply(word_lemmatizer)

## Wrangling

In [None]:
df = datetime_convert(df)

In [None]:
df = drop_dupes(df)

In [None]:
df.shape

In [None]:
df.head()