In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from ydata_profiling import ProfileReport

In [2]:
data = pd.read_csv('./data/kickstarter.csv')
data.shape

(1163, 20)

## Prepare Data
* Perform EDA
* Handle Missing Data (Remove or Impute)
* Identify Target Data
* Scale the Data
* Encode any Text Data (Categorical Data)

In [3]:
data.head()

Unnamed: 0,Funded,URL,Title,Year,Month,Type,Has FB,Backed Projects,Previous Projects,Creator Desc Len,Title Len,Goal,Duration,Pledge Levels,Min Pledge Tiers,Max Pledge Tiers,Proj Desc Len,Images,Videos,Has Video
0,no,https://www.kickstarter.com/projects/mischaa/p...,Pixelstart: Choose Your Own Pixels (Canceled),2016,Apr,Art,1,11,2,125,57,2829.59,53,7,1.14,171.0,2001,2,1,1
1,no,https://www.kickstarter.com/projects/105587445...,Smart shop Icons (Canceled),2016,Apr,Art,1,0,0,111,27,28295.87,51,3,1.14,46.0,2508,0,0,0
2,no,https://www.kickstarter.com/projects/minimalpr...,Minimal Haus Prints: Digital Prints for DIY Wa...,2016,Apr,Art,0,4,0,294,52,766.25,30,8,1.51,755.0,2325,1,1,1
3,no,https://www.kickstarter.com/projects/797661619...,NeoN: Altering the Alternative (Canceled),2016,Mar,Art,0,0,0,179,41,1439.1,24,5,7.0,141.0,3736,13,1,1
4,no,https://www.kickstarter.com/projects/198369359...,Nintendo NES 8bit retro canvas (Canceled),2016,Mar,Art,0,0,0,51,41,1000.0,30,2,5.0,20.0,636,0,0,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1163 entries, 0 to 1162
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Funded             1163 non-null   object 
 1   URL                1163 non-null   object 
 2   Title              1163 non-null   object 
 3   Year               1163 non-null   int64  
 4   Month              1163 non-null   object 
 5   Type               1163 non-null   object 
 6   Has FB             1163 non-null   int64  
 7   Backed Projects    1163 non-null   int64  
 8   Previous Projects  1163 non-null   int64  
 9   Creator Desc Len   1163 non-null   int64  
 10  Title Len          1163 non-null   int64  
 11  Goal               1163 non-null   float64
 12  Duration           1163 non-null   int64  
 13  Pledge Levels      1163 non-null   int64  
 14  Min Pledge Tiers   1163 non-null   float64
 15  Max Pledge Tiers   1163 non-null   float64
 16  Proj Desc Len      1163 

In [5]:
data.isnull().sum()

Funded               0
URL                  0
Title                0
Year                 0
Month                0
Type                 0
Has FB               0
Backed Projects      0
Previous Projects    0
Creator Desc Len     0
Title Len            0
Goal                 0
Duration             0
Pledge Levels        0
Min Pledge Tiers     0
Max Pledge Tiers     0
Proj Desc Len        0
Images               0
Videos               0
Has Video            0
dtype: int64

In [6]:
data['Funded'].value_counts()

Funded
no     685
yes    478
Name: count, dtype: int64

In [7]:
data.columns

Index(['Funded', 'URL', 'Title', 'Year', 'Month', 'Type', 'Has FB',
       'Backed Projects', 'Previous Projects', 'Creator Desc Len', 'Title Len',
       'Goal', 'Duration', 'Pledge Levels', 'Min Pledge Tiers',
       'Max Pledge Tiers', 'Proj Desc Len', 'Images', 'Videos', 'Has Video'],
      dtype='object')

In [8]:
# PERFORM ANY RENAMES...       we rename by copying columns and saving them back
#df.columns = [' ',....]

In [9]:
#ProfileReport(data, title ='Kickstarter')

### Create a classifier:
* Using the kickstarter.csv file for train and test
* Using the unseen_data.csv below for a final test
* 'Funded' is the feature I will predict 
### Classifier Steps
1) Gather sample data for "unseen data"
    * a. create a train / test csv file
    * b. create a unseen data csv file
2) Pycaret EDA notebook 
    * a. identify the best (sklearn) model 
       * i. using the train / test csv from step 1
    * b. EDA
3) Lean Pycaret notebook:
    * create the model
    * export the pipline (i.e. finalize model)
4) Import using joblib
    * run against unseen data
    * test results

#### **1.**  First I set aside 20% of data sample to be used in the end against the trined and tested data and i save them each in .csv file

In [10]:
unseen_data = data.sample(frac=.20)   # set asides data.sample(frac=.30, random_state=42) , i decided for 20% 
df = data.drop(unseen_data.index)

In [11]:
# does my row count match
data.shape[0] == df.shape[0] + unseen_data.shape[0]

True

In [12]:
#unseen_data.to_csv('./data/unseen_data.csv')
#reset the index
unseen_data.to_csv('./data/unseen_data.csv', index=False)

In [13]:
#df.to_csv('./data/adult_census.csv')
#reset index
df.to_csv('./data/Kickstarter0.8.csv', index=False)

In [14]:
df.describe().T   

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,930.0,2016.0,0.0,2016.0,2016.0,2016.0,2016.0,2016.0
Has FB,930.0,0.516129,0.500009,0.0,0.0,1.0,1.0,1.0
Backed Projects,930.0,4.208602,16.546751,0.0,0.0,0.0,2.0,348.0
Previous Projects,930.0,0.866667,2.574084,0.0,0.0,0.0,0.0,29.0
Creator Desc Len,930.0,358.488172,168.425254,0.0,215.0,398.0,504.0,747.0
Title Len,930.0,36.419355,17.890724,1.0,20.0,37.0,53.0,82.0
Goal,930.0,40767.897,338342.594066,15.32,1936.715,7000.0,22636.7,10000000.0
Duration,930.0,33.593548,11.46147,4.0,30.0,30.0,35.0,62.0
Pledge Levels,930.0,7.168817,5.985855,1.0,3.0,6.0,10.0,89.0
Min Pledge Tiers,930.0,40.875527,415.250747,0.7,1.13,5.0,14.0,10000.0


In [15]:
df.columns

Index(['Funded', 'URL', 'Title', 'Year', 'Month', 'Type', 'Has FB',
       'Backed Projects', 'Previous Projects', 'Creator Desc Len', 'Title Len',
       'Goal', 'Duration', 'Pledge Levels', 'Min Pledge Tiers',
       'Max Pledge Tiers', 'Proj Desc Len', 'Images', 'Videos', 'Has Video'],
      dtype='object')

#### **2.** Second, we import Classification Experiment and/or classification models and set up the Target + select best model

In [16]:
#pip install --upgrade joblib
# !pip install joblib==1.3.0

In [17]:
from pycaret.classification import ClassificationExperiment
s = ClassificationExperiment()

In [18]:
s.setup(data, target = 'Funded', session_id = 42)       #quickstart has 123, we do 42

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Funded
2,Target type,Binary
3,Target mapping,"no: 0, yes: 1"
4,Original data shape,"(1163, 20)"
5,Transformed data shape,"(1163, 29)"
6,Transformed train set shape,"(814, 29)"
7,Transformed test set shape,"(349, 29)"
8,Numeric features,15
9,Categorical features,4


<pycaret.classification.oop.ClassificationExperiment at 0x19c7b128fd0>

In [19]:
%%time
best = s.compare_models()  

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7801,0.0,0.7801,0.7794,0.7776,0.5381,0.5413,0.655
ridge,Ridge Classifier,0.7298,0.0,0.7298,0.7879,0.6953,0.3882,0.4654,0.021
svm,SVM - Linear Kernel,0.6671,0.0,0.6671,0.7161,0.6591,0.3409,0.3741,0.016
knn,K Neighbors Classifier,0.6525,0.0,0.6525,0.6498,0.6477,0.2709,0.2739,0.361
et,Extra Trees Classifier,0.5897,0.0,0.5897,0.4291,0.4409,0.0045,0.0172,0.034
dt,Decision Tree Classifier,0.5885,0.0,0.5885,0.3463,0.436,0.0,0.0,0.017
rf,Random Forest Classifier,0.5885,0.0,0.5885,0.3463,0.436,0.0,0.0,0.039
qda,Quadratic Discriminant Analysis,0.5885,0.0,0.5885,0.3463,0.436,0.0,0.0,0.017
ada,Ada Boost Classifier,0.5885,0.0,0.5885,0.3463,0.436,0.0,0.0,0.017
gbc,Gradient Boosting Classifier,0.5885,0.0,0.5885,0.3463,0.436,0.0,0.0,0.03


CPU times: total: 4.22 s
Wall time: 20.9 s


In [20]:
s.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [21]:
from pycaret.classification import setup, models, create_model, tune_model, finalize_model, save_model

cls = setup(data, target='Funded', session_id=42, fix_imbalance=True);

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Funded
2,Target type,Binary
3,Target mapping,"no: 0, yes: 1"
4,Original data shape,"(1163, 20)"
5,Transformed data shape,"(1307, 29)"
6,Transformed train set shape,"(958, 29)"
7,Transformed test set shape,"(349, 29)"
8,Numeric features,15
9,Categorical features,4


#### **3.** Third, we create, tune and finalize the model and export the .pkl file

In [22]:
lr = s.create_model('lr')
lr

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7805,0.0,0.7805,0.7794,0.778,0.5399,0.5427
1,0.8049,0.0,0.8049,0.8045,0.8027,0.591,0.5941
2,0.7927,0.0,0.7927,0.792,0.7922,0.5711,0.5713
3,0.7561,0.0,0.7561,0.7543,0.7534,0.4888,0.4914
4,0.7284,0.0,0.7284,0.7264,0.7269,0.4321,0.4327
5,0.7778,0.0,0.7778,0.7763,0.7766,0.5354,0.5361
6,0.7901,0.0,0.7901,0.7924,0.7848,0.5504,0.5601
7,0.8272,0.0,0.8272,0.8263,0.8262,0.6386,0.6395
8,0.7654,0.0,0.7654,0.7659,0.7595,0.4976,0.5063
9,0.7778,0.0,0.7778,0.7769,0.7754,0.5363,0.5391


In [23]:
lr = cls.create_model('lr')
lr

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7927,0.0,0.7927,0.8014,0.7941,0.5819,0.5863
1,0.8049,0.0,0.8049,0.8045,0.8027,0.591,0.5941
2,0.7561,0.0,0.7561,0.7626,0.7576,0.506,0.5085
3,0.7439,0.0,0.7439,0.7451,0.7444,0.4747,0.4748
4,0.7407,0.0,0.7407,0.7421,0.7413,0.4656,0.4657
5,0.7284,0.0,0.7284,0.7548,0.7305,0.4629,0.4774
6,0.7778,0.0,0.7778,0.7778,0.7778,0.5398,0.5398
7,0.8148,0.0,0.8148,0.8191,0.8159,0.6218,0.6236
8,0.8395,0.0,0.8395,0.8436,0.8404,0.6723,0.6742
9,0.7901,0.0,0.7901,0.7987,0.7914,0.5777,0.5822


In [24]:
tune_lr=tune_model(lr)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8049,0.0,0.8049,0.8111,0.8061,0.6048,0.6078
1,0.8049,0.0,0.8049,0.8045,0.8027,0.591,0.5941
2,0.7805,0.0,0.7805,0.7868,0.7818,0.5554,0.5582
3,0.7439,0.0,0.7439,0.7451,0.7444,0.4747,0.4748
4,0.7284,0.0,0.7284,0.7315,0.7295,0.4428,0.4433
5,0.7407,0.0,0.7407,0.7633,0.7429,0.485,0.4973
6,0.7778,0.0,0.7778,0.7805,0.7787,0.5441,0.5448
7,0.8272,0.0,0.8272,0.8296,0.8279,0.6454,0.6462
8,0.8642,0.0,0.8642,0.8729,0.8652,0.7253,0.731
9,0.7778,0.0,0.7778,0.784,0.779,0.5511,0.5539


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [36]:
final_lr = finalize_model(tune_lr)
final_lr

We will save and use this  ***'final_lr'*** to predict the unseen data that will be imported below.

In [37]:
save_model(final_lr, './models/lr_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Year', 'Has FB',
                                              'Backed Projects',
                                              'Previous Projects',
                                              'Creator Desc Len', 'Title Len',
                                              'Goal', 'Duration',
                                              'Pledge Levels',
                                              'Min Pledge Tiers',
                                              'Max Pledge T...
                  TransformerWrapper(exclude=None, include=None,
                                     transformer=CleanColumnNames(match='[\\]\\[\\

#### **4.** Forth, we run the test against *unseen_data* and report on predictions and scores

In [38]:
unseen = pd.read_csv('./data/unseen_data.csv')
unseen.tail(3)

Unnamed: 0,Funded,URL,Title,Year,Month,Type,Has FB,Backed Projects,Previous Projects,Creator Desc Len,Title Len,Goal,Duration,Pledge Levels,Min Pledge Tiers,Max Pledge Tiers,Proj Desc Len,Images,Videos,Has Video
230,no,https://www.kickstarter.com/projects/330311462...,PlastaPrint3D,2016,Mar,Design,0,0,0,510,13,16977.52,30,13,6.0,3086.0,2422,8,2,1
231,no,https://www.kickstarter.com/projects/853681317...,SolidProjection SolidWorks to Unity3D with Sub...,2016,Mar,Software,0,0,0,488,60,766.25,30,5,151.0,151.0,1395,0,1,1
232,yes,https://www.kickstarter.com/projects/901324798...,Penitent,2016,Apr,Video,1,2,2,507,8,1200.0,10,6,10.0,500.0,3206,0,1,1


In [39]:
unseen.shape

(233, 20)

In [40]:
import joblib
import pandas as pd

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
#from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score, root_mean_squared_error, mean_absolute_percentage_error

test = pd.read_csv('./data/unseen_data.csv')
test.shape

(233, 20)

In [41]:
cls = joblib.load('./models/lr_model.pkl')

In [42]:
### Test against the Unseen ###
unseen_X = test.drop('Funded', axis=1)
y = test['Funded']

In [43]:
y_pred = cls.predict(unseen_X)

In [44]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

          no       0.88      0.82      0.85       139
         yes       0.76      0.84      0.80        94

    accuracy                           0.83       233
   macro avg       0.82      0.83      0.82       233
weighted avg       0.83      0.83      0.83       233



In [45]:
### to be able to encode the classification report we can import get_metrics() 

In [46]:
from pycaret.classification import *

In [47]:
get_metrics()

Unnamed: 0_level_0,Name,Display Name,Score Function,Scorer,Target,Args,Greater is Better,Multiclass,Custom
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
acc,Accuracy,Accuracy,<function accuracy_score at 0x0000019C7BA86980>,accuracy,pred,{},True,True,False
auc,AUC,AUC,<pycaret.internal.metrics.BinaryMulticlassScor...,"make_scorer(roc_auc_score, response_method='pr...",pred_proba,"{'average': 'weighted', 'multi_class': 'ovr'}",True,True,False
recall,Recall,Recall,<pycaret.internal.metrics.BinaryMulticlassScor...,"make_scorer(recall_score, response_method='pre...",pred,{'average': 'weighted'},True,True,False
precision,Precision,Prec.,<pycaret.internal.metrics.BinaryMulticlassScor...,"make_scorer(precision_score, response_method='...",pred,{'average': 'weighted'},True,True,False
f1,F1,F1,<pycaret.internal.metrics.BinaryMulticlassScor...,"make_scorer(f1_score, response_method='predict...",pred,{'average': 'weighted'},True,True,False
kappa,Kappa,Kappa,<function cohen_kappa_score at 0x0000019C7BA86...,"make_scorer(cohen_kappa_score, response_method...",pred,{},True,True,False
mcc,MCC,MCC,<function matthews_corrcoef at 0x0000019C7BA86...,"make_scorer(matthews_corrcoef, response_method...",pred,{},True,True,False


1. **Accuracy**: This is the ratio of the number of correct predictions to the total number of predictions. It's used when the classes are balanced. It is calculated as:
    $$\text{Accuracy} = \frac{\text{Number of Correct Predictions}}{\text{Total Number of Predictions}}$$

2. **AUC (Area Under the ROC Curve)**: This metric is used in binary classification. It measures the ability of the classifier to distinguish between classes. An AUC of 1 indicates a perfect classifier, while an AUC of 0.5 suggests a poor classifier.

3. **Recall (Sensitivity or True Positive Rate)**: This is the ratio of the number of true positives divided by the sum of the true positives and the false negatives. It shows how many of the positive samples have been identified correctly. It is calculated as:
    $$\text{Recall} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}$$

4. **Precision (Positive Predictive Value)**: This is the ratio of the number of true positives divided by the sum of the true positives and the false positives. It shows how many of the positively classified samples are actually positive. It is calculated as:
    $$\text{Precision} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}$$

5. **F1 Score**: This is the harmonic mean of Precision and Recall and tries to balance the two. It is calculated as:
    $$\text{F1 Score} = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$$

6. **Kappa (Cohen's Kappa)**: This is used for multi-class classification. It measures the agreement between two raters who each classify items into mutually exclusive categories. A kappa of 1 indicates perfect agreement, while a kappa of 0 indicates agreement equivalent to chance.

7. **MCC (Matthews Correlation Coefficient)**: This is used in binary classification. It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes. It is calculated as:
    $$\text{MCC} = \frac{(\text{TP} \times \text{TN}) - (\text{FP} \times \text{FN})}{\sqrt{(\text{TP} + \text{FP})(\text{TP} + \text{FN})(\text{TN} + \text{FP})(\text{TN} + \text{FN})}}$$
    where TP is True Positive, TN is True Negative, FP is False Positive, and FN is False Negative.

These metrics provide a comprehensive view of the performance of a classification model. Different metrics are suitable for different scenarios and it's important to choose the right metric for the right task. For example, in a highly imbalanced dataset, accuracy might not be a good metric to use, and one might resort to using F1 Score, Precision, Recall, or AUC-ROC instead. Similarly, MCC is a good metric when the classes are of very different sizes.

In [None]:
df.dtypes == 'object'    #this is how we filter on in if we do == ''

In [51]:
df.dtypes == 'object'    #this is how we filter on in if we do == ''

Funded                True
URL                   True
Title                 True
Year                 False
Month                 True
Type                  True
Has FB               False
Backed Projects      False
Previous Projects    False
Creator Desc Len     False
Title Len            False
Goal                 False
Duration             False
Pledge Levels        False
Min Pledge Tiers     False
Max Pledge Tiers     False
Proj Desc Len        False
Images               False
Videos               False
Has Video            False
dtype: bool

In [52]:
df.dtypes != 'object'

Funded               False
URL                  False
Title                False
Year                  True
Month                False
Type                 False
Has FB                True
Backed Projects       True
Previous Projects     True
Creator Desc Len      True
Title Len             True
Goal                  True
Duration              True
Pledge Levels         True
Min Pledge Tiers      True
Max Pledge Tiers      True
Proj Desc Len         True
Images                True
Videos                True
Has Video             True
dtype: bool

In [53]:
df_numeric

Unnamed: 0,Year,Has FB,Backed Projects,Previous Projects,Creator Desc Len,Title Len,Goal,Duration,Pledge Levels,Min Pledge Tiers,Max Pledge Tiers,Proj Desc Len,Images,Videos,Has Video
0,2016,1,11,2,125,57,2829.59,53,7,1.14,171.0,2001,2,1,1
2,2016,0,4,0,294,52,766.25,30,8,1.51,755.0,2325,1,1,1
3,2016,0,0,0,179,41,1439.10,24,5,7.00,141.0,3736,13,1,1
5,2016,0,0,0,162,30,800.00,28,1,10.00,10.0,1199,0,0,0
6,2016,1,2,0,110,55,2000.00,50,9,5.00,500.0,2384,11,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156,2016,1,0,0,55,20,28295.87,40,6,5.00,546.0,4362,11,1,1
1157,2016,0,0,0,96,13,10000.00,60,1,100.00,100.0,2547,0,0,0
1158,2016,1,0,0,53,9,143909.50,30,2,14.00,143.0,508,0,0,0
1161,2016,0,2,0,513,28,3831.25,60,2,0.71,708.0,852,0,1,1


In [55]:
# preprocess the data - create scaler...   
from sklearn.preprocessing import MinMaxScaler   
scaler = MinMaxScaler()

In [56]:
scaled_numeric = scaler.fit_transform(df_numeric)
scaled_numeric

array([[0.        , 1.        , 0.0316092 , ..., 0.02105263, 0.07692308,
        1.        ],
       [0.        , 0.        , 0.01149425, ..., 0.01052632, 0.07692308,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.13684211, 0.07692308,
        1.        ],
       ...,
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00574713, ..., 0.        , 0.07692308,
        1.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [57]:
scaled_numeric.shape     #only the numeric columns

(930, 15)

In [58]:
df_scaled_numeric = pd.DataFrame(scaled_numeric)
df_scaled_numeric.columns = df_numeric.columns
df_scaled_numeric

Unnamed: 0,Year,Has FB,Backed Projects,Previous Projects,Creator Desc Len,Title Len,Goal,Duration,Pledge Levels,Min Pledge Tiers,Max Pledge Tiers,Proj Desc Len,Images,Videos,Has Video
0,0.0,1.0,0.031609,0.068966,0.167336,0.691358,0.000281,0.844828,0.068182,0.000044,0.006810,0.068782,0.021053,0.076923,1.0
1,0.0,0.0,0.011494,0.000000,0.393574,0.629630,0.000075,0.448276,0.079545,0.000081,0.030171,0.080258,0.010526,0.076923,1.0
2,0.0,0.0,0.000000,0.000000,0.239625,0.493827,0.000142,0.344828,0.045455,0.000630,0.005610,0.130233,0.136842,0.076923,1.0
3,0.0,0.0,0.000000,0.000000,0.216867,0.358025,0.000078,0.413793,0.000000,0.000930,0.000370,0.040377,0.000000,0.000000,0.0
4,0.0,1.0,0.005747,0.000000,0.147256,0.666667,0.000198,0.793103,0.090909,0.000430,0.019970,0.082348,0.115789,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
925,0.0,1.0,0.000000,0.000000,0.073628,0.234568,0.002828,0.620690,0.056818,0.000430,0.021810,0.152405,0.115789,0.076923,1.0
926,0.0,0.0,0.000000,0.000000,0.128514,0.148148,0.000998,0.965517,0.000000,0.009931,0.003970,0.088121,0.000000,0.000000,0.0
927,0.0,1.0,0.000000,0.000000,0.070950,0.098765,0.014389,0.448276,0.011364,0.001330,0.005690,0.015903,0.000000,0.000000,0.0
928,0.0,0.0,0.005747,0.000000,0.686747,0.333333,0.000382,0.965517,0.011364,0.000001,0.028290,0.028087,0.000000,0.076923,1.0


### Encode the DATA

In [59]:
#encode any text features that will be features (you do not encode Target / label/ y)
target = df_text['Funded']
df_text.drop(['Funded'], axis=1, inplace=True)

In [60]:
df_scaled_text = pd.get_dummies(df_text)     # dummies are ONE HOT ENCODING, specific variables
df_scaled_text

Unnamed: 0,URL_https://www.kickstarter.com/projects/1001163131/brandys-soy-candles,URL_https://www.kickstarter.com/projects/1002686019/muros,URL_https://www.kickstarter.com/projects/1005151588/my-first-horror-alphabet-book,URL_https://www.kickstarter.com/projects/1005151588/the-little-abc-book-of-horror,URL_https://www.kickstarter.com/projects/1007968674/artistic-collaboration-2016,URL_https://www.kickstarter.com/projects/1008856355/crafts-and-jewelry-etsy-shop-start-up-by-jessilynn,URL_https://www.kickstarter.com/projects/1009586486/slemish-woodcrafts,URL_https://www.kickstarter.com/projects/1013189283/quicktask,URL_https://www.kickstarter.com/projects/1013222245/serum-a-short-film,URL_https://www.kickstarter.com/projects/1015397918/cybersecurity-for-small-businesses,...,Month_Feb,Month_Mar,Month_May,Type_Apps,Type_Art,Type_Crafts,Type_Design,Type_Gadgets,Type_Software,Type_Video
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,True,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1156,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
1157,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
1158,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
1161,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False


In [61]:
df_final = pd.concat([df_scaled_text, df_scaled_numeric], axis=1)    #this is waht statistical algorithm wants to see in machine learning:
df_final     

Unnamed: 0,URL_https://www.kickstarter.com/projects/1001163131/brandys-soy-candles,URL_https://www.kickstarter.com/projects/1002686019/muros,URL_https://www.kickstarter.com/projects/1005151588/my-first-horror-alphabet-book,URL_https://www.kickstarter.com/projects/1005151588/the-little-abc-book-of-horror,URL_https://www.kickstarter.com/projects/1007968674/artistic-collaboration-2016,URL_https://www.kickstarter.com/projects/1008856355/crafts-and-jewelry-etsy-shop-start-up-by-jessilynn,URL_https://www.kickstarter.com/projects/1009586486/slemish-woodcrafts,URL_https://www.kickstarter.com/projects/1013189283/quicktask,URL_https://www.kickstarter.com/projects/1013222245/serum-a-short-film,URL_https://www.kickstarter.com/projects/1015397918/cybersecurity-for-small-businesses,...,Title Len,Goal,Duration,Pledge Levels,Min Pledge Tiers,Max Pledge Tiers,Proj Desc Len,Images,Videos,Has Video
0,False,False,False,False,False,False,False,False,False,False,...,0.691358,0.000281,0.844828,0.068182,0.000044,0.00681,0.068782,0.021053,0.076923,1.0
2,False,False,False,False,False,False,False,False,False,False,...,0.493827,0.000142,0.344828,0.045455,0.000630,0.00561,0.130233,0.136842,0.076923,1.0
3,False,False,False,False,False,False,False,False,False,False,...,0.358025,0.000078,0.413793,0.000000,0.000930,0.00037,0.040377,0.000000,0.000000,0.0
5,False,False,False,False,False,False,False,False,False,False,...,0.703704,0.000098,0.448276,0.022727,0.000030,0.00117,0.054686,0.010526,0.076923,1.0
6,False,False,False,False,False,False,False,False,False,False,...,0.222222,0.000106,0.448276,0.170455,0.000630,0.00625,0.082808,0.094737,0.076923,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904,,,,,,,,,,,...,0.123457,0.000225,0.448276,0.056818,0.001030,0.00905,0.171247,0.052632,0.076923,1.0
907,,,,,,,,,,,...,0.518519,1.000000,0.965517,0.000000,0.000030,0.00001,0.000000,0.000000,0.000000,0.0
912,,,,,,,,,,,...,0.098765,0.003498,0.448276,0.056818,0.000930,0.00997,0.259616,0.000000,0.076923,1.0
925,,,,,,,,,,,...,0.234568,0.002828,0.620690,0.056818,0.000430,0.02181,0.152405,0.115789,0.076923,1.0


### Get the train model scores

In [62]:
y = target
y.shape, y.dtype

((930,), dtype('O'))

In [63]:
y.value_counts()

Funded
no     546
yes    384
Name: count, dtype: int64

In [64]:
X = df_final
X.head()

Unnamed: 0,URL_https://www.kickstarter.com/projects/1001163131/brandys-soy-candles,URL_https://www.kickstarter.com/projects/1002686019/muros,URL_https://www.kickstarter.com/projects/1005151588/my-first-horror-alphabet-book,URL_https://www.kickstarter.com/projects/1005151588/the-little-abc-book-of-horror,URL_https://www.kickstarter.com/projects/1007968674/artistic-collaboration-2016,URL_https://www.kickstarter.com/projects/1008856355/crafts-and-jewelry-etsy-shop-start-up-by-jessilynn,URL_https://www.kickstarter.com/projects/1009586486/slemish-woodcrafts,URL_https://www.kickstarter.com/projects/1013189283/quicktask,URL_https://www.kickstarter.com/projects/1013222245/serum-a-short-film,URL_https://www.kickstarter.com/projects/1015397918/cybersecurity-for-small-businesses,...,Title Len,Goal,Duration,Pledge Levels,Min Pledge Tiers,Max Pledge Tiers,Proj Desc Len,Images,Videos,Has Video
0,False,False,False,False,False,False,False,False,False,False,...,0.691358,0.000281,0.844828,0.068182,4.4e-05,0.00681,0.068782,0.021053,0.076923,1.0
2,False,False,False,False,False,False,False,False,False,False,...,0.493827,0.000142,0.344828,0.045455,0.00063,0.00561,0.130233,0.136842,0.076923,1.0
3,False,False,False,False,False,False,False,False,False,False,...,0.358025,7.8e-05,0.413793,0.0,0.00093,0.00037,0.040377,0.0,0.0,0.0
5,False,False,False,False,False,False,False,False,False,False,...,0.703704,9.8e-05,0.448276,0.022727,3e-05,0.00117,0.054686,0.010526,0.076923,1.0
6,False,False,False,False,False,False,False,False,False,False,...,0.222222,0.000106,0.448276,0.170455,0.00063,0.00625,0.082808,0.094737,0.076923,1.0


In [69]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [1119, 930]

In [67]:
X_train[0:5]

NameError: name 'X_train' is not defined

In [68]:
X_test.head()

NameError: name 'X_test' is not defined

In [None]:
y_train.head()

In [None]:
y_test.head()

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
lgr = LogisticRegression() 

In [None]:
lgr.fit(X_train, y_train)

In [None]:
lgr_preds_train = lgr.predict(X_train)

In [None]:
lgr.score(X_train, y_train)

In [None]:
lgr_cm = confusion_matrix(lgr_preds_train, y_train)
lgr_cm

In [None]:
sns.heatmap(lgr_cm, annot=True, fmt='d')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
#import seaborn as sns

In [None]:
from sklearn.metrics import classification_report
print(classification_report(X, y_pred))

### BONUS 'TIP of a DAY'

#### Difference between predict. and predict.model()

In [None]:
from pycaret.classification import ClassificationExperiment
cls_oop = ClassificationExperiment()      # it is not real set up , but just an experiment itself

In [None]:
cls_oop

In [None]:
cls_oop.predict_model(final_lr, unseen)       #because this brings back whole dataframe, put it into df

In [None]:
# this is like 1 hot ecoding, it is original data with actual = Feedback and the predictive label that is predicted actual and pred scores

In [None]:
df_oop = cls_oop.predict_model(final_lr, unseen) 

In [None]:
y_true = df_oop['Funded']
y_pred = df_oop['prediction_label']

In [None]:
#df_oop[df_oop['Feedback'] != df_oop['prediction_label']]   #look how the 99 had an 85% prediction score ti will be wirgh, but it still predicted wrong!

In [None]:
df_oop.sort_values(['prediction_score'])    #the whole enchilada

In [None]:
type(y_true.dtype), type(y_pred.dtype), y_true.dtype, y_pred.dtype, type(y_true[0]), type (y_pred[0]) 

In [None]:
cls_oop      #it returns classification experiment

### FINDINGS

In [None]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
cls = final_lr
unseen_X = unseen.drop('Funded', axis=1)
y_true = unseen['Funded']

In [None]:
y_pred = cls.predict(unseen_X)       # in the settings it does say it returns array
# y_true, y_pred # once you have pred_y and true_y you can go straigh to scoring

In [None]:
unseen_X.shape, y_pred.shape

In [None]:
classification_report(unseen['Funded'], cls.predict(unseen_X))    # better output if print()

In [None]:
report = classification_report(y_true, y_pred)   #Shitf+Tab will show what it will show
print(report)

#### ALWAYS STOP AND LOOK AT WHAT YOU GET: STRING, OBJECT ECT...

In [None]:
y_true.dtype, y_pred.dtype  , #(y_true[0])    # mine and Jumanas pred_y was dataframe, where Pats was object

In [None]:
type(y_true[0]), type (y_pred[0]) 

In [None]:
type(y_true.dtype), type(y_pred.dtype)      # Pat's are numpy arrays

## Functional API vs Object Oriented Programming (OOP) API

In [None]:
### With Object Oriented OOP we can run different experiments at same time, like in one we used balancing and on one didnt

In [None]:
from pycaret.classification import ClassificationExperiment
s1 = ClassificationExperiment()      # it is not real set up , but just an experiment itself
s2 = ClassificationExperiment()
s1.setup(data, target='Funded', session_id=42, fix_imbalance=True);             # we can compare this 2 set up, with fix imbalance or not
s2.setup(data, target='Funded', session_id=42, fix_imbalance=False);

In [None]:
accuracy_score(y_true, y_pred)

In [None]:
precision_score(y_true, y_pred, pos_label='yes')

In [None]:
recall_score(y_true, y_pred, pos_label='yes')

In [None]:
cm = confusion_matrix(y_true, y_pred)
cm

In [None]:
ax = sns.heatmap(cm, annot=True, cmap='flare')    #all this has to be in the same cell and we take control of my axis
ax.set_xticklabels(['Negative', 'Positive'])
ax.set_xticklabels(['Negative', 'Positive'])
plt.xlabel('Predict Labels')
plt.ylabel('True Labels')
plt.title('Unseen Dataset Confusion Matrix')

In [None]:
# the way to test the results
df = pd.DataFrame({'Pred':y_pred, 'True':y_true})

In [None]:
# OOP API
kmeans = s.create_model('kmeans', num_clusters=4)

In [None]:
diff = df[df['Pred'] != df['True']]
diff['Pred'].value_counts()

In [None]:
from pycaret.classification import *
plot_model(lr, plot = 'class_report')

### SMOTE (Synthetic Minority Oversampling Technique) is a type of DATA AUGEMENTATION TECHNIQUE

In [None]:
from pycaret.classification import setup, models, create_model, tune_model, finalize_model, save_model, evaluate_model
cls = setup(data, target='Funded', session_id=42, fix_imbalance=True, fix_imbalance_method= 'SMOTE')