# <center>Predicting The Success of Crowdfunfing Projects on Kickstarter </center>


### Import Libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")
import os

### Load Dataset


In [2]:
data = pd.read_csv("./ks-projects-201801.csv")
data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [3]:
data.tail()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
378656,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17,50000.0,2014-09-17 02:35:30,25.0,canceled,1,US,25.0,25.0,50000.0
378657,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19,1500.0,2011-06-22 03:35:14,155.0,failed,5,US,155.0,155.0,1500.0
378658,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16,15000.0,2010-07-01 19:40:30,20.0,failed,1,US,20.0,20.0,15000.0
378659,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13,15000.0,2016-01-13 18:13:53,200.0,failed,6,US,200.0,200.0,15000.0
378660,999988282,Nou Renmen Ayiti! We Love Haiti!,Performance Art,Art,USD,2011-08-16,2000.0,2011-07-19 09:07:47,524.0,failed,17,US,524.0,524.0,2000.0


### EDA


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                378661 non-null  int64  
 1   name              378657 non-null  object 
 2   category          378661 non-null  object 
 3   main_category     378661 non-null  object 
 4   currency          378661 non-null  object 
 5   deadline          378661 non-null  object 
 6   goal              378661 non-null  float64
 7   launched          378661 non-null  object 
 8   pledged           378661 non-null  float64
 9   state             378661 non-null  object 
 10  backers           378661 non-null  int64  
 11  country           378661 non-null  object 
 12  usd pledged       374864 non-null  float64
 13  usd_pledged_real  378661 non-null  float64
 14  usd_goal_real     378661 non-null  float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [5]:
data.isnull().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

#### Handling missing values


In [6]:
# Replace missing values in the 'usd pledged' column with the mean value
data["usd pledged"] = data["usd pledged"].fillna(data["usd pledged"].mean())

In [7]:
data.isnull().sum()

ID                  0
name                4
category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
dtype: int64

#### Distribution of Projects By States


In [8]:
state_counts = data["state"].value_counts().reset_index()
state_counts.columns = ["state", "count"]
fig1 = px.bar(
    state_counts,
    x="state",
    y="count",
    color="state",
    title="Distribution of Projects by State",
    text_auto=True,
    labels={"state": "State", "count": "Number of Projects"},
)
fig1.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    marker_line_width=1,
    marker_line_color="black",
)
fig1.update_layout(showlegend=False)
fig1.show()

In [9]:
fig2 = px.pie(
    state_counts,
    values="count",
    names="state",
    title="Distribution of Projects by State",
)
fig2.update_traces(textfont_size=12, marker_line_width=1, marker_line_color="black")
fig2.update_layout(legend_title="State")
fig2.show()

In [10]:
df = data.loc[data["state"].isin(["failed", "successful"])]
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0


In [11]:
df["state"].value_counts()

state
failed        197719
successful    133956
Name: count, dtype: int64

#### Distribution By Categories


In [12]:
df["main_category"].describe()

count           331675
unique              15
top       Film & Video
freq             56527
Name: main_category, dtype: object

In [13]:
df["category"].describe()

count             331675
unique               159
top       Product Design
freq               18680
Name: category, dtype: object

In [14]:
category_state_counts = df.groupby(["main_category", "state"]).size().reset_index()
category_state_counts.columns = ["main_category", "state", "count"]
fig3 = px.bar(
    category_state_counts,
    x="main_category",
    y="count",
    color="state",
    barmode="group",
    title="Distribution of Projects by Main Categories and States",
    text_auto=True,
    labels={"main_category": "Category", "count": "Number of Projects"},
)
fig3.update_traces(textfont_size=12, marker_line_width=1, marker_line_color="black")
fig3.update_layout(legend_title="State")
fig3.show()

In [None]:
rate_success_cat = (
    df[df["state"] == "successful"].groupby(["main_category"]).count()["ID"]
    / df.groupby(["main_category"]).count()["ID"]
    * 100
)
rate_failed_cat = (
    df[df["state"] == "failed"].groupby(["main_category"]).count()["ID"]
    / df.groupby(["main_category"]).count()["ID"]
    * 100
)

success_df = rate_success_cat.reset_index()
success_df.columns = ["main_category", "rate"]
success_df["state"] = "successful"

failed_df = rate_failed_cat.reset_index()
failed_df.columns = ["main_category", "rate"]
failed_df["state"] = "failed"

rate_df = pd.concat([success_df, failed_df])

fig4 = px.bar(
    rate_df,
    x="main_category",
    y="rate",
    color="state",
    title="Percentages of Successful and Failed Projects by Main Category",
    labels={"main_category": "Main Category", "rate": "Percentage", "state": "State"},
)

fig4.update_layout(barmode="stack")

fig4.update_traces(textfont_size=12, marker_line_width=1, marker_line_color="black")

fig4.show()

In [16]:
top_categories = df["category"].value_counts().head(20).index
top_categories_df = df.loc[df["category"].isin(top_categories)]
top_categories_state_counts = (
    top_categories_df.groupby(["category", "state"]).size().reset_index()
)
top_categories_state_counts.columns = ["category", "state", "count"]
fig4 = px.bar(
    top_categories_state_counts,
    x="category",
    y="count",
    color="state",
    barmode="group",
    title="Distribution of Top 20 Categories by States",
    text_auto=True,
    labels={"category": "Category", "count": "Number of Projects"},
)
fig4.update_traces(textfont_size=12, marker_line_width=1, marker_line_color="black")
fig4.update_layout(legend_title="State")
fig4.show()

In [17]:
successful_projects = df.loc[df["state"] == "successful"]
successful_categories = successful_projects["category"].value_counts().head(20).index
successful_categories_df = successful_projects.loc[
    successful_projects["category"].isin(successful_categories)
]
successful_categories_counts = (
    successful_categories_df["category"].value_counts().reset_index()
)
successful_categories_counts.columns = ["category", "count"]
fig5 = px.bar(
    successful_categories_counts,
    x="category",
    y="count",
    title="Distribution of Top 20 Categories of Successful Projects",
    text_auto=True,
    labels={"category": "Category", "count": "Number of Successful Projects"},
)
fig5.update_traces(textfont_size=12, marker_line_width=1, marker_line_color="black")
fig5.show()

In [18]:
failed_projects = df.loc[df["state"] == "failed"]
failed_categories = failed_projects["category"].value_counts().head(20).index
failed_categories_df = failed_projects.loc[
    failed_projects["category"].isin(failed_categories)
]
failed_categories_counts = failed_categories_df["category"].value_counts().reset_index()
failed_categories_counts.columns = ["category", "count"]
fig6 = px.bar(
    failed_categories_counts,
    x="category",
    y="count",
    title="Distribution of Top 20 Categories of Failed Projects",
    text_auto=True,
    labels={"category": "Category", "count": "Number of Failed Projects"},
)
fig6.update_traces(
    textfont_size=12,
    marker_line_width=1,
    marker_line_color="black",
    marker_color="#EF553B",
    textposition="inside",
    textfont_color="white",
)
fig6.show()

In [19]:
from plotly.subplots import make_subplots

fig = px.bar(
    successful_categories_counts,
    x="category",
    y="count",
    title="Distribution of Top 20 Categories of Successful Projects",
    labels={"category": "Category", "count": "Number of Successful Projects"},
    text_auto=True,
)

fig2 = px.bar(
    failed_categories_counts,
    x="category",
    y="count",
    title="Distribution of Top 20 Categories of Failed Projects",
    labels={"category": "Category", "count": "Number of Failed Projects"},
    text_auto=True,
)

fig.update_traces(marker_line_width=1, marker_line_color="black")
fig2.update_traces(
    marker_line_width=1, marker_line_color="black", marker_color="#EF553B"
)


fig_combined = make_subplots(rows=1, cols=2, subplot_titles=("Successful", "Failed"))

for trace in fig.data:
    fig_combined.add_trace(trace, row=1, col=1)

for trace in fig2.data:
    fig_combined.add_trace(trace, row=1, col=2)

fig_combined.update_layout(
    height=600,
    width=1200,
    title_text="Distribution of Top 20 Categories in Successful & Failed Projects",
)

fig_combined.show()

#### By Duration


In [20]:
# Create a new feature 'duration' to calculate the duration of each project in days
df["launched"] = pd.to_datetime(df["launched"])
df["deadline"] = pd.to_datetime(df["deadline"])
df["duration"] = (df["deadline"] - df["launched"]).dt.days + 1

In [21]:
df["duration"].describe()

count    331675.000000
mean         33.954902
std          12.713329
min           1.000000
25%          30.000000
50%          30.000000
75%          36.000000
max          92.000000
Name: duration, dtype: float64

In [22]:
mean_duration_failed = df[df["state"] == "failed"]["duration"].mean()
mean_duration_successful = df[df["state"] == "successful"]["duration"].mean()

print("Mean duration of failed projects:", mean_duration_failed)
print("Mean duration of successful projects:", mean_duration_successful)

Mean duration of failed projects: 35.17335208047785
Mean duration of successful projects: 32.15646928842306


In [23]:
mean_duration = df.groupby("state")["duration"].mean().reset_index()
mean_duration.columns = ["state", "mean_duration"]
fig7 = px.bar(
    mean_duration,
    x="state",
    y="mean_duration",
    color="state",
    title="Mean Duration (Days) of Projects by State",
    text_auto=True,
    labels={"state": "State", "mean_duration": "Mean Duration"},
)
fig7.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    marker_line_width=1,
    marker_line_color="black",
)
fig7.update_layout(showlegend=False)
fig7.show()

In [24]:
df["launch_year"] = pd.to_datetime(df["launched"], format="mixed").dt.year
df["launch_year"].head()

0    2015
1    2017
2    2013
3    2012
5    2016
Name: launch_year, dtype: int32

In [25]:
df["launch_month"] = pd.to_datetime(df["launched"], format="mixed").dt.month

#### By Currency


In [26]:
df["currency"].describe()

count     331675
unique        14
top          USD
freq      261511
Name: currency, dtype: object

In [27]:
currency_counts = df["currency"].value_counts().reset_index()
currency_counts.columns = ["currency", "count"]
fig8 = px.bar(
    currency_counts,
    x="currency",
    y="count",
    color="currency",
    labels={"currency": "Currency", "count": "Number of Projects"},
    title="Distribution of Projects by Currency",
)
fig8.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    marker_line_width=1,
    marker_line_color="black",
)
fig8.update_layout(showlegend=False)
fig8.show()

#### By Country


In [28]:
country_counts = data["country"].value_counts().reset_index()
country_counts.columns = ["country", "count"]
fig9 = px.bar(
    country_counts,
    x="country",
    y="count",
    color="country",
    labels={"country": "Countries", "count": "Number of Projects"},
    title="Distribution of Projects by Countries",
)
fig9.update_layout(showlegend=False)
fig9.update_traces(textfont_size=12, marker_line_width=1, marker_line_color="black")
fig9.show()

In [29]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,duration,launch_year,launch_month
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,59,2015,8
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,60,2017,9
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,45,2013,1
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,30,2012,3
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0,35,2016,2


In [30]:
df = df.drop(["ID", "deadline", "launched", "launch_year", "name"], axis=1)

#### Encoding


In [31]:
df.state = df.state.map({"failed": 0, "successful": 1})

In [32]:
df["state"].value_counts()

state
0    197719
1    133956
Name: count, dtype: int64

#### Get Dummies


In [33]:
df = pd.get_dummies(
    df,
    columns=["category", "main_category", "currency", "country"],
    prefix=["cat", "main_cat", "currency", "country"],
    drop_first=True,
)

In [34]:
df.head()

Unnamed: 0,goal,pledged,state,backers,usd pledged,usd_pledged_real,usd_goal_real,duration,launch_month,cat_Academic,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,1000.0,0.0,0,0,0.0,0.0,1533.95,59,8,False,...,False,False,False,False,False,False,False,False,False,False
1,30000.0,2421.0,0,15,100.0,2421.0,30000.0,60,9,False,...,False,False,False,False,False,False,False,False,False,True
2,45000.0,220.0,0,3,220.0,220.0,45000.0,45,1,False,...,False,False,False,False,False,False,False,False,False,True
3,5000.0,1.0,0,1,1.0,1.0,5000.0,30,3,False,...,False,False,False,False,False,False,False,False,False,True
5,50000.0,52375.0,1,224,52375.0,52375.0,50000.0,35,2,False,...,False,False,False,False,False,False,False,False,False,True


#### Scaling


In [None]:
from sklearn.preprocessing import minmax_scale

In [41]:
num_cols = ["usd_goal_real", "duration"]

for col in num_cols:
    df[col] = minmax_scale(df[col], feature_range=(0, 1))

In [42]:
df.head()

Unnamed: 0,goal,pledged,state,backers,usd pledged,usd_pledged_real,usd_goal_real,duration,launch_month,cat_Academic,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,1000.0,0.0,0,0,0.0,0.0,9e-06,0.637363,8,False,...,False,False,False,False,False,False,False,False,False,False
1,30000.0,2421.0,0,15,100.0,2421.0,0.00018,0.648352,9,False,...,False,False,False,False,False,False,False,False,False,True
2,45000.0,220.0,0,3,220.0,220.0,0.00027,0.483516,1,False,...,False,False,False,False,False,False,False,False,False,True
3,5000.0,1.0,0,1,1.0,1.0,3e-05,0.318681,3,False,...,False,False,False,False,False,False,False,False,False,True
5,50000.0,52375.0,1,224,52375.0,52375.0,0.000301,0.373626,2,False,...,False,False,False,False,False,False,False,False,False,True


### Import ML Libraries


In [43]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

#### Train Test Split


In [44]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("state", axis=1), df["state"], test_size=0.20, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42
)

In [45]:
print(f"Shape train: {X_train.shape}")
print(f"Shape validation: {X_val.shape}")
print(f"Shape test: {X_test.shape}")

Shape train: (225539, 215)
Shape validation: (39801, 215)
Shape test: (66335, 215)
