In [1]:
import re

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from langdetect import DetectorFactory
from langdetect import detect
from readability import Readability
from tqdm.notebook import tqdm
tqdm.pandas()

# Data

In [3]:
# Load the raw Kickstarter dataset from the CSV file in the parent directory.
df_kickstarter = pd.read_csv(filepath_or_buffer='../kickstarter.csv')

# Functions

In [None]:
def numWords(text):
    """Count the words in ``text`` with the readability analyzer.

    Args:
        text: The document to analyse.

    Returns:
        The ``'num_words'`` entry of ``Readability(text).statistics()``,
        or ``None`` when the text cannot be analysed.
    """
    try:
        # Construct the analyzer inside the ``try`` so that a failure while
        # building it (not only while reading the statistics) yields None
        # instead of aborting the whole ``apply`` run.
        return Readability(text).statistics()['num_words']
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that a kernel
        # interrupt (KeyboardInterrupt) still stops a long-running apply.
        return None

In [None]:
def removeHTML(html_str):
    """Return the text content of ``html_str`` with the markup stripped.

    The argument is coerced with ``str()`` before parsing, so non-string
    values are parsed through their string form.
    """
    markup = str(html_str)
    parsed = BeautifulSoup(markup, features="html.parser")
    plain_text = parsed.get_text()
    return plain_text

In [None]:
def remove_urls(document):
    """Delete every URL-like token from ``document``.

    The input is converted with ``str()`` first; each run of non-whitespace
    characters that begins with ``http`` is then removed.
    """
    url_token = re.compile(r'http\S+')
    return url_token.sub('', str(document))

In [None]:
def remove_parting_lines(document):
    """Blank out separator lines such as ``*****``, ``-----`` or ``#####``.

    A line is emptied when it starts with five or more ``*``, ``-`` or ``#``
    characters; any text following the markers on that line is removed too.
    The line break itself is kept. The input is coerced with ``str()``.

    Args:
        document: The text to clean.

    Returns:
        The text with every matching line replaced by an empty line.
    """
    # ``.*`` (not ``.+``) after the markers: a separator made of exactly five
    # marker characters has nothing left for ``.+`` to consume and would
    # otherwise be left in place.
    pattern = r'^[\*\-#]{5,}.*$'
    return re.sub(pattern, '', str(document), flags=re.MULTILINE)

In [None]:
def detectLang(t):
    """Return the language code that ``detect`` reports for ``t``.

    Args:
        t: The text whose language should be detected.

    Returns:
        Whatever ``detect(t)`` returns, or ``None`` if detection raises.
    """
    try:
        return detect(t)
    except Exception:
        # Best effort: undetectable text maps to None. Catching ``Exception``
        # instead of using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate, so a long apply can still be interrupted.
        return None

# General Cleaning

In [None]:
# Keep only the projects whose 'project_country' is "US".
is_us_project = df_kickstarter['project_country'] == "US"
df_kickstarter = df_kickstarter.loc[is_us_project]

In [None]:
# Keep only the projects whose 'project_currency' is "USD".
is_usd_project = df_kickstarter['project_currency'] == "USD"
df_kickstarter = df_kickstarter.loc[is_usd_project]

In [None]:
# Discard every row that has no value in 'project_title'.
has_title = df_kickstarter['project_title'].notna()
df_kickstarter = df_kickstarter.loc[has_title]

In [None]:
# Drop rows with missing data in column: 'project_description'.
# The cleaning helpers coerce their input with str(), so a missing description
# that is not removed here would turn into literal text instead of staying empty.
df_kickstarter = df_kickstarter.dropna(subset=['project_description'])

In [None]:
# Keep only the first row for each distinct 'project_description'.
is_repeat = df_kickstarter.duplicated(subset=['project_description'])
df_kickstarter = df_kickstarter[~is_repeat]

In [None]:
# Replace each description with its markup-free text.
descriptions_with_html = df_kickstarter['project_description']
df_kickstarter['project_description'] = descriptions_with_html.apply(removeHTML)

In [None]:
# Remove URL tokens from each description.
descriptions_with_urls = df_kickstarter['project_description']
df_kickstarter['project_description'] = descriptions_with_urls.apply(remove_urls)

In [None]:
# Blank out separator lines (runs of '*', '-' or '#') in each description.
descriptions_with_separators = df_kickstarter['project_description']
df_kickstarter['project_description'] = descriptions_with_separators.apply(remove_parting_lines)

In [None]:
# Fetch the NLTK 'punkt' resource before counting words
# (presumably required by the tokenizer behind numWords - TODO confirm).
nltk.download('punkt')
df_kickstarter['num_words'] = df_kickstarter['project_description'].progress_apply(numWords)
# Keep descriptions of 100 to 2000 words, both bounds inclusive. The upper bound
# trims outliers: very long project descriptions might affect topic modelling negatively.
word_count_in_range = df_kickstarter['num_words'].between(100, 2000)
df_kickstarter = df_kickstarter[word_count_in_range]

In [None]:
# langdetect is non-deterministic unless its factory is seeded; pin the seed
# so that re-running the notebook keeps the same set of rows.
DetectorFactory.seed = 0
df_kickstarter['project_lang'] = df_kickstarter['project_description'].progress_apply(detectLang)
# Keep only the descriptions detected as English.
df_kickstarter = df_kickstarter[df_kickstarter['project_lang'] == 'en']

In [None]:
# Write the cleaned frame (index included) next to the notebook.
df_kickstarter.to_csv(path_or_buf='./kickstarter_cleaned.csv')