# Event Tracking on WikiHow Website

In [220]:
import os
import string

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## Data Collection

In [3]:
# Processed trend CSVs live under <parent of the working directory>/data/processed/trend/
_parent_dir = os.path.dirname(os.getcwd())
DATA_PATH = _parent_dir + '/data/processed/trend/'

In [149]:
# Read the crawled trend CSVs from DATA_PATH into a single DataFrame.
# DataFrame.append was removed in pandas 2.0, so the per-file frames are
# collected in a list and concatenated once.
frames = []
for file in os.listdir(DATA_PATH):
    frames.append(pd.read_csv(os.path.join(DATA_PATH, file), index_col=0))
    # NOTE(review): only the first listed file is loaded; os.listdir order is
    # arbitrary, so which file that is may vary. Remove the `break` to load all files.
    break
# pd.concat raises on an empty list, so fall back to an empty frame like the original did.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [150]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,date_crawled,title,date_published,date_modified,n_views,n_votes,mean_votes,description,steps
0,2020-03-28,How to Make Hand Sanitizer: 8 Steps (with Pict...,2008-03-09,2020-03-20,1962384,92,91,"\nWashing your hands with soap is better, but ...","[{'@type': 'HowToSection', 'name': 'Alcohol-Ba..."
1,2020-03-28,3 Ways to Play UNO - wikiHow,2007-07-28,2020-03-27,2877878,200,74,\nIf you're looking for a fun card game to pla...,"[{'@type': 'HowToSection', 'name': 'Jumping in..."
2,2020-03-28,3 Easy Ways to Stay Productive While Working f...,2020-03-13,2020-03-26,3742,9,78,\nWorking from home offers a wonderful level o...,"[{'@type': 'HowToSection', 'name': 'Organizing..."
3,2020-03-28,How to Make Disinfectant Wipes: 14 Steps (with...,2020-03-12,2020-03-25,5348,5,100,\nDisposable disinfecting wipes offer a quick ...,"[{'@type': 'HowToSection', 'name': 'Creating S..."
4,2020-03-28,4 Easy Ways to Take the U.S. Census - wikiHow,2020-03-06,2020-03-23,6608,5,100,"\nEvery 10 years, the U.S. government carries ...","[{'@type': 'HowToSection', 'name': 'Responding..."


## Feature Selection and Extraction

ref: https://towardsdatascience.com/text-preprocessing-steps-and-universal-pipeline-94233cb6725a?gi=3ae959754bd9


In [168]:
# 0- concatenate title and description, the only two fields used for clustering.
# An explicit space keeps the last word of the title from fusing with the first
# word of the description when the description has no leading whitespace.
df['text'] = df['title'] + ' ' + df['description']

In [169]:
# One-time setup: uncomment to download the NLTK 'punkt' tokenizer models used by word_tokenize
# import nltk
# nltk.download('punkt')

In [170]:
# 1- tokenize the combined title and description into word tokens
df['text'] = df['text'].map(word_tokenize)

In [171]:
# 2- convert every token to lower case
df['text'] = df['text'].map(lambda tokens: list(map(str.lower, tokens)))

In [172]:
# 3- strip punctuation characters from inside every token
# A translation table that maps each ASCII punctuation character to None (i.e. deletes it).
table = str.maketrans(dict.fromkeys(string.punctuation))
df['text'] = df['text'].map(lambda tokens: [tok.translate(table) for tok in tokens])

In [173]:
# 4- keep only purely alphabetic tokens (drops numbers and the empty strings left by step 3)
df['text'] = df['text'].map(lambda tokens: list(filter(str.isalpha, tokens)))

In [174]:
# One-time setup: uncomment to download the NLTK stop-word corpus
# import nltk
# nltk.download('stopwords')

In [175]:
# 5- remove stop words: the NLTK English list plus extra words added for this corpus
stop_words = set(stopwords.words('english')) | {
    'make', 'steps', 'step', 'easy', 'ways', 'way', 'wikihow', 'also',
}
df['text'] = df['text'].map(lambda tokens: [tok for tok in tokens if tok not in stop_words])

In [183]:
# 6- normalize text by reducing every token to its Porter stem
porter = PorterStemmer()
df['text'] = df['text'].map(lambda tokens: list(map(porter.stem, tokens)))

In [184]:
# TF-IDF vectorization: rebuild one space-joined document per row, then fit and transform
tfidf = TfidfVectorizer()
documents = df['text'].map(" ".join)
features_vector = tfidf.fit_transform(documents)

In [186]:
# Dense TF-IDF matrix as a DataFrame with one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
features_df = pd.DataFrame(features_vector.toarray(), columns=_get_feature_names())

## Clustering

ref: https://medium.com/@adriensieg/text-similarities-da019229c894

In [241]:
# choose the optimum number of clusters
def get_cluster_number(df, min_clusters=2, max_clusters=5, random_state=0):
    """Return the KMeans cluster count with the highest silhouette score.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Feature matrix to cluster. The matrix passed in is the one that is
        fitted and scored (the function no longer reads a global).
    min_clusters : int, default 2
        Smallest candidate cluster count (values below 2 are raised to 2).
    max_clusters : int, default 5
        Largest candidate cluster count, inclusive. Together with
        ``min_clusters`` the defaults give the candidates 2, 3, 4 and 5.
    random_state : int, default 0
        Seed forwarded to ``KMeans``.

    Returns
    -------
    int
        The candidate with the best silhouette score; ties go to the
        smallest candidate.

    Raises
    ------
    ValueError
        If the data has too few samples for any candidate to be scored.
    """
    n_samples = df.shape[0] if hasattr(df, "shape") else len(df)
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
    lower = max(min_clusters, 2)
    upper = min(max_clusters, n_samples - 1)
    if upper < lower:
        raise ValueError(
            "Need at least {} samples to evaluate {} clusters, got {}".format(
                lower + 1, lower, n_samples
            )
        )

    score_list = []
    for c in range(lower, upper + 1):
        kmeans = KMeans(n_clusters=c, random_state=random_state).fit(df)
        score_list.append((c, silhouette_score(df, kmeans.labels_)))
    # max() returns the first best entry, so ties resolve to the smallest cluster count
    return max(score_list, key=lambda item: item[1])[0]

In [246]:
# Fit the final model with the silhouette-selected cluster count and label every article
best_k = get_cluster_number(features_df)
kmeans = KMeans(n_clusters=best_k, random_state=0)
kmeans.fit(features_df)
df['class_label'] = kmeans.labels_

In [247]:
# Show a bounded preview of the labelled articles instead of dumping the whole frame
df.head(10)

Unnamed: 0,date_crawled,title,date_published,date_modified,n_views,n_votes,mean_votes,description,steps,text,class_label
0,2020-03-28,How to Make Hand Sanitizer: 8 Steps (with Pict...,2008-03-09,2020-03-20,1962384,92,91,"\nWashing your hands with soap is better, but ...","[{'@type': 'HowToSection', 'name': 'Alcohol-Ba...","[hand, sanit, pictur, wash, hand, soap, better...",2
1,2020-03-28,3 Ways to Play UNO - wikiHow,2007-07-28,2020-03-27,2877878,200,74,\nIf you're looking for a fun card game to pla...,"[{'@type': 'HowToSection', 'name': 'Jumping in...","[play, uno, look, fun, card, game, play, frien...",1
2,2020-03-28,3 Easy Ways to Stay Productive While Working f...,2020-03-13,2020-03-26,3742,9,78,\nWorking from home offers a wonderful level o...,"[{'@type': 'HowToSection', 'name': 'Organizing...","[stay, product, work, home, work, home, offer,...",0
3,2020-03-28,How to Make Disinfectant Wipes: 14 Steps (with...,2020-03-12,2020-03-25,5348,5,100,\nDisposable disinfecting wipes offer a quick ...,"[{'@type': 'HowToSection', 'name': 'Creating S...","[disinfect, wipe, pictur, dispos, disinfect, w...",2
4,2020-03-28,4 Easy Ways to Take the U.S. Census - wikiHow,2020-03-06,2020-03-23,6608,5,100,"\nEvery 10 years, the U.S. government carries ...","[{'@type': 'HowToSection', 'name': 'Responding...","[take, us, censu, everi, year, us, govern, car...",0
5,2020-03-28,Easy Ways to Disinfect Your Devices: 11 Steps ...,2020-03-16,2020-03-27,2634,0,0,\nWith the coronavirus COVID-19 making its way...,"[{'@type': 'HowToSection', 'name': 'Disinfecti...","[disinfect, devic, coronaviru, make, commun, a...",2


In [249]:
# Number of clusters in the fitted model (one centroid row per cluster)
kmeans.cluster_centers_.shape[0]

3