In [None]:
import pandas as pd
import os
# Pickled input DataFrame, resolved relative to the current working directory
# (the file name suggests it holds Reddit posts).
file = "reddit_dataframe.pkl"
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df1 = pd.read_pickle(file)

In [None]:
# The full dataset is large and slow to process, so only the first 100 rows
# are used here.
# Slice first, then copy: this copies 100 rows instead of the whole frame, and
# `dfs` becomes an independent DataFrame, so the column assignments made later
# do not act on a slice of another frame (which can raise pandas'
# SettingWithCopyWarning).
dfs = df1.iloc[0:100, :].copy()

In [None]:
import re

# Characters counted as "suspicious" by impurity(): & # < > { } [ ] and backslash.
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')

def impurity(text, min_len=10):
    """Return the share of suspicious characters in a text.

    Parameters
    ----------
    text : str or None
        Text to score. Anything that is not a string (None, or the float NaN
        pandas uses for missing cells) is scored as 0 instead of raising.
    min_len : int, default 10
        Texts shorter than this many characters are scored as 0.

    Returns
    -------
    int or float
        0 for missing, empty or too-short input; otherwise the number of
        characters matched by RE_SUSPICIOUS divided by len(text).
    """
    # Missing values carry no text to score (and have no usable len()).
    if not isinstance(text, str):
        return 0
    # The emptiness check guards the division below when min_len <= 0.
    if not text or len(text) < min_len:
        return 0
    return len(RE_SUSPICIOUS.findall(text)) / len(text)
    
# Score every raw post; posts shorter than 10 characters are scored 0.
dfs['impurity'] = dfs['text'].apply(lambda raw: impurity(raw, min_len=10))

# Show the three posts with the highest impurity score.
(
    dfs[['text', 'impurity']]
    .sort_values(by='impurity', ascending=False)
    .head(3)
)

In [None]:
# Show the raw text stored under key 68 of the 'text' column
# (a label lookup if the index is integer-based -- TODO confirm the index type).
dfs['text'][68]

In [None]:
######################################################
###Part II: Removing Noise with Regular Expressions###
######################################################
# Remark: html.unescape turns HTML character references back into the
# characters they stand for.
import html
p = '&lt;abc&gt;' # &lt; and &gt; are the HTML escapes for '<' and '>'
# (escapes like these do not show up in the example text)
txt= html.unescape(p)
print (txt)

import html

def clean(text):
    """Remove markup remnants and other noise from a raw text.

    Steps, in order: decode HTML escapes, blank out <...> tags, markdown
    links and [...] bracket expressions, blank out standalone runs of
    special characters and of hyphen/equals/plus signs, then collapse all
    whitespace runs to single spaces.

    Parameters
    ----------
    text : str
        Raw text; must be a string (html.unescape and re.sub are applied).

    Returns
    -------
    str
        The cleaned text with leading and trailing whitespace stripped.
    """
    # convert html escapes like &amp; to characters
    text = html.unescape(text)
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    # NOTE(review): the capture group is unused, so the link text is dropped
    # together with the URL -- confirm this is intended.
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', ' ', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool.
    # The trailing boundary is a lookahead so the whitespace after a match is
    # not consumed; otherwise the second of two adjacent runs ("& #") has no
    # leading whitespace left to match against and survives.
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?=\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or == (same lookahead reasoning)
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?=\s|$)', ' ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


# Clean every raw post, then re-score the cleaned text
# (this time texts shorter than 20 characters are scored 0).
dfs['clean_text'] = dfs['text'].apply(clean)
dfs['impurity'] = dfs['clean_text'].apply(
    lambda cleaned: impurity(cleaned, min_len=20)
)

# The three cleaned posts that still score highest.
(
    dfs[['clean_text', 'impurity']]
    .sort_values(by='impurity', ascending=False)
    .head(3)
)


In [None]:
####################################################
###Part III: Character Normalization with textacy###
####################################################
import textacy.preprocessing as tprep
#you need to install textacy
def normalize(text):
    """Run a text through four textacy preprocessing steps, in order.

    The steps are textacy's hyphenated-word, quotation-mark and unicode
    normalizers, followed by its accent remover. Each step receives the
    output of the previous one; the final result is returned.
    """
    pipeline = (
        tprep.normalize.hyphenated_words,
        tprep.normalize.quotation_marks,
        tprep.normalize.unicode,
        tprep.remove.accents,
    )
    for step in pipeline:
        text = step(text)
    return text

# Normalize the cleaned text; the 'clean_text' column is overwritten with the result.
dfs['clean_text'] = dfs['clean_text'].apply(normalize)

In [None]:
#############################################
###Part IV: Character Masking with textacy###
############################################# 

from textacy.preprocessing import replace
# Apply textacy's replace.urls to every cleaned text
# (presumably substitutes URLs with a placeholder -- confirm against textacy docs).
dfs['clean_text'] = dfs['clean_text'].apply(replace.urls)

## Final tidy-up: keep the original posts as 'raw_text', expose the cleaned
## version as 'text', and drop the helper 'impurity' column.
# NOTE(review): both calls mutate dfs in place, so this cell cannot be re-run
# without first re-running the cells that create 'clean_text' and 'impurity'.
dfs.rename(columns={'text': 'raw_text', 'clean_text': 'text'}, inplace=True)
dfs.drop(columns=['impurity'], inplace=True)

In [None]:
# Display the frame after cleaning and renaming.
dfs

In [None]:
###########################
###Linguistic Processing###
###########################

#All steps in one by using spacy

import spacy
# Load spaCy's 'en_core_web_sm' pipeline (the model package must be installed).
nlp = spacy.load('en_core_web_sm')

# Run the whole pipeline over every text. nlp.pipe processes the texts in
# batches and yields one Doc per input in order, which is faster than calling
# nlp() once per row through Series.apply.
dfs['doc'] = list(nlp.pipe(dfs['text']))

In [None]:
import textacy

def extract_lemmas(doc, **kwargs):
    """Collect the lemma of every token that textacy's word extractor yields.

    `doc` and any keyword arguments are passed unchanged to
    textacy.extract.basics.words; the `lemma_` attribute of each yielded
    token is returned in a list, in the order the tokens are yielded.
    """
    lemmas = []
    for token in textacy.extract.basics.words(doc, **kwargs):
        lemmas.append(token.lemma_)
    return lemmas


# Extract lemmas from every parsed document; include_pos is forwarded to
# textacy's word extractor and restricts it to adjectives and nouns.
dfs['lemmas'] = dfs['doc'].apply(extract_lemmas, include_pos=['ADJ', 'NOUN'])
# Display the resulting column of lemma lists.
dfs['lemmas']

In [None]:
#############
#Freq Charts#
#############

from collections import Counter
# Start from an empty Counter and add the lemmas of each document to it.
counter = Counter()
for doc_lemmas in dfs['lemmas']:
    counter.update(doc_lemmas)

print(counter.most_common(5))

# Tokens seen fewer than min_freq times are left out of the table.
min_freq = 2

# Counter -> one-column DataFrame indexed by token, filtered by min_freq and
# ordered by descending frequency.
freq_df = (
    pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
    .query('freq >= @min_freq')
    .rename_axis('token')
    .sort_values('freq', ascending=False)
)
freq_df.head(15)

In [None]:
# Horizontal bar chart of the 15 most frequent tokens.
ax = freq_df.head(15).plot(kind='barh', width=0.95, figsize=(8,3))
# barh draws the first row at the bottom; invert so the top token is on top.
ax.invert_yaxis()
ax.set(xlabel='Frequency', ylabel='Token', title='Top Words')

In [None]:
###Creating Word Clouds
from matplotlib import pyplot as plt
from wordcloud import WordCloud ###
from collections import Counter ###

# NOTE(review): font_path is hard-coded to "SimHei.ttf"; the font file must be
# resolvable from the working directory or this call is expected to fail -- confirm.
wordcloud = WordCloud(font_path="SimHei.ttf", background_color="white")
# Build the cloud from the token frequencies in freq_df.
wordcloud.generate_from_frequencies(freq_df['freq'])
# Optional: enlarge the figure before drawing.
#plt.figure(figsize=(20,10)) 
plt.imshow(wordcloud)

In [None]:
# Show the lemma list stored under key 0 of the 'lemmas' column.
dfs['lemmas'][0]

In [None]:
# Work on a copy from here on so the columns added below do not end up in dfs.
dfs1=dfs.copy()

In [None]:
def list_to_string(org_list, seperator=' '):
    """Concatenate the strings in org_list, placing seperator between neighbours.

    Parameters
    ----------
    org_list : iterable of str
        Strings to concatenate.
    seperator : str, default ' '
        Text inserted between consecutive items.

    Returns
    -------
    str
        The joined string; an empty input yields ''.
    """
    joined = seperator.join(org_list)
    return joined

In [None]:
# Join each lemma list into one space-separated string, the input form the
# CountVectorizer below is fitted on.
dfs1['C_lemmas']=dfs1['lemmas'].apply(list_to_string)
# Show the joined string stored under key 0.
dfs1['C_lemmas'][0]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer; min_df=2 drops terms that appear in fewer than two documents.
cv = CountVectorizer(decode_error='ignore', min_df=2)

In [None]:
# Fit the vectorizer on the joined lemma strings and build the document-term matrix.
dt01 = cv.fit_transform(dfs1['C_lemmas'])

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method on older releases. Either way `fn` is a plain list of terms.
if hasattr(cv, 'get_feature_names_out'):
    fn = cv.get_feature_names_out().tolist()
else:
    fn = cv.get_feature_names()
print(fn)

In [None]:
import pandas as pd
# Dense document-term matrix with one column per vocabulary term.
# get_feature_names() no longer exists in scikit-learn >= 1.2, so use
# get_feature_names_out() when available and fall back on older releases.
dtmatrix = pd.DataFrame(
    dt01.toarray(),
    columns=(
        cv.get_feature_names_out()
        if hasattr(cv, 'get_feature_names_out')
        else cv.get_feature_names()
    ),
)

In [None]:
# Display the document-term count matrix (rows: documents, columns: terms).
dtmatrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between rows 30 and 63 of the document-term matrix.
cosine_similarity(dt01[30], dt01[63])

In [None]:
# Pairwise cosine similarity between all documents, computed on the raw counts.
sm = pd.DataFrame(cosine_similarity(dt01, dt01))

In [None]:
# Display the count-based document similarity matrix.
sm

In [None]:
# Text of the document at row position 98.
dfs1['text'].iloc[98]

In [None]:
# Text of the document at row position 12.
dfs1['text'].iloc[12]

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF re-weighting for the count matrix, using scikit-learn's default settings.
tfidf = TfidfTransformer()

In [None]:
# Fit the transformer on the document-term counts and convert them to TF-IDF weights.
tfidf_dt = tfidf.fit_transform(dt01)

In [None]:
# Dense TF-IDF matrix with one column per vocabulary term.
# get_feature_names() no longer exists in scikit-learn >= 1.2, so use
# get_feature_names_out() when available and fall back on older releases.
tfidfmatrix = pd.DataFrame(
    tfidf_dt.toarray(),
    columns=(
        cv.get_feature_names_out()
        if hasattr(cv, 'get_feature_names_out')
        else cv.get_feature_names()
    ),
)

In [None]:
# Pairwise cosine similarity between documents, computed on the TF-IDF weights.
sm1 =pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))
# Alternative kept for reference: transposing the matrix compares terms instead of documents.
#sm1 =pd.DataFrame(cosine_similarity(tfidf_dt.T, tfidf_dt.T))

In [None]:
# Display the TF-IDF-based document similarity matrix.
sm1

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud ###
from collections import Counter ###

In [None]:
# Total TF-IDF weight per term: after the transpose each row is a term,
# so summing along axis=1 adds that term's weight over all documents.
tfidfsum=tfidfmatrix.T.sum(axis=1)

In [None]:
# NOTE(review): font_path is hard-coded to "SimHei.ttf"; the font file must be
# resolvable from the working directory or this call is expected to fail -- confirm.
wordcloud = WordCloud(font_path="SimHei.ttf", background_color="white")
# Build the cloud from the summed TF-IDF weight of each term.
wordcloud.generate_from_frequencies(tfidfsum)
# Optional: enlarge the figure before drawing.
#plt.figure(figsize=(20,10)) 
plt.imshow(wordcloud)

In [None]:
from sklearn.cluster import KMeans
from sklearn import preprocessing 

In [None]:
# Normalise the TF-IDF matrix once (normalize's default settings). The result
# does not depend on the number of clusters, so it is computed outside the loop.
tfidf_normalized = preprocessing.normalize(tfidf_dt)

# Elbow method: fit k-means for k = 1..4 and record the inertia of each fit.
distortions = []
for i in range(1, 5):
    km = KMeans(
        n_clusters=i, init='random',
        n_init=10, max_iter=300,
        tol=1e-04, random_state=0  # fixed seed keeps the runs reproducible
    )
    km.fit(tfidf_normalized)
    distortions.append(km.inertia_)

In [None]:
# plot
from matplotlib import pyplot as plt
# Elbow plot: distortion (k-means inertia) against the number of clusters.
# The x-axis is derived from len(distortions) so it stays in sync with the
# scan that filled the list; this assumes the scan started at k = 1.
cluster_counts = range(1, len(distortions) + 1)
plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [None]:
# Final clustering with five clusters.
# NOTE(review): the elbow scan only evaluated k = 1..4, so k = 5 is not backed
# by the plotted distortions -- confirm the intended number of clusters.
km = KMeans(
    n_clusters=5, init='random',
    n_init=10, max_iter=300,
    tol=1e-04, random_state=0
)
y_km = km.fit_predict(preprocessing.normalize(tfidf_dt))

# One Series of texts per cluster label (0..4).
g0, g1, g2, g3, g4 = (dfs1['text'][y_km == label] for label in range(5))

# A notebook cell only renders its last expression, so bare `g0.head()` ...
# `g3.head()` calls would be discarded. Print every preview explicitly.
for label, group in enumerate((g0, g1, g2, g3, g4)):
    print('Cluster {} ({} texts)'.format(label, len(group)))
    print(group.head())
    print()