# 自動篩選惡意言論，找出省錢關鍵！
## 作者：葉庭妤、趙熙寧(臺灣行銷研究特邀作者)、鍾皓軒(臺灣行銷研究有限公司創辦人）

## 任務：
原競賽要求根據問題敘述預測是否為惡意言論。 但我們再進一步討論若以機器學習模型篩選惡意言論，將會對Quora幫助省下多少錢。

# 一、敘述性統計分析

資料可以在此[取得](https://drive.google.com/drive/folders/1Aeuz138JFNqGzA_hQWe10KixzUhLTZML?usp=sharing)，並下載下來後，將資料與本ipynb檔案放於同一個工作目錄中，再執行下方程式即可。

In [1]:
# 引入套件
import numpy as np 
import pandas as pd
import tensorflow as tf
import jieba
from collections import Counter
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D, RNN
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import concatenate
from keras.callbacks import *
from plotly import tools
from plotly.offline import plot, iplot
import plotly.graph_objs as go

In [2]:
# 讀取資料
train= pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 資料簡介

## 資料數量
若需要完整資料集，可到[Kaggle網站](https://www.kaggle.com/c/quora-insincere-questions-classification/data)下載！

In [3]:
print("訓練集資料型態 : ",train.shape)
print("測試集資料型態 : ",test.shape)

訓練集資料型態 :  (1044897, 4)
測試集資料型態 :  (261225, 4)


In [4]:
# 在訓練集中有幾則正常言論與惡意言論
print('訓練集中惡意言論有',len(train[train['target'] ==1]),'則')
print('訓練集中正常言論有',len(train[train['target'] ==0]),'則')


訓練集中惡意言論有 64954 則
訓練集中正常言論有 979943 則


In [5]:
# 查看訓練集前5筆資料型態與變數
train.head(5)

Unnamed: 0.1,Unnamed: 0,qid,question_text,target
0,298773,3a820a95342d28ad402f,How is strategic positioning is different from...,0
1,815475,9fca299caa0cf8f12eac,What is the best way for promote Facebook mark...,0
2,1133453,de23f10ad011a6fb13c7,How much energized proton radiation does the I...,0
3,1076426,d2eef16340896e963a63,Would any Indian men want to marry a women tha...,0
4,203792,27d584db9bd46b6ab44e,Which is the best business for startups in Ind...,0


## 變數介紹
1. qid：問題的辨識碼，每個問題不重複。
2. question_text：問題敘述。
3. target：是否為惡意言論。1為惡意言論(insincere)，0為正常言論(sincere)。以下舉訓練集中惡意言論與正常言論各一則。

- 惡意言論：Why do most Palestinians support terrorism?
- 正常言論：How is strategic positioning is different from marketing positioning?

# 資料基本訊息

In [6]:
# 製作圓餅圖
cnt_srs = train['target'].value_counts()
labels = (np.array(cnt_srs.index))
sizes = (np.array((cnt_srs / cnt_srs.sum())*100))

trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(
    title='Target distribution',
    font=dict(size=18),
    width=600,
    height=600,
)
data = [trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename="usertype")


- 93.8%的言論被分類為正常言論(標記為0)，只有少許的6.22%是惡意言論(標記為1)。<br>
- 大部分Quora平台的文章都是正常言論，只有極少部分是惡意言論，資料的分佈極不平均，相差15倍之多！這會造成後續機器模型辨識惡意言論時會造成許多挑戰！

## 惡意言論包含哪些字詞？
我們已經了解資料基本內容與言論分佈狀況，接著，我們可以進一步了解被歸類為惡意言論與正常言論可能分別包含哪些字詞！

In [7]:
# ngram前置作業
from plotly import tools
from wordcloud import WordCloud, STOPWORDS
import plotly.graph_objects as go
import plotly as py
stopwords = set(STOPWORDS)
more_stopwords = {'one', 'br', 'Po', 'th', 'sayi', 'fo', 'Unknown'}
stopwords = stopwords.union(more_stopwords)

from collections import defaultdict
train1_df = train[train["target"]==1]
train0_df = train[train["target"]==0]

## custom function for ngram generation ##
def generate_ngrams(text, n_gram=1):
    token = [token for token in text.lower().split(" ") if token != "" if token not in STOPWORDS]
    ngrams = zip(*[token[i:] for i in range(n_gram)])
    return [" ".join(ngram) for ngram in ngrams]

## custom function for horizontal bar chart ##
def horizontal_bar_chart(df, color):
    trace = go.Bar(
        y=df["word"].values[::-1],
        x=df["wordcount"].values[::-1],
        showlegend=False,
        orientation = 'h',
        marker=dict(
            color=color,
        ),
    )
    return trace

In [8]:
# 1-gram
## Get the bar chart from sincere questions ##
freq_dict = defaultdict(int)
for sent in train0_df["question_text"]:
    for word in generate_ngrams(sent):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace0 = horizontal_bar_chart(fd_sorted.head(5), 'blue')

## Get the bar chart from insincere questions ##
freq_dict = defaultdict(int)
for sent in train1_df["question_text"]:
    for word in generate_ngrams(sent):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(5), 'blue')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,
                          subplot_titles=["Frequent words of sincere questions", 
                                          "Frequent words of insincere questions"])
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig['layout'].update(height=400, width=1000, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
# 匯出html檔
plot(fig, filename='Word_Count_Plots.html')
# 顯示長條圖
fig.show()



plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [9]:
# 2-gram
## Get the bar chart from sincere questions ##
freq_dict = defaultdict(int)
for sent in train0_df["question_text"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace0 = horizontal_bar_chart(fd_sorted.head(5), 'orange')


freq_dict = defaultdict(int)
for sent in train1_df["question_text"]:
    for word in generate_ngrams(sent,2):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(5), 'orange')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04,horizontal_spacing=0.15,
                          subplot_titles=["Frequent bigrams of sincere questions", 
                                          "Frequent bigrams of insincere questions"])
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig['layout'].update(height=400, width=1000, paper_bgcolor='rgb(233,233,233)', title="Bigram Count Plots")
# 匯出html檔
plot(fig, filename='Bigram_Count_Plots') 
# 顯示長條圖
fig.show()


Your filename `Bigram_Count_Plots` didn't end with .html. Adding .html to the end of your file.



In [10]:
# 3-gram
## Get the bar chart from sincere questions ##
freq_dict = defaultdict(int)
for sent in train0_df["question_text"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace0 = horizontal_bar_chart(fd_sorted.head(5), 'green')
freq_dict = defaultdict(int)
for sent in train1_df["question_text"]:
    for word in generate_ngrams(sent,3):
        freq_dict[word] += 1
fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
fd_sorted.columns = ["word", "wordcount"]
trace1 = horizontal_bar_chart(fd_sorted.head(5), 'green')

# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, vertical_spacing=0.04, horizontal_spacing=0.2,
                          subplot_titles=["Frequent trigrams of sincere questions", 
                                          "Frequent trigrams of insincere questions"])
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig['layout'].update(height=400, width=1000, paper_bgcolor='rgb(233,233,233)', title="Trigram Count Plots")
# 匯出html檔
plot(fig, filename='Trigram_Count_Plots')
# 顯示長條圖
fig.show()



Your filename `Trigram_Count_Plots` didn't end with .html. Adding .html to the end of your file.

