# Calculate Texts Similarities
We follow the method proposed in the paper [Copycats versus Original Mobile Apps: A Machine Learning Copycat Detection Method and Empirical Analysis](https://pubsonline.informs.org/doi/abs/10.1287/isre.2017.0735) and apply it to our data.

The main steps include: 
* Data Cleaning 
* Feature Extraction by TF-IDF
* Dimension Reduction by PCA
* Markov Clustering & K-Means 


In [None]:
# Mount Google Drive so the project data directory is reachable from Colab.
from google.colab import drive
drive.mount('/content/drive/')
import os
# Root directory of the project data; later cells prefix file names with it.
path="/content/drive/My Drive/Research/Data/"

# Make the data directory the working directory and list its contents.
os.chdir(path)
os.listdir(path)

Mounted at /content/drive/


['english.txt',
 '2019_2020_all_data_final.csv',
 '2014_abs',
 '2015_abs',
 '2016_abs',
 '2017_abs',
 '2018_abs',
 'My_Process',
 'all_data_statistic.csv',
 'top_category_details.csv',
 'top5_statistic_months.csv',
 'top5_statistics_col_1.csv',
 'cs_summary',
 'My_result']

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import RandomizedSearchCV

import gensim
from gensim.utils import simple_preprocess
#import markov_clustering as mc
import nltk
import spacy

nlp = spacy.load('en_core_web_sm')

In [None]:
# Download the NLTK "stopwords" corpus.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the English stop-word list.
# NOTE: the assignment rebinds the name `stopwords` from the imported NLTK
# corpus object to the list returned by .words('english').
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [None]:
# Load the data: one summary table per year (2014-2020), then stack them
# into a single frame in chronological file order.
summary_2014 = pd.read_csv(path+'My_Process/Summary_2014.csv')
summary_2015 = pd.read_csv(path+'My_Process/Summary_2015.csv')
summary_2016 = pd.read_csv(path+'My_Process/Summary_2016.csv')
summary_2017 = pd.read_csv(path+'My_Process/Summary_2017.csv')
summary_2018 = pd.read_csv(path+'My_Process/Summary_2018.csv')
summary_2019 = pd.read_csv(path+'My_Process/Summary_2019.csv')
summary_2020 = pd.read_csv(path+'My_Process/Summary_2020.csv')
# NOTE(review): concat is called without ignore_index, so row labels
# presumably repeat across years; use positional access on `source`.
source = pd.concat([summary_2014,summary_2015,summary_2016,summary_2017,
                  summary_2018,summary_2019,summary_2020])

In [None]:
# Preview the first rows of the 2014 table.
summary_2014.head()

Unnamed: 0,origin,link,authors,summary,category,Date
0,['Radial basis function process neural network...,https://arxiv.org/pdf/1405.7349,"['Bing Wang', 'Yao-hua Meng', 'Xiao-hong Yu']",For learning problem of Radial Basis Function ...,Neural and Evolutionary Computing (cs.NE),\n Submission history From: Bing Wang [vi...
1,['Efficient and Reliable Hybrid Cloud Architec...,https://arxiv.org/pdf/1405.5200,"['Narzu Tarannum', 'Nova Ahmed']",The objective of our paper is to propose a Clo...,"Distributed, Parallel, and Cluster Computing (...",\n Submission history From: Narzu Tarannu...
2,"['Internet of Things: Concept, Building blocks...",https://arxiv.org/pdf/1401.6877,"['Riad Abdmeziem', 'Djamel Tandjaoui']",Internet of things (IoT) constitutes one of th...,Computers and Society (cs.CY),\n Submission history From: Riad Abdmezie...
3,"[""Analysis and Diversion of Duqu's Driver""]",https://arxiv.org/pdf/1401.6120,"['Guillaume Bonfante', 'Jean-Yves Marion', 'Fa...",The propagation techniques and the payload of ...,Cryptography and Security (cs.CR),\n Submission history From: Aurelien Thie...
4,['e-commerce business models in the context of...,https://arxiv.org/pdf/1401.6102,"['Fernando Almeida', 'JosÃ© D. Santos', 'JosÃ©...",Web 3.0 promises to have a significant effect ...,Computers and Society (cs.CY),\n Submission history From: Fernando Alme...


In [None]:
# Disabled alternative: build the stop-word list from english.txt.
# The code below sits inside a string literal, so none of it is executed.
'''
#导入停用词
stopwords_source=pd.read_table(path+'english.txt',header=None)
stopwords=[]
for word in stopwords_source[0]:
  word=str(word)
  if len(word)>2:
      element=re.split('\W+',word)
      for ele in element:
        ele=str(ele)
        if ele not in stopwords:
          stopwords.append(ele)
'''

In [None]:
# Data cleaning
# Split each sentence into a list of word tokens.
def sent_to_words(sentences):
    """Lazily tokenize every item of *sentences*.

    Each item is converted with ``str`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``. One token list is yielded per
    input item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw in sentences:
        # NOTE(review): per the gensim docs, deacc=True strips accent marks;
        # punctuation is dropped by the tokenizer itself - confirm.
        tokens = tokenize(str(raw), deacc=True)
        yield tokens


def cleandata(text):
  """Normalize whitespace, drop single quotes, and tokenize each document.

  Args:
    text: iterable of strings, one document per item.

  Returns:
    A list holding one list of tokens per document, as produced by
    ``sent_to_words``.
  """
  # Raw-string pattern: the previous non-raw '\s+' literal relied on an
  # invalid escape sequence, which newer Python versions warn about.
  whitespace = re.compile(r'\s+')

  # Collapse newlines and other whitespace runs to one space, then remove
  # distracting single quotes.
  text_result = [whitespace.sub(' ', sent).replace("'", "") for sent in text]

  return list(sent_to_words(text_result))

def remove_stopwords(texts):
  """Re-tokenize each document and drop stop words.

  Args:
    texts: iterable of documents; each one is passed through ``str`` and
      gensim's ``simple_preprocess`` before filtering.

  Returns:
    A list with one list of kept tokens per document. A token is kept when it
    is not in the module-level ``stopwords`` and has at least two characters.
  """
  # Build the lookup set once: testing membership against the stop-word list
  # directly is a linear scan for every single token.
  stop_set = set(stopwords)
  return [
      [word for word in simple_preprocess(str(doc))
       if word not in stop_set and len(word) >= 2]
      for doc in texts
  ]


def lemmatization(texts, allowed_postags):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of token lists, one per document.
        allowed_postags: container of ``token.pos_`` values to keep.

    Returns:
        A list with one list of lemmas per document, in input order.
    """
    texts_out = []
    for sent in texts:
        # The module-level spaCy pipeline works on text, so re-join the tokens.
        doc = nlp(" ".join(sent))
        # Keep only words whose part of speech is in the allowed set.
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


# Clean the raw documents and return them as a list of strings.
def cleanAtransform(list_data):
    """Run the full cleaning pipeline and re-join each document's lemmas.

    The input goes through ``cleandata``, ``remove_stopwords`` and
    ``lemmatization`` (nouns, verbs, adjectives and adverbs only).

    Returns:
        A list with one string per document. Every lemma is preceded by a
        single space, so a non-empty string starts with a space.
    """
    tokens = cleandata(list_data)
    tokens = remove_stopwords(tokens)
    lemmas = lemmatization(tokens, allowed_postags=['NOUN', 'VERB','ADJ','ADV'])
    return [''.join(' ' + word for word in doc) for doc in lemmas]


# Extract the date information.
# input: text   output: (year, month); 'none' placeholders for dirty data
def Date_processor(text):
  """Pull the year and month tokens out of a submission-history string.

  The text is split on whitespace; the year is the 5th token from the end and
  the month the 6th token from the end.

  Args:
    text: the raw date string. Any non-string value (NaN, None, ...) is
      treated as dirty data.

  Returns:
    A ``(year, month)`` tuple of strings. Both are ``'none'`` when the input
    is not a string or has fewer than six whitespace-separated tokens.
  """
  year = 'none'
  month = 'none'
  # Only real strings can be split. The previous `type(text) != float` check
  # filtered NaN but crashed on every other non-string value such as None.
  if isinstance(text, str):
    list_text = text.split()
    if len(list_text) > 5:
      year = list_text[-5]
      month = list_text[-6]
  return year, month

In [None]:
# Intermediate lookup: English month abbreviation -> month number.
month_dict = {
    abbr: number
    for number, abbr in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

# Data stored as a dictionary keyed by "<year>_<month>", in chronological
# order (2014_1 ... 2020_12); every key starts with its own empty list.
sum_dict = {
    str(year) + '_' + str(month): []
    for year in range(2014, 2021)
    for month in range(1, 13)
}


In [None]:
# Sort every document into its "<year>_<month>" bucket of sum_dict.
# Columns are read by position, exactly as before: column 0 supplies the text
# and column 5 the date string handed to Date_processor. Reading whole columns
# avoids a per-row iloc lookup and the deprecated positional Series[int] access.
skipped_rows = 0
for summary, Date in zip(source.iloc[:, 0], source.iloc[:, 5]):
    year, month = Date_processor(Date)

    if year == '2021':
        continue

    # Date_processor returns 'none' placeholders for dirty rows, and a
    # malformed date can yield a token that is not a month abbreviation or a
    # year without a prepared bucket. Skip such rows instead of raising
    # KeyError part-way through the load.
    month_number = month_dict.get(month)
    data_label = year + '_' + str(month_number)
    if month_number is None or data_label not in sum_dict:
        skipped_rows += 1
        continue

    sum_dict[data_label].append(summary)

# Make skipped rows visible rather than dropping them silently.
if skipped_rows:
    print('skipped rows with unparseable dates:', skipped_rows)
      

In [None]:
# Cumulative document counts: index_year[k] is the number of documents in the
# first k buckets of sum_dict, so consecutive entries delimit one bucket.
index_year = [0]
total_num = 0

for label, docs in sum_dict.items():
    count = len(docs)
    print(label + ': ', count)
    total_num += count
    index_year.append(total_num)

2014_1:  1049
2014_2:  806
2014_3:  793
2014_4:  995
2014_5:  998
2014_6:  849
2014_7:  887
2014_8:  838
2014_9:  923
2014_10:  877
2014_11:  881
2014_12:  906
2015_1:  886
2015_2:  912
2015_3:  1023
2015_4:  1084
2015_5:  1094
2015_6:  1038
2015_7:  1086
2015_8:  898
2015_9:  1096
2015_10:  1020
2015_11:  1067
2015_12:  1042
2016_1:  1099
2016_2:  1117
2016_3:  1249
2016_4:  1427
2016_5:  1415
2016_6:  1435
2016_7:  1316
2016_8:  1302
2016_9:  1452
2016_10:  1372
2016_11:  1511
2016_12:  1294
2017_1:  1318
2017_2:  1448
2017_3:  1802
2017_4:  1691
2017_5:  1919
2017_6:  1677
2017_7:  1937
2017_8:  1903
2017_9:  1921
2017_10:  1573
2017_11:  1936
2017_12:  1660
2018_1:  1683
2018_2:  1869
2018_3:  2179
2018_4:  2142
2018_5:  2583
2018_6:  2284
2018_7:  2353
2018_8:  2140
2018_9:  2208
2018_10:  2327
2018_11:  2403
2018_12:  1964
2019_1:  2248
2019_2:  2353
2019_3:  2736
2019_4:  3269
2019_5:  2984
2019_6:  2834
2019_7:  2900
2019_8:  2960
2019_9:  3132
2019_10:  2964
2019_11:  2853
201

In [None]:
# Build one flat list of cleaned documents, following the bucket order of
# sum_dict (year, then month).
data = []

for label, raw_docs in sum_dict.items():
    data += cleanAtransform(raw_docs)
    print(label + ' complete')


2014_1 complete
2014_2 complete
2014_3 complete
2014_4 complete
2014_5 complete
2014_6 complete
2014_7 complete
2014_8 complete
2014_9 complete
2014_10 complete
2014_11 complete
2014_12 complete
2015_1 complete
2015_2 complete
2015_3 complete
2015_4 complete
2015_5 complete
2015_6 complete
2015_7 complete
2015_8 complete
2015_9 complete
2015_10 complete
2015_11 complete
2015_12 complete
2016_1 complete
2016_2 complete
2016_3 complete
2016_4 complete
2016_5 complete
2016_6 complete
2016_7 complete
2016_8 complete
2016_9 complete
2016_10 complete
2016_11 complete
2016_12 complete
2017_1 complete
2017_2 complete
2017_3 complete
2017_4 complete
2017_5 complete
2017_6 complete
2017_7 complete
2017_8 complete
2017_9 complete
2017_10 complete
2017_11 complete
2017_12 complete
2018_1 complete
2018_2 complete
2018_3 complete
2018_4 complete
2018_5 complete
2018_6 complete
2018_7 complete
2018_8 complete
2018_9 complete
2018_10 complete
2018_11 complete
2018_12 complete
2019_1 complete
2019_2 co

In [None]:
# Save the processed data, one document per line.
# The context manager closes the file even if a write fails.
with open(path+'My_Process/clean_cs_title.txt','w') as fileobject:
    fileobject.writelines(value + '\n' for value in data)

In [None]:
# Load index_year from disk and convert the array to a plain Python list.
# NOTE(review): presumably these are the cumulative per-bucket document
# counts computed earlier in the notebook - confirm how the file was written.
index_year_ = np.load(path+'My_Process/index_year.npy')
index_year_=index_year_.tolist()
index_year_

[0,
 1049,
 1855,
 2648,
 3643,
 4641,
 5490,
 6377,
 7215,
 8138,
 9015,
 9896,
 10802,
 11688,
 12600,
 13623,
 14707,
 15801,
 16839,
 17925,
 18823,
 19919,
 20939,
 22006,
 23048,
 24147,
 25264,
 26513,
 27940,
 29355,
 30790,
 32106,
 33408,
 34860,
 36232,
 37743,
 39037,
 40355,
 41803,
 43605,
 45296,
 47215,
 48892,
 50829,
 52732,
 54653,
 56226,
 58162,
 59822,
 61505,
 63374,
 65553,
 67695,
 70278,
 72562,
 74915,
 77055,
 79263,
 81590,
 83993,
 85957,
 88205,
 90558,
 93294,
 96563,
 99547,
 102381,
 105281,
 108241,
 111373,
 114337,
 117190,
 119700,
 122295,
 125227,
 129153,
 134877,
 139188,
 142121,
 146427,
 150244,
 154325,
 160190,
 165023,
 165165]

In [None]:
# Load the processed data: one document per line, trailing newlines kept.
# The context manager closes the file even if reading fails.
with open(path+'My_Process/clean_cs_summary.txt','r') as fileopen:
    data = fileopen.readlines()

#data

In [None]:
# Select the tail of `data`, starting at the boundary stored 12 entries before
# the last element of index_year_.
# NOTE(review): assuming one boundary per month, this is the latest 12 months.
months_back = 12
start_index = index_year_[len(index_year_) - 1 - months_back]
end_index = len(data)
sub_data = data[start_index:]

In [None]:
# Number of documents in the selected slice.
len(sub_data)

45465

In [None]:
# TF-IDF feature extraction.
transfer = TfidfVectorizer(stop_words=stopwords)

result = transfer.fit_transform(sub_data)
# NOTE(review): toarray() densifies the whole document-term matrix just for
# this preview, which is memory-hungry on a large corpus.
print('data_new:\n',result.toarray())

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(transfer, 'get_feature_names_out'):
    feature_names = list(transfer.get_feature_names_out())
else:
    feature_names = transfer.get_feature_names()
print('特征名字：\n',feature_names)

data_new:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
特征名字：




In [None]:
# Vocabulary size. vocabulary_ maps every term to its column index, so its
# length is the feature count. It is used here because get_feature_names()
# was removed in scikit-learn 1.2.
print(len(transfer.vocabulary_))

32627




In [None]:
# Convert the TF-IDF matrix to a dense array; this rebinds `result`.
result=result.toarray()

In [None]:
# Show the non-zero weights of one sample document (row 98) together with
# their column positions.
result_2 = result[98,:]

for index, value in enumerate(result_2):
    if value != 0:
        print('index:', index, '  ', value)

In [None]:
# Peek at the first 100 entries of result_2.
result_2[:100]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
# Similarity-matrix prototype: report the shape of `result`, then print the
# cosine similarity for each pair among its first three rows.
num_lines, num_cols = result.shape[0], result.shape[1]

print('lines: ',num_lines)
print('columns: ',num_cols)

for i in range(3):
    for j in range(i):  # visits exactly the pairs with i > j
        row_i = result[i,:].reshape(1,-1)
        row_j = result[j,:].reshape(1,-1)
        print(row_i)
        print(row_j)
        simm = cosine_similarity(row_i, row_j)
        print(simm)

lines:  45465
columns:  32627
  (0, 5994)	0.10437464631808871
  (0, 23768)	0.09840580792279857
  (0, 19612)	0.12958300146879798
  (0, 7506)	0.12648843537583007
  (0, 7507)	0.10222616217884449
  (0, 22558)	0.15326368447091174
  (0, 17215)	0.22741906730865177
  (0, 792)	0.10665112675152708
  (0, 16691)	0.09809173444269001
  (0, 8033)	0.2049086459992817
  (0, 15786)	0.09794681677225874
  (0, 24637)	0.08815938354177134
  (0, 2173)	0.3458919783297846
  (0, 16138)	0.10448829224690398
  (0, 788)	0.14247736736692726
  (0, 24481)	0.09201626348647322
  (0, 29750)	0.13254244839000506
  (0, 5558)	0.10343356269471007
  (0, 5131)	0.08915028722872737
  (0, 28304)	0.43051322222523497
  (0, 17998)	0.1053318637738871
  (0, 8486)	0.25398626676105934
  (0, 12099)	0.10014541979261306
  (0, 7344)	0.10256015241769127
  (0, 28303)	0.22347025310349042
  (0, 668)	0.1712089207998113
  (0, 15208)	0.08659281903582734
  (0, 17482)	0.14775503245729704
  (0, 14818)	0.1452621630955224
  (0, 4974)	0.32493108401355775
 

In [None]:
# Dimensionality reduction with PCA; n_components=0.8 asks for enough
# components to explain 80% of the variance.
de_transfer = PCA(n_components=0.8,svd_solver='full')

# `result` is sparse straight after TF-IDF but already an ndarray once the
# earlier toarray() cell has run; calling .toarray() unconditionally raises
# AttributeError in that case.
dense_result = result.toarray() if hasattr(result, 'toarray') else np.asarray(result)

# fit_transform() already fits the model, so the former second fit() on the
# same data only repeated the decomposition. fit() returns the estimator
# itself, hence de_model is the same object as de_transfer.
de_data = de_transfer.fit_transform(dense_result)
de_model = de_transfer

print('decomposed data: \n',de_data)
print(de_model.explained_variance_ratio_)

In [None]:
# Number of rows in the reduced data.
len(de_data)

In [None]:
# Clustering: hyper-parameter search for K-Means.
# NOTE(review): the original label said "grid search", but this is a
# randomized search, so presumably only a sample of the candidate
# n_clusters values is tried - confirm against the scikit-learn docs.
cluster_1 = KMeans()
# Candidate cluster counts: 1 through 19.
param_list = {'n_clusters': range(1,20,1)}  

# 4-fold cross-validated search over the candidates.
grid = RandomizedSearchCV(cluster_1,param_distributions=param_list,cv=4)

grid.fit(de_data)

# Report the best parameters and the best fitted model.
print('最佳参数：', grid.best_params_)
print('最佳模型：',grid.best_estimator_)

In [None]:
# Clustering: fit K-Means with 19 clusters on the reduced data.
cluster = KMeans(n_clusters=19)
cluster.fit(de_data)

# Assign every row of the reduced data to a cluster.
cluster_result = cluster.predict(de_data)
# Preview the first 300 cluster labels.
cluster_result[:300]

In [None]:
# K-Means: evaluate the clustering with the silhouette score.
silhouette_score(de_data,cluster_result)

In [None]:
## Markov Clustering
import markov_clustering as mc
import networkx as nx
import random

# one graph node per row of the PCA-reduced data
numnodes = len(de_data)

# node id -> that row's coordinates as a tuple. These are the actual reduced
# data points (not random positions) and have as many dimensions as de_data
# has columns.
positions = {i: tuple(row) for i, row in enumerate(de_data)}

# use networkx to build the graph from the given positions with radius 0.3
network = nx.random_geometric_graph(numnodes, 0.3, pos=positions)

# then get the adjacency matrix (in sparse form).
# to_scipy_sparse_matrix() was removed in networkx 3.0. On newer releases use
# to_scipy_sparse_array() and wrap the result so that downstream code still
# receives a scipy sparse *matrix*.
if hasattr(nx, 'to_scipy_sparse_matrix'):
    matrix = nx.to_scipy_sparse_matrix(network)
else:
    from scipy.sparse import csr_matrix
    matrix = csr_matrix(nx.to_scipy_sparse_array(network))

In [None]:
# Training: run Markov clustering on the adjacency matrix and extract the
# clusters. NOTE: this rebinds `result`, which earlier cells used for the
# TF-IDF matrix.
result = mc.run_mcl(matrix)           
clusters = mc.get_clusters(result)    

In [None]:
# Modularity of the clustering found above.
mc.modularity(matrix=result, clusters=clusters)

In [None]:
#mc.draw_graph(matrix, clusters, pos=positions, node_size=50, with_labels=False, edge_color="silver")

In [None]:
# Hyper-parameter sweep with a for loop: try inflation values 1.5, 1.6, ...,
# 2.5 and print the modularity obtained with each one.
for tenths in range(15, 26):
    inflation = tenths / 10
    result = mc.run_mcl(matrix, inflation=inflation)
    clusters = mc.get_clusters(result)
    Q = mc.modularity(matrix=result, clusters=clusters)
    print("inflation:", inflation, "modularity:", Q)