The code below creates the datasets for fine-tuning and for model application (searching for clones) for the eva.ru user lizon.

In [1]:
import os
import datetime
import pandas as pd
import numpy as np

In [2]:
# Author_Id of the forum account 'lizon' whose messages and possible clones are studied below.
lizon_id = 69715

In [3]:
# Root folder with the project data. The original location stays the default;
# set the EVA_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
Data=os.environ.get('EVA_DATA_DIR', '/home/kate/Projects/eva/Data')

# --- input files (read only) ---
Messages_filename='Main/Messages.csv'
Messages_full_filename=os.path.join(Data, Messages_filename)

OriginalMessages_filename='Main/OriginalMessages.csv'
OriginalMessages_full_filename=os.path.join(Data, OriginalMessages_filename)

Authors_filename='Main/Authors.csv'
Authors_full_filename=os.path.join(Data, Authors_filename)

ta_extension_filename='Main/TextAttributes_ext.csv'
ta_extension_full_filename=os.path.join(Data, ta_extension_filename)

# --- output files (written by this notebook) ---
# labelled messages (lizon / Other) for fine-tuning
ft_filename='Subprojects/lizon/lizon_data_for_finetuning.csv'
ft_full_filename=os.path.join(Data, ft_filename)

# authors whose aggregated features are close to lizon's
SimilarAuthors_filename='Subprojects/lizon/lizon_similar_authors.csv'
SimilarAuthors_full_filename=os.path.join(Data, SimilarAuthors_filename)

# messages of lizon and of the similar authors
all_lizon_messages_filename='Subprojects/lizon/all_lizon_and_clones_messages.csv'
all_lizon_messages_full_filename=os.path.join(Data, all_lizon_messages_filename)

# Medium-size posts: strictly more than short_message_limit_words words
# and at most medium_message_limit_words words.
short_message_limit_words=5
medium_message_limit_words=100

In [4]:
# Load the full message table.
# `error_bad_lines` was removed in pandas 2.0; on_bad_lines='warn' (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are reported and skipped.
Messages = pd.read_csv(Messages_full_filename, on_bad_lines='warn', index_col=False)

In [5]:
# Keep only rows with a non-zero Author_Id (presumably 0 marks anonymous posts - TODO confirm)
# and work on an independent copy of the selection.
has_author = Messages['Author_Id'].ne(0)
AMessages = Messages.loc[has_author].copy()

In [6]:
# Drop the reference to the full table; only the AMessages copy is used until Messages is reloaded later.
del Messages

## Messages of more than 5 and no more than 100 words, from all topics where lizon is present

In [7]:
# Distinct topics in which lizon posted at least one message.
written_by_lizon = AMessages['Author_Id'].eq(lizon_id)
topic_id_with_lizon = AMessages.loc[written_by_lizon, 'Topic_Id'].unique()

In [8]:
# Number of distinct topics with lizon's messages.
topic_id_with_lizon.shape[0]

288

In [9]:
# Restrict the data to the topics lizon took part in (all authors of those topics are kept).
in_lizon_topic = AMessages['Topic_Id'].isin(topic_id_with_lizon)
AMessages = AMessages.loc[in_lizon_topic].copy()

In [10]:
# Same Author_Id != 0 filter that was applied when AMessages was first created;
# repeated here as a safeguard.
AMessages = AMessages.loc[AMessages['Author_Id'].ne(0)]

In [11]:
# Row count after restricting to lizon's topics.
AMessages.shape[0]

24979

In [12]:
# Inspect the available columns (there is no word-count column yet).
AMessages.columns

Index(['Message_Id', 'Timestamp', 'Topic_1st_Message', 'Parent_Id',
       'Author_Id', 'author', 'Topic_Id', 'Topic', 'Chapter_Id', 'Chapter',
       'message', 'cnt_childs', 'cnt_immediate_childs'],
      dtype='object')

The AMessages dataset has no message length in words, so additional data has to be loaded.

In [13]:
# Load the extended text attributes (contains message_words).
# `error_bad_lines` was removed in pandas 2.0; on_bad_lines='warn' (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are reported and skipped.
ta_ext = pd.read_csv(ta_extension_full_filename, on_bad_lines='warn', index_col=False)

In [14]:
# Inspect the columns of the text-attribute table; message_words is the one needed below.
ta_ext.columns

Index(['Message_Id', 'Timestamp', 'Topic_1st_Message', 'Parent_Id',
       'Author_Id', 'author', 'Topic_Id', 'Topic', 'Chapter_Id', 'Chapter',
       'message_characters', 'message_words', 'emojis', 'images', 'links',
       'original_paragraphs', 'actual_paragraphs', 'avg_paragraph_characters',
       'avg_paragraph_words', 'cnt_childs', 'cnt_immediate_childs',
       'flg_excessive_exclamations', 'flg_excessive_questions',
       'flg_excessive_other', 'num_Adj', 'num_unique_Adj', 'num_Nouns',
       'num_unique_Nouns', 'num_Verb', 'num_unique_Verb', 'num_tokens',
       'num_unique_tokens', 'num_syllables', 'num_long_words',
       'num_unique_long_words', 'num_commas', 'num_exclamations',
       'num_questions', 'num_words', 'ASW', 'PLW', 'TTR', 'TTR_A', 'TTR_N',
       'TTR_V', 'NAV', 'UNAV', 'fraction_of_commas',
       'fraction_of_exclamations', 'fraction_of_questions', 'fraction_of_Adj',
       'fraction_of_Nouns', 'fraction_of_Verbs', 'num_sent', 'avg_sent_len'],
      dtype='object')

In [15]:
# Attach the word count of every message; the inner join keeps only messages
# that are present in both tables.
word_counts = ta_ext[['Message_Id', 'message_words']]
AMessages = AMessages.merge(word_counts, on='Message_Id', how='inner')

In [16]:
# The text-attribute table is not needed once message_words has been merged in.
del ta_ext

In [17]:
# Medium-size messages: more than short_message_limit_words words
# and at most medium_message_limit_words words.
longer_than_short = AMessages['message_words'].gt(short_message_limit_words)
within_medium = AMessages['message_words'].le(medium_message_limit_words)
AMessages = AMessages[longer_than_short & within_medium]

In [18]:
# Row count after keeping medium-size messages only.
AMessages.shape[0]

20437

Set target column

In [19]:
# Label every message: 'lizon' for the target author, 'Other' for everybody else.
# assign() returns a new frame, so nothing is written into the filtered slice
# created by the word-count filter (item assignment on that slice triggers
# pandas' SettingWithCopyWarning); the mapping is vectorised instead of a Python loop.
AMessages = AMessages.assign(
    target=AMessages['Author_Id'].eq(lizon_id).map({True: 'lizon', False: 'Other'})
)

In [20]:
# Class balance: number of messages per target label, largest class first.
target_counts = AMessages.groupby('target').size().rename('counts').reset_index()
target_counts.sort_values(by='counts', ascending=False)

Unnamed: 0,target,counts
0,Other,19371
1,lizon,1066


Joining original forum post text, without cleaning html tags etc

In [21]:
# Load the original (uncleaned) post texts.
# `error_bad_lines` was removed in pandas 2.0; on_bad_lines='warn' (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are reported and skipped.
OriginalMessages = pd.read_csv(OriginalMessages_full_filename, on_bad_lines='warn', index_col=False)

In [22]:
# Add the original post text to every message (inner join on Message_Id).
original_text = OriginalMessages[['Message_Id', 'original_message']]
AMessages = AMessages.merge(original_text, on='Message_Id', how='inner')

In [23]:
# Check that message_words, target and original_message were added.
AMessages.columns

Index(['Message_Id', 'Timestamp', 'Topic_1st_Message', 'Parent_Id',
       'Author_Id', 'author', 'Topic_Id', 'Topic', 'Chapter_Id', 'Chapter',
       'message', 'cnt_childs', 'cnt_immediate_childs', 'message_words',
       'target', 'original_message'],
      dtype='object')

In [24]:
# Row count after joining the original texts.
AMessages.shape[0]

20437

In [25]:
# The original-text table is not needed once original_message has been merged in.
del OriginalMessages

In [26]:
# Columns written to the fine-tuning dataset, in output order.
ft_columns = ['Message_Id', 'Author_Id', 'author', 'message', 'original_message', 'target']
AMessages = AMessages[ft_columns]

In [27]:
# Save the fine-tuning dataset (header row written by default, no index column).
AMessages.to_csv(ft_full_filename, index=False)

## Dataset to find lizon's clones (model application)

### Authors to check

In [28]:
# Load the per-author aggregates.
# `error_bad_lines` was removed in pandas 2.0; on_bad_lines='warn' (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are reported and skipped.
Authors = pd.read_csv(Authors_full_filename, on_bad_lines='warn', index_col=False)

In [29]:
# Preview the first rows of the author table.
Authors.head()

Unnamed: 0,Author_Id,author,cnt_messages,First_Message_Id,Last_Message_Id,first_message_ts,last_message_ts,Binned Number of Messages,cnt_2021_messages,num_of_tneg_messages,...,avg_fraction_of_Verbs,avg_TTR,avg_TTR_A,avg_TTR_N,avg_TTR_V,avg_NAV,avg_UNAV,avg_fraction_of_Nouns,avg_ASW,avg_PLW
0,23,volv ♧ Clair,1535,67913773,101696362,2011-07-06 11:00:00,2020-11-09 23:07:00,"(1000, 5000]",0.0,267.0,...,0.168575,0.866406,0.662028,0.92154,0.872181,1.43301,2.290852,0.272158,2.099146,0.139118
1,27,Oxygen 27 F*,16,97031390,102288727,2018-05-17 14:17:00,2021-02-19 17:01:00,"(0, 100]",0.0,8.0,...,0.135263,0.855567,0.692308,0.972494,0.769231,1.280186,1.888462,0.273964,1.911883,0.088368
2,32,Редакция Евы,784,80470511,96142872,2013-04-29 13:46:00,2017-12-11 20:44:00,"(100, 1000]",0.0,96.0,...,0.142459,0.904268,0.905599,0.88255,0.964668,1.868945,3.498784,0.329188,2.130166,0.160048
3,35,Малефисента ⚜**,3415,51064403,101933631,2009-11-02 14:26:00,2020-12-21 17:49:00,"(1000, 5000]",0.0,1433.0,...,0.167977,0.890777,0.605887,0.952464,0.913617,1.516702,2.006752,0.236029,1.895766,0.094588
4,36,Доктор Филатов SD*,4992,81110399,101760074,2013-06-18 17:11:00,2020-11-20 18:00:00,"(1000, 5000]",0.0,1855.0,...,0.166422,0.896571,0.801671,0.901457,0.923862,1.785766,2.255664,0.247477,2.059361,0.141871


In [30]:
# Parse the first/last message timestamps (stored as 'YYYY-MM-DD HH:MM:SS' text) into datetimes.
for ts_column in ('first_message_ts', 'last_message_ts'):
    Authors[ts_column] = pd.to_datetime(Authors[ts_column], format='%Y-%m-%d %H:%M:%S')

In [31]:
# Timestamp of lizon's last known message.
is_lizon = Authors['Author_Id'].eq(lizon_id)
last_message_ts = Authors.loc[is_lizon, 'last_message_ts'].max()

In [32]:
# Number of authors whose first message is later than lizon's last one.
started_after_lizon = Authors['first_message_ts'] > last_message_ts
int(started_after_lizon.sum())

10735

In [33]:
# Authors who started posting after lizon's last message and have a reasonable
# number of messages:
#   * fewer than 5000 messages - more than that is definitely not a clone
#   * more than 100 messages   - authors with fewer messages write as Anonymous
#   * TotalDays > 1            - authors with 100+ messages in very few days are
#                                most likely spammers, and they get banned
started_after_lizon = Authors['first_message_ts'] > last_message_ts
active_over_a_day = Authors['TotalDays'] > 1
reasonable_volume = (Authors['cnt_messages'] > 100) & (Authors['cnt_messages'] < 5000)
int((started_after_lizon & active_over_a_day & reasonable_volume).sum())

670

In [34]:
# Keep lizon herself plus every candidate: first message after lizon's last one,
# active for more than one day, and between 100 and 5000 messages (both exclusive).
is_lizon = Authors['Author_Id'] == lizon_id
is_candidate = (
    (Authors['first_message_ts'] > last_message_ts)
    & (Authors['TotalDays'] > 1)
    & (Authors['cnt_messages'] > 100)
    & (Authors['cnt_messages'] < 5000)
)
Authors = Authors[is_lizon | is_candidate]

### Can we limit the number of authors to check even more?

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# Per-author aggregate features used to compare authors with lizon.
# The names are used to select columns of Authors, so the spelling
# (including 'rersponses') has to stay exactly as it is.
columns_to_compare=[
'fraction_of_tneg_messages',
'fraction_of_tpos_messages',
'fraction_of_Large_Topics',
'fraction_of_messages_with_immediate_responses',
'avg_immediate_responses',
'fraction_of_messages_with_discussion',
'avg_responses',
'fraction_of_neg_rersponses',
'fraction_of_pos_rersponses',
'fraction_of_messages_with_emojis',
'fraction_of_messages_with_images',
'fraction_of_messages_with_links',
'fraction_of_messages_with_excessive_exclamations',
'fraction_of_messages_with_excessive_questions',
'fraction_of_messages_with_excessive_other',
'fraction_of_medium_with_commas',
'avg_num_commas',
'avg_sent_len',
'avg_num_sent',
'avg_ASW',
'avg_PLW',
'avg_fraction_of_Adj',
'avg_fraction_of_Verbs',
'avg_fraction_of_Nouns',
'avg_TTR',
'avg_TTR_A',
'avg_TTR_N',
'avg_TTR_V',
'avg_NAV',
'avg_UNAV']

In [37]:
# Feature values of lizon's own row(s) in Authors, as a 2-D array.
is_lizon = Authors['Author_Id'].eq(lizon_id)
lizon = Authors.loc[is_lizon, columns_to_compare].to_numpy()
lizon

array([[4.87804878e-01, 1.71669794e-01, 7.89473684e-01, 6.54631083e-01,
        1.06122449e+00, 4.12872841e-01, 1.01596958e+01, 4.11982249e-01,
        2.21153846e-01, 0.00000000e+00, 0.00000000e+00, 1.42072214e-01,
        1.72684458e-02, 3.92464678e-02, 2.35478807e-03, 2.20450281e+00,
        2.46716698e+00, 1.10714444e+01, 2.79268293e+00, 1.88769785e+00,
        1.02372067e-01, 8.24955780e-02, 1.60779108e-01, 2.18236627e-01,
        8.93254711e-01, 7.27772842e-01, 9.27792082e-01, 9.03903230e-01,
        1.69437184e+00, 1.98724571e+00]])

In [38]:
# Feature matrix of every author kept in Authors (lizon's own row is included).
others = Authors[columns_to_compare].to_numpy()
others

array([[0.31010453, 0.10801394, 0.35      , ..., 0.87218147, 1.43300956,
        2.29085165],
       [0.12467532, 0.20519481, 0.10686016, ..., 0.9646684 , 1.86894488,
        3.49878412],
       [0.45532646, 0.16102111, 0.5       , ..., 0.92386224, 1.78576606,
        2.2556641 ],
       ...,
       [0.45989305, 0.04812834, 0.        , ..., 0.91584755, 1.53872252,
        2.10129249],
       [0.6746988 , 0.1686747 , 0.        , ..., 0.91176725, 1.8948778 ,
        1.97651328],
       [0.48031496, 0.04724409, 0.        , ..., 0.87512282, 1.44298199,
        1.97819462]])

In [39]:
# Cosine similarity of lizon's feature vector against every row of `others`;
# the feature columns are used as they are, without any scaling.
result = cosine_similarity(X=lizon, Y=others)

In [40]:
# Store the similarity to lizon (first row of `result`) as a new column.
# assign() returns a new frame, so nothing is written into the filtered slice
# of the author table (item assignment on that slice triggers pandas'
# SettingWithCopyWarning). The values are matched to rows by position.
Authors = Authors.assign(lizon_sim=result[0])

In [41]:
# Authors with similarity to lizon above the threshold, most similar first.
lizon_sim_threshold = 0.96
above_threshold = Authors['lizon_sim'].gt(lizon_sim_threshold)
SimilarAuthors = Authors.loc[above_threshold].sort_values(by='lizon_sim', ascending=False)

In [42]:
# Number of distinct authors that passed the similarity threshold.
SimilarAuthors['Author_Id'].nunique()

518

In [43]:
# Save the similar authors (header row written by default, no index column).
SimilarAuthors.to_csv(SimilarAuthors_full_filename, index=False)

In [44]:
# Rows of selected accounts of interest - presumably lizon and accounts already
# suspected to be her clones (TODO confirm where this list comes from).
accounts_of_interest = [69715,300514,100899,424711,472696,475085,711697,731728]
SimilarAuthors.loc[SimilarAuthors['Author_Id'].isin(accounts_of_interest)]

Unnamed: 0,Author_Id,author,cnt_messages,First_Message_Id,Last_Message_Id,first_message_ts,last_message_ts,Binned Number of Messages,cnt_2021_messages,num_of_tneg_messages,...,avg_TTR,avg_TTR_A,avg_TTR_N,avg_TTR_V,avg_NAV,avg_UNAV,avg_fraction_of_Nouns,avg_ASW,avg_PLW,lizon_sim
3485,69715,lizon **,1274,49189796,62257633,2009-08-28 11:50:00,2010-12-07 23:18:00,"(1000, 5000]",0.0,520.0,...,0.893255,0.727773,0.927792,0.903903,1.694372,1.987246,0.218237,1.887698,0.102372,1.0
17627,475085,Элиззи H*,3317,90184059,98561081,2015-07-26 22:11:00,2019-03-21 21:20:00,"(1000, 5000]",0.0,1159.0,...,0.878784,0.751416,0.903675,0.891083,1.758879,2.211228,0.210558,1.799589,0.092318,0.99794
17566,472696,Кассиапея D,1093,89966258,98530888,2015-06-24 22:40:00,2019-03-16 15:05:00,"(1000, 5000]",0.0,365.0,...,0.87231,0.752578,0.899809,0.87733,1.70885,2.180732,0.210773,1.799353,0.096391,0.997702
19218,711697,Lizonn +,136,98135519,98598759,2018-12-29 16:55:00,2019-03-28 23:41:00,"(100, 1000]",0.0,35.0,...,0.882333,0.764493,0.89573,0.916748,1.807151,2.284331,0.210087,1.847633,0.097056,0.996895
16940,424711,Joconda Mona Liza _,390,84147592,89805527,2014-02-07 02:23:00,2015-06-04 17:02:00,"(100, 1000]",0.0,136.0,...,0.891069,0.78263,0.900957,0.898081,1.742168,2.004457,0.213476,1.843177,0.105652,0.996605
14771,300514,"Lizon "" **K**",955,62997968,78899997,2011-01-04 14:22:00,2013-01-19 20:38:00,"(100, 1000]",0.0,370.0,...,0.901131,0.764044,0.924115,0.915488,1.770354,2.0803,0.219439,1.844406,0.096304,0.994347
19739,731728,newyorck D,2142,100170690,103317149,2020-02-17 15:43:00,2021-09-16 20:49:00,"(1000, 5000]",759.0,711.0,...,0.879905,0.746728,0.90968,0.887026,1.714989,2.376002,0.225905,1.828435,0.091608,0.985828
5399,100899,lizlizon *,471,78259900,83602925,2012-12-09 13:15:00,2013-12-27 03:05:00,"(100, 1000]",0.0,164.0,...,0.886032,0.784745,0.897056,0.892494,1.794401,2.117271,0.214524,1.909488,0.113478,0.982347


In [45]:
# Least similar authors that still passed the threshold (the frame is sorted by lizon_sim, descending).
SimilarAuthors.tail()

Unnamed: 0,Author_Id,author,cnt_messages,First_Message_Id,Last_Message_Id,first_message_ts,last_message_ts,Binned Number of Messages,cnt_2021_messages,num_of_tneg_messages,...,avg_TTR,avg_TTR_A,avg_TTR_N,avg_TTR_V,avg_NAV,avg_UNAV,avg_fraction_of_Nouns,avg_ASW,avg_PLW,lizon_sim
18038,492389,id32060016 F**,167,94552308,103118353,2017-03-09 17:59:00,2021-08-02 22:32:00,"(100, 1000]",0.0,65.0,...,0.899059,0.77815,0.91716,0.907656,1.786543,2.024365,0.270697,1.87633,0.08683,0.960762
14430,283990,Il segreto H*,255,62469252,99168769,2010-12-14 18:27:00,2019-07-22 00:04:00,"(100, 1000]",0.0,107.0,...,0.877898,0.710177,0.91784,0.896751,1.673079,1.989966,0.200532,1.826354,0.083641,0.960754
14017,272443,Giny SD*,136,67866568,99955691,2011-07-03 19:37:00,2020-01-07 18:38:00,"(100, 1000]",0.0,47.0,...,0.883695,0.787058,0.869659,0.920404,1.780714,1.959954,0.194448,1.74262,0.071489,0.960658
5590,103636,Yaja F,464,84386944,100781738,2014-02-22 03:57:00,2020-05-25 15:50:00,"(100, 1000]",0.0,196.0,...,0.870896,0.594214,0.90789,0.909075,1.544486,1.646318,0.206442,1.821358,0.076211,0.960476
8505,150162,twingo1 D,364,66778954,103086987,2011-05-12 22:47:00,2021-07-27 16:18:00,"(100, 1000]",0.0,109.0,...,0.923722,0.806684,0.959305,0.916087,1.777846,2.270575,0.21643,1.975135,0.113492,0.960152


In [46]:
# Most similar authors (the frame is sorted by lizon_sim, descending; lizon herself comes first).
SimilarAuthors.head()

Unnamed: 0,Author_Id,author,cnt_messages,First_Message_Id,Last_Message_Id,first_message_ts,last_message_ts,Binned Number of Messages,cnt_2021_messages,num_of_tneg_messages,...,avg_TTR,avg_TTR_A,avg_TTR_N,avg_TTR_V,avg_NAV,avg_UNAV,avg_fraction_of_Nouns,avg_ASW,avg_PLW,lizon_sim
3485,69715,lizon **,1274,49189796,62257633,2009-08-28 11:50:00,2010-12-07 23:18:00,"(1000, 5000]",0.0,520.0,...,0.893255,0.727773,0.927792,0.903903,1.694372,1.987246,0.218237,1.887698,0.102372,1.0
14695,296493,Ленкаа SD**,1502,65364505,98686725,2011-03-24 15:41:00,2019-04-13 15:55:00,"(1000, 5000]",0.0,761.0,...,0.879763,0.70829,0.903753,0.909783,1.70371,1.810607,0.220513,1.81644,0.07795,0.999307
20291,751973,Canari KF*,621,101951222,103020466,2020-12-24 14:14:00,2021-07-14 13:35:00,"(100, 1000]",275.0,205.0,...,0.849056,0.566851,0.903223,0.884859,1.460159,1.758969,0.210652,1.811461,0.077361,0.998967
17627,475085,Элиззи H*,3317,90184059,98561081,2015-07-26 22:11:00,2019-03-21 21:20:00,"(1000, 5000]",0.0,1159.0,...,0.878784,0.751416,0.903675,0.891083,1.758879,2.211228,0.210558,1.799589,0.092318,0.99794
17566,472696,Кассиапея D,1093,89966258,98530888,2015-06-24 22:40:00,2019-03-16 15:05:00,"(1000, 5000]",0.0,365.0,...,0.87231,0.752578,0.899809,0.87733,1.70885,2.180732,0.210773,1.799353,0.096391,0.997702


## Messages to analyze

In [47]:
# Reload the similar authors saved above.
# `error_bad_lines` was removed in pandas 2.0; on_bad_lines='warn' (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are reported and skipped.
SimilarAuthors = pd.read_csv(SimilarAuthors_full_filename, on_bad_lines='warn', index_col=False)

In [48]:
# Load the full message table again (it was deleted earlier to free memory).
# `error_bad_lines` was removed in pandas 2.0; on_bad_lines='warn' (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are reported and skipped.
Messages = pd.read_csv(Messages_full_filename, on_bad_lines='warn', index_col=False)

In [49]:
# Keep only the messages written by the similar authors (inner join on Author_Id).
similar_author_ids = SimilarAuthors[['Author_Id']]
SMessages = Messages.merge(similar_author_ids, on='Author_Id', how='inner')

In [50]:
# Drop the reference to the full table; only SMessages is used from here on.
del Messages

In [51]:
# Number of messages written by the similar authors.
SMessages.shape[0]

369130

Medium Size messages only (need to load additional data)

In [52]:
# Load the extended text attributes again (contains message_words).
# `error_bad_lines` was removed in pandas 2.0; on_bad_lines='warn' (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are reported and skipped.
ta_ext = pd.read_csv(ta_extension_full_filename, on_bad_lines='warn', index_col=False)

In [53]:
# Attach the word count of every message; the inner join keeps only messages
# that are present in both tables.
word_counts = ta_ext[['Message_Id', 'message_words']]
SMessages = SMessages.merge(word_counts, on='Message_Id', how='inner')

In [54]:
# The text-attribute table is not needed once message_words has been merged in.
del ta_ext

In [55]:
# Medium-size messages: more than short_message_limit_words words
# and at most medium_message_limit_words words.
longer_than_short = SMessages['message_words'].gt(short_message_limit_words)
within_medium = SMessages['message_words'].le(medium_message_limit_words)
SMessages = SMessages[longer_than_short & within_medium]

In [56]:
# Row count after keeping medium-size messages only.
SMessages.shape[0]

296782

Joining original forum post text, without cleaning html tags etc

In [57]:
# Load the original (uncleaned) post texts again.
# `error_bad_lines` was removed in pandas 2.0; on_bad_lines='warn' (pandas >= 1.3)
# keeps the previous behaviour: malformed rows are reported and skipped.
OriginalMessages = pd.read_csv(OriginalMessages_full_filename, on_bad_lines='warn', index_col=False)

In [58]:
# Add the original post text to every message (inner join on Message_Id).
original_text = OriginalMessages[['Message_Id', 'original_message']]
SMessages = SMessages.merge(original_text, on='Message_Id', how='inner')

In [59]:
# The original-text table is not needed once original_message has been merged in.
del OriginalMessages

In [60]:
# Check the columns available after the two merges.
SMessages.columns

Index(['Message_Id', 'Timestamp', 'Topic_1st_Message', 'Parent_Id',
       'Author_Id', 'author', 'Topic_Id', 'Topic', 'Chapter_Id', 'Chapter',
       'message', 'cnt_childs', 'cnt_immediate_childs', 'message_words',
       'original_message'],
      dtype='object')

In [61]:
# Columns kept for the analysis; the per-post 'author' name and 'message_words'
# are left out here.
kept_columns = [
    'Message_Id', 'Timestamp', 'Topic_1st_Message', 'Parent_Id',
    'Author_Id', 'Topic_Id', 'Topic', 'Chapter_Id', 'Chapter',
    'original_message', 'message', 'cnt_childs', 'cnt_immediate_childs',
]
SMessages = SMessages[kept_columns]

Unifying author names (they can vary between posts) for further analysis

In [62]:
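# Authors is not reloaded here: the frame filtered earlier in the notebook is reused
# (SimilarAuthors was selected from it, so every similar author is present in it).
#Authors = pd.read_csv(Authors_full_filename, error_bad_lines=False, index_col=False)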

In [63]:
# Attach the author name from the Authors table, so the name is taken from
# one place instead of the individual posts.
author_names = Authors[['Author_Id', 'author']]
SMessages = SMessages.merge(author_names, on='Author_Id', how='inner')

In [64]:
# Row count after attaching the author names.
SMessages.shape[0]

296782

In [65]:
# Preview the final message dataset.
SMessages.head()

Unnamed: 0,Message_Id,Timestamp,Topic_1st_Message,Parent_Id,Author_Id,Topic_Id,Topic,Chapter_Id,Chapter,original_message,message,cnt_childs,cnt_immediate_childs,author
0,90328113,2015-08-17 22:06:00,N,0,72667,3372512,секции плавания,137,Детская психология и развитие,"<div class=""body"">А можно ли сразу сориентиров...",А можно ли сразу сориентировать именно на спор...,7,2,kathy04 H*
1,92104197,2016-03-15 11:17:00,N,0,72667,3419591,Дети красивые,137,Детская психология и развитие,"<div class=""body"">Присоединяюсь. У ребенка не ...","Присоединяюсь. У ребенка не будет комплексов, ...",3,1,kathy04 H*
2,92841631,2016-06-24 18:39:00,N,0,72667,3437299,попрошайка,137,Детская психология и развитие,"<div class=""body"">Мне лично кажется, что это о...","Мне лично кажется, что это отголосок американс...",0,0,kathy04 H*
3,92838747,2016-06-24 11:30:00,N,92838447,72667,3437454,Что умела 10-летняя девочка 100 лет назад на Р...,137,Детская психология и развитие,"<div class=""body"">Это больше советское выражен...","Это больше советское выражение, хотя было в хо...",4,1,kathy04 H*
4,92838772,2016-06-24 11:34:00,N,92838615,72667,3437454,Что умела 10-летняя девочка 100 лет назад на Р...,137,Детская психология и развитие,"<div class=""body"">Весь вопрос в размерах помощ...",Весь вопрос в размерах помощи. На мне реально ...,0,0,kathy04 H*


In [66]:
# Save the messages of lizon and the similar authors (header row written by default, no index column).
SMessages.to_csv(all_lizon_messages_full_filename, index=False)

In [67]:
# Which of the selected accounts of interest are present among the similar authors
# (presumably lizon and her suspected clones - TODO confirm where this list comes from).
accounts_of_interest = [69715,300514,100899,424711,472696,475085,711697,731728]
is_of_interest = SimilarAuthors['Author_Id'].isin(accounts_of_interest)
SimilarAuthors.loc[is_of_interest, 'Author_Id'].unique()

array([ 69715, 475085, 472696, 711697, 424711, 300514, 731728, 100899])