In [44]:
import pandas as pd
import random 
from bertopic import BERTopic

from bert_helpers import * 
import sys
sys.path.append('../helpers_python')
from pre_processing import *
from helpers import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Chose the country

In [8]:
country = 'Turkey'

# Load data
df = load_data_bert(country)

# array that can save the models
saved_models = []

## Set parameters

In [73]:
## Best params until now but cannot catch coronavirus
params_france = {
    'UMAP' : {
        'metric' : 'cosine', 
        'n_neighbors' : 15, 
        'n_components' : 20, 
        'min_dist' : 0.3, 
        'low_memory' : False,
        'random_state': 8
    },
    'HDBSCAN': {
        'min_cluster_size':15,
        'min_samples': 1,
        'cluster_selection_epsilon': 0.6,
        'metric': 'euclidean',                      
        'cluster_selection_method': 'eom',
        'prediction_data': True}
}

## Train model with french parameters
Using the french parameters we find 4 topics:
1. Politics of Turkey and tweets that are against  the state order. Two party and their leader are cited Gergerlioglu that defend human right and Erdogan than is the Prime minister from the right
1. Woman right and muslim religion 
1. Police and jail
1. Revolts 

A lot of topics are mixed and we can see on the vizualisation that there might be clusters inside clusters 

In [74]:
tweets = df.clean.to_list()
tweets_sub = random.sample(tweets,k=int(len(tweets)*0.2))

In [75]:
model_fr = get_model(params_france, additional_stop_words= [])
topics_fr, probs_fr = model_fr.fit_transform(tweets_sub)
model_fr.get_topic_info()

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

2022-12-17 18:03:00,202 - BERTopic - Transformed documents to Embeddings
2022-12-17 18:03:15,757 - BERTopic - Reduced dimensionality
2022-12-17 18:03:16,078 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,189,-1_continue_hpg_prison_people
1,0,436,0_say_life_god_one
2,1,415,1_turkey_turkish_people_kurdish
3,2,16,2_tweet_twitter_tag_follow


In [76]:
model_fr.get_topics()

{-1: [('continue', 0.027934142388239158),
  ('hpg', 0.02254576703001712),
  ('prison', 0.022505022176422904),
  ('people', 0.02175778295192381),
  ('resistance', 0.020174406557373153),
  ('camp', 0.019689862867660845),
  ('want', 0.01955389967176741),
  ('police', 0.019366787536394897),
  ('house', 0.018994960390026804),
  ('day', 0.018857273655944258)],
 0: [('say', 0.039538063044033),
  ('life', 0.030884439875588258),
  ('god', 0.02903674101039711),
  ('one', 0.026811276264926253),
  ('child', 0.025890103443839126),
  ('mother', 0.025489818210189225),
  ('hakan', 0.023512045641005556),
  ('year', 0.02049849647810913),
  ('prison', 0.020187912283320005),
  ('justice', 0.019624293055014643)],
 1: [('turkey', 0.05253238116704927),
  ('turkish', 0.048903769580161814),
  ('people', 0.044066497907941544),
  ('kurdish', 0.03940554563544222),
  ('gergerlio', 0.02575295456500725),
  ('state', 0.025583585874143513),
  ('kurd', 0.024107590750244563),
  ('march', 0.023814673955813978),
  ('erdog

In [77]:
get_tweets_of_topic(topics_fr, 2, tweets_sub, 15)

['news story anything tweet',
 'akp mhp supporter water news men curse shook social medium one',
 'think unfollowed mistake understand course easy secret follower current follower',
 'type password account hack enable two step protection setting change password',
 'beg tag tweet tell follow anyway come guy play dm',
 'aim anyway thank',
 'fit unidad political force denounces million active ingredient produce p',
 'follow person twitter allow say follow increase allow h seyin ga',
 'tag tt list terrible attack tag rain tweet far tweet system',
 'get start go live twitter tap tweet composer icon tap camera icon select live bott',
 'please share rt',
 'like comment tt please either tweet retweet ver rkiye gazi hapiste',
 'unfollowing anyone rt tweet anl yonmu',
 'ya hack',
 'say pk kci thousand troll spammed account account suspend twitter say intense spam attack']

In [66]:
model_fr.visualize_heatmap()

In [78]:
model_fr.visualize_documents(tweets_sub)

## Find parameters that are more adapted to Turkey 

1. Tweets related to kurdistan and the questio of security around it
1. Tweets related to prison 
1. All tweets related to Omer Faruk Gergerlioglu
1. Tweets related to the army and possible conflict with russia 
1. Tweets related to the KHK party that is close to nazis party
1. Tweets related to the struggle

In [68]:
params= {
    'UMAP' : {
        'metric' : 'cosine', 
        'n_neighbors' : 15, 
        'n_components' : 30, 
        'min_dist' : 0.2, 
        'low_memory' : False,
        'random_state': 8
    },
    'HDBSCAN': {
        'min_cluster_size':15,
        'min_samples': 1,
        'cluster_selection_epsilon': 0.5,
        'metric': 'euclidean',                      
        'cluster_selection_method': 'eom',
        'prediction_data': True}
}
model = get_model(params, additional_stop_words= [])
topics, probs = model.fit_transform(tweets_sub)
model.get_topic_info()

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

2022-12-17 13:27:18,196 - BERTopic - Transformed documents to Embeddings
2022-12-17 13:27:26,564 - BERTopic - Reduced dimensionality
2022-12-17 13:27:26,721 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,225,-1_people_newroz_right_one
1,0,278,0_turkey_turkish_kurdish_kurd
2,1,232,1_gergerlio_lu_say_people
3,2,91,2_woman_march_freedom_day
4,3,90,3_parliament_vote_akp_hdp
5,4,34,4_account_mehmet_baransu_course
6,5,33,5_osman_service_publication_book
7,6,23,6_university_student_azi_bo
8,7,18,7_prison_isolation_hunger_strike
9,8,16,8_vaccine_corona_treat_coronavirus


In [69]:
get_tweets_of_topic(topics_fr, 2, tweets_sub, 15)

[]

In [70]:
model.get_topics()

{-1: [('people', 0.0269694805226164),
  ('newroz', 0.023030446614633892),
  ('right', 0.022604352873812968),
  ('one', 0.020992527672773195),
  ('police', 0.02033768140030585),
  ('terrorist', 0.020018986580170612),
  ('say', 0.019111962550127193),
  ('human', 0.017506735045125885),
  ('death', 0.017272834960975418),
  ('like', 0.016557308499935364)],
 0: [('turkey', 0.06417184137227112),
  ('turkish', 0.06051693368862293),
  ('kurdish', 0.03962279508071177),
  ('kurd', 0.026629805372155767),
  ('attack', 0.025825107354068344),
  ('state', 0.025157593277810728),
  ('erdogan', 0.024356427331305767),
  ('year', 0.02342747686593441),
  ('people', 0.023288636503777115),
  ('kill', 0.01990884337321266)],
 1: [('gergerlio', 0.04375398092454129),
  ('lu', 0.04109871258934767),
  ('say', 0.03193757068868136),
  ('people', 0.026000822432012027),
  ('god', 0.024954639147307747),
  ('life', 0.024830512158400177),
  ('mfg', 0.024319563718664696),
  ('hand', 0.022822121276876224),
  ('heart', 0.022

In [71]:
model.visualize_documents(tweets_sub)

In [72]:
get_coherence(model,tweets_sub, topics)

0.6603577843041664