# Data cleaning

In this file, we cleaned the downloaded data:
The main steps include:
1. Adding party affiliation to tweet rows
2. Deleting unnecessary downloaded Retweets.
3. Deleting links and mentions from the tweets text and saving them to separate columns
4. Expanding the column of public metrics
5. Encoding emojis in a unified format
6. Translating tweets using Google Translate in Google Sheets
7. Saving all downloaded tweets to one file

### 1.  Used libraries

In [427]:
import os
import pandas as pd
import re
import emoji

### 2. Reading JSON files and transforming them into party-specific pickle files

In [428]:
base_input_path = 'data/01.raw/tweets_data_final' # change folder if needed
subfolders = ['Konfederacja', 'NL', 'PiS', 'PO', 'PL2050', 'PSL']
output_folder = 'data/01.raw/tweets_data_combined'

os.makedirs(output_folder, exist_ok=True)

for subfolder in subfolders:
    folder_path = os.path.join(base_input_path, subfolder)
    dataframes = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            politician = filename.split("_tweets.json")[0]
            try:
                df = pd.read_json(file_path)  
                df["username"] = politician  
                df["party"] = subfolder
                print(f"Read {len(df)} rows from {file_path}")  
                dataframes.append(df)
            except ValueError as e:
                print(f"Error reading {file_path}: {e}")
    
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        
        output_file_path = os.path.join(output_folder, f'{subfolder}_combined.pkl')
        combined_df.to_pickle(output_file_path) 
        
        print(f"Saved {subfolder} combined data to {output_file_path}")

print("Processing complete!")

Read 11 rows from data/01.raw/tweets_data_final\Konfederacja\Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json
Read 964 rows from data/01.raw/tweets_data_final\Konfederacja\bartlomiejpejo_2023-10-16_2024-10-15.json
Read 1318 rows from data/01.raw/tweets_data_final\Konfederacja\KonradBerkowicz_2024-04-16_2024-10-15_vol1.json
Read 597 rows from data/01.raw/tweets_data_final\Konfederacja\MichalWawer_2023-10-16_2024-10-15.json
Read 772 rows from data/01.raw/tweets_data_final\Konfederacja\MarSypniewski_2023-10-16_2024-10-15.json
Read 175 rows from data/01.raw/tweets_data_final\Konfederacja\TudujKrzysztof_2023-10-16_2024-10-15.json
Read 721 rows from data/01.raw/tweets_data_final\Konfederacja\SlawomirMentzen_2023-10-16_2024-10-15.json
Read 750 rows from data/01.raw/tweets_data_final\Konfederacja\WTumanowicz_2023-10-16_2024-10-15.json
Read 950 rows from data/01.raw/tweets_data_final\Konfederacja\Wlodek_Skalik_2023-10-16_2024-10-15.json
Read 289 rows from data/01.raw/tweets_data_final\Konfederacja\Ko

### 3. Data cleaning

In [429]:
df_konf = pd.read_pickle(os.path.join(output_folder, 'Konfederacja_combined.pkl'))
df_NL = pd.read_pickle(os.path.join(output_folder, 'NL_combined.pkl'))
df_PIS = pd.read_pickle(os.path.join(output_folder, 'PIS_combined.pkl'))
df_PO = pd.read_pickle(os.path.join(output_folder, 'PO_combined.pkl'))
df_PL2050 = pd.read_pickle(os.path.join(output_folder, 'PL2050_combined.pkl'))
df_PSL = pd.read_pickle(os.path.join(output_folder, 'PSL_combined.pkl'))

In [430]:
df_konf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8177 entries, 0 to 8176
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   created_at              8177 non-null   datetime64[ns, UTC]
 1   text                    8177 non-null   object             
 2   id                      8177 non-null   int64              
 3   author_id               8177 non-null   int64              
 4   edit_controls           8177 non-null   object             
 5   possibly_sensitive      8177 non-null   bool               
 6   reply_settings          8177 non-null   object             
 7   public_metrics          8177 non-null   object             
 8   lang                    8177 non-null   object             
 9   edit_history_tweet_ids  8177 non-null   object             
 10  conversation_id         8177 non-null   int64              
 11  category                8177 non-null   obj

In [431]:
df_konf.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],1765380709114409216,Original,,,Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],1764245727646044160,Original,{'media_keys': ['3_1764245720805040128']},"{'urls': [{'start': 57, 'end': 80, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],1763931255521857792,Original,{'media_keys': ['7_1763931103151202304']},"{'urls': [{'start': 135, 'end': 158, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],1763576337078313216,Original,{'media_keys': ['3_1763576329385967617']},"{'urls': [{'start': 64, 'end': 87, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],1763173358068281600,Original,{'media_keys': ['3_1763173351466401792']},"{'urls': [{'start': 128, 'end': 151, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,


In [432]:
# Merge all dataframes into one
df = pd.concat([df_konf, df_NL, df_PIS, df_PO, df_PL2050, df_PSL], ignore_index=True)

In [433]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26677 entries, 0 to 26676
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   created_at              26677 non-null  datetime64[ns, UTC]
 1   text                    26677 non-null  object             
 2   id                      26661 non-null  float64            
 3   author_id               26677 non-null  float64            
 4   edit_controls           26661 non-null  object             
 5   possibly_sensitive      26661 non-null  object             
 6   reply_settings          26677 non-null  object             
 7   public_metrics          26677 non-null  object             
 8   lang                    26677 non-null  object             
 9   edit_history_tweet_ids  26661 non-null  object             
 10  conversation_id         26661 non-null  float64            
 11  category                26677 non-null  o

In [434]:
len(df)

26677

In [435]:
pd.options.display.float_format = '{:.0f}'.format
df['id'] = df['id'].fillna(0).astype('int64')
df['id']

0        1765380709114409216
1        1764245727646044160
2        1763931255521857792
3        1763576337078313216
4        1763173358068281600
                ...         
26672    1715066644018381312
26673    1714927388637696256
26674    1714675864237924864
26675    1714370461507748096
26676    1714370461507748096
Name: id, Length: 26677, dtype: int64

In [436]:
# Get the value counts of 'id'
id_counts = df['id'].value_counts()

# Filter the counts to show only those greater than 1
id_counts_above_1 = id_counts[id_counts > 1]

# Display the counts
print(f"IDs with counts greater than 1:\n{id_counts_above_1}")

IDs with counts greater than 1:
id
0                      16
1780108572161945856     3
1780104870302732544     2
1734658925658619904     2
1713706323672539392     2
                       ..
1806263584877326848     2
1778400392470056960     2
1714215043720442368     2
1713961177028415488     2
1734231599116374272     2
Name: count, Length: 114, dtype: int64


In [437]:
id_counts_above_1.sum()

243

In [438]:
# Count unique IDs
non_duplicate_counts = df['id'].nunique()
print(f"Number of unique IDs: {non_duplicate_counts}")

# Count duplicate IDs
duplicate_counts = df['id'].duplicated().sum()
print(f"Number of duplicate IDs: {duplicate_counts}")

# Get the value counts of 'id'
id_counts = df['id'].value_counts()

# Filter the counts to show only those greater than 1
id_counts_above_1 = id_counts[id_counts > 1]

# Sum of counts of IDs that appear more than once
total_duplicate_rows = id_counts_above_1.sum()
print(f"Total number of duplicate rows based on 'id': {total_duplicate_rows}")

# Convert all columns to strings to avoid unhashable types
df_str = df.astype(str)

# Now check for exact duplicate rows across all columns
duplicates_all = df_str[df_str.duplicated(keep=False)]
print(f"Total duplicate rows (exact match across all columns): {duplicates_all.shape[0]}")
duplicates_all

Number of unique IDs: 26548
Number of duplicate IDs: 129
Total number of duplicate rows based on 'id': 243
Total duplicate rows (exact match across all columns): 130


Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
9,2024-02-26 13:35:28+00:00,"Nie ma takiej obietnicy, której polityk nie ob...",1762109123800293376,1.5544839915117036e+18,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,['1762109123800293457'],1.7621091238002934e+18,Original,{'media_keys': ['3_1762109117865304064']},"{'urls': [{'start': 80, 'end': 103, 'url': 'ht...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
10,2024-02-26 13:35:28+00:00,"Nie ma takiej obietnicy, której polityk nie ob...",1762109123800293376,1.5544839915117036e+18,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,['1762109123800293457'],1.7621091238002934e+18,Original,{'media_keys': ['3_1762109117865304064']},"{'urls': [{'start': 80, 'end': 103, 'url': 'ht...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
2291,2024-04-16 05:38:51+00:00,❗SPOTKANIE CZŁONKÓW I SYMPATYKÓW KLUBU KONFEDE...,1780108572161945856,1420353350.0,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 5, 'reply_count': 1, 'like_c...",pl,['1780108572161945969'],1.7801085721619459e+18,Original,{'media_keys': ['3_1780108567447560193']},"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",KonradBerkowicz_2024-04-16_2024-10-15_vol1.json,Konfederacja,,,,
2292,2024-04-16 05:38:51+00:00,❗SPOTKANIE CZŁONKÓW I SYMPATYKÓW KLUBU KONFEDE...,1780108572161945856,1420353350.0,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 5, 'reply_count': 1, 'like_c...",pl,['1780108572161945969'],1.7801085721619459e+18,Original,{'media_keys': ['3_1780108567447560193']},"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",KonradBerkowicz_2024-04-16_2024-10-15_vol1.json,Konfederacja,,,,
2888,2023-10-17 10:47:20+00:00,Ponad 43 tysiące wyborców oddało swoje głosy n...,1714231616065708544,9.417106438534472e+17,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 35, 'reply_count': 111, 'lik...",pl,['1714231616065708419'],1.7142316160657085e+18,Original,{'media_keys': ['3_1714231608536891392']},"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",MichalWawer_2023-10-16_2024-10-15.json,Konfederacja,,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25603,2023-10-16 08:28:55+00:00,"Wiara, determinacja, silne partycypacyjne przy...",1713834394127884544,325592746.0,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0.0,everyone,"{'retweet_count': 21, 'reply_count': 3, 'like_...",pl,['1713834394127884470'],1.7138343941278845e+18,Quote,,"{'hashtags': [{'start': 78, 'end': 91, 'tag': ...",JarubasAdam_2023-10-16_2024-10-15.json,PSL,,,"[{'type': 'quoted', 'id': '1713640726922195372'}]",
25835,2024-02-19 11:38:01+00:00,Kontrole @gijhars obejmują wszystkie transport...,1759542851396940288,1201875318.0,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0.0,everyone,"{'retweet_count': 7, 'reply_count': 2, 'like_c...",pl,['1759542851396940198'],1.7595428513969403e+18,Quote,,"{'urls': [{'start': 249, 'end': 272, 'url': 'h...",StefanKrajewski_2023-10-16_2024-10-15.json,PSL,,,"[{'type': 'quoted', 'id': '1759533295833096303'}]",
25836,2024-02-19 11:38:01+00:00,Kontrole @gijhars obejmują wszystkie transport...,1759542851396940288,1201875318.0,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0.0,everyone,"{'retweet_count': 7, 'reply_count': 2, 'like_c...",pl,['1759542851396940198'],1.7595428513969403e+18,Quote,,"{'urls': [{'start': 249, 'end': 272, 'url': 'h...",StefanKrajewski_2023-10-16_2024-10-15.json,PSL,,,"[{'type': 'quoted', 'id': '1759533295833096303'}]",
25855,2023-10-17 20:07:37+00:00,Wszystkim Wam serdecznie dziekuję. Za wsparcie...,1714372612611060224,1201875318.0,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0.0,everyone,"{'retweet_count': 10, 'reply_count': 6, 'like_...",pl,['1714372612611060222'],1.7143726126110602e+18,Original,{'media_keys': ['7_1714372479076954113']},"{'urls': [{'start': 81, 'end': 104, 'url': 'ht...",StefanKrajewski_2023-10-16_2024-10-15.json,PSL,,,,{'place_id': '47c001064da7125c'}


brief insight into how do these duplicates look like

In [439]:
df[df['id'].duplicated(keep=False)].sort_values(by='id')


Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
19626,2023-10-16 22:41:00+00:00,@DorotaNiedziela ja Tobie też❣️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19621,2023-10-16 00:00:00+00:00,@Gidziela 🥰✌️,0,61552404,,,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19620,2023-10-16 00:00:00+00:00,@REL_76 🥰🥰🥰,0,61552404,,,everyone,"{'retweet_count': 1, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19619,2023-10-16 00:00:00+00:00,"@MCichonAlicja Alu, czekamy jeszcze na wynik?",0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19624,2023-10-16 21:57:00+00:00,@jasinska_e ❤️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25119,2024-09-13 08:23:40+00:00,W ramach roboczego kontaktu z @WodyPolskie ora...,1834508231009325568,964017524,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",0,everyone,"{'retweet_count': 15, 'reply_count': 2, 'like_...",pl,[1834508231009325542],1834508231009325568,Original,,"{'mentions': [{'start': 30, 'end': 42, 'userna...",DariuszKlimczak_2023-10-16_2024-10-15.json,PSL,,"[{'domain': {'id': '11', 'name': 'Sport', 'des...",,
22677,2024-09-19 06:31:07+00:00,Premier @donaldtusk : namierzono człowieka prz...,1836654233296244992,52367150,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 567, 'reply_count': 221, 'li...",pl,[1836654233296244914],1836654233296244992,Original,,"{'mentions': [{'start': 8, 'end': 19, 'usernam...",CTomczyk_2023-10-16_2024-10-15_ERROR.json,PO,,"[{'domain': {'id': '10', 'name': 'Person', 'de...",,
22676,2024-09-19 06:31:07+00:00,Premier @donaldtusk : namierzono człowieka prz...,1836654233296244992,52367150,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 568, 'reply_count': 221, 'li...",pl,[1836654233296244914],1836654233296244992,Original,,"{'mentions': [{'start': 8, 'end': 19, 'usernam...",CTomczyk_2023-10-16_2024-10-15_ERROR.json,PO,,"[{'domain': {'id': '10', 'name': 'Person', 'de...",,
17012,2024-09-24 14:22:59+00:00,Potwierdza się to o czym mówiliśmy już od dawn...,1838584923688444416,138048156,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 141, 'reply_count': 125, 'li...",pl,[1838584923688444342],1838584923688444416,Quote,,"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",mblaszczak_2023-10-16_2024-10-15.json,PiS,,,"[{'type': 'quoted', 'id': '1838307356918071625'}]",


In [440]:
for col in df.columns:
    if df[col].apply(lambda x: isinstance(x, dict)).any():
        print(f"Column '{col}' contains dictionaries.")
    elif df[col].apply(lambda x: isinstance(x, list)).any():
        print(f"Column '{col}' contains lists.")

Column 'edit_controls' contains dictionaries.
Column 'public_metrics' contains dictionaries.
Column 'edit_history_tweet_ids' contains lists.
Column 'attachments' contains dictionaries.
Column 'entities' contains dictionaries.
Column 'context_annotations' contains lists.
Column 'referenced_tweets' contains lists.
Column 'geo' contains dictionaries.


In [441]:
# Get all duplicate IDs
duplicate_ids = df[df['id'].duplicated(keep=False)]

# Exclude columns with unhashable (dict-like) values
columns_to_exclude = ['edit_controls', 'public_metrics', 'attachments', 'entities', 'geo', 'edit_history_tweet_ids', 'context_annotations','referenced_tweets']
valid_columns = [col for col in df.columns if col not in columns_to_exclude]

# Find differences across valid columns
diff_summary = duplicate_ids[valid_columns].groupby('id').nunique()

# Show columns where duplicates have different values
diff_summary = diff_summary[(diff_summary > 1).any(axis=1)]

In [442]:
diff_summary

Unnamed: 0_level_0,created_at,text,author_id,possibly_sensitive,reply_settings,lang,conversation_id,category,username,party,in_reply_to_user_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,9,16,2,0,1,1,0,2,2,2,0
1780108572161945856,1,1,1,1,1,1,1,1,2,1,0
1780146130551996672,1,1,1,1,1,1,1,1,2,1,1
1780171152914034944,1,1,1,1,1,1,1,1,2,1,1
1780244011212485120,1,1,1,1,1,1,1,1,2,1,0
1780309557610258688,1,1,1,1,1,1,1,1,2,1,0
1780345695163047936,1,1,1,1,1,1,1,1,2,1,1
1780345829615636480,1,1,1,1,1,1,1,1,2,1,1
1780346025854603264,1,1,1,1,1,1,1,1,2,1,1
1780346371888833024,1,1,1,1,1,1,1,1,2,1,1


In [443]:
duplicates = df[df.duplicated(subset=['id'], keep=False)]
duplicates

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
9,2024-02-26 13:35:28+00:00,"Nie ma takiej obietnicy, której polityk nie ob...",1762109123800293376,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1762109123800293457],1762109123800293376,Original,{'media_keys': ['3_1762109117865304064']},"{'urls': [{'start': 80, 'end': 103, 'url': 'ht...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
10,2024-02-26 13:35:28+00:00,"Nie ma takiej obietnicy, której polityk nie ob...",1762109123800293376,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1762109123800293457],1762109123800293376,Original,{'media_keys': ['3_1762109117865304064']},"{'urls': [{'start': 80, 'end': 103, 'url': 'ht...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
973,2023-10-17 08:22:19+00:00,Serdeczne dzięki za każdy głos. 🤝\nDla mnie to...,1714195119706890496,1182211615,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 15, 'reply_count': 28, 'like...",pl,[1714195119706890463],1714195119706890496,Original,{'media_keys': ['3_1714195114472431617']},"{'hashtags': [{'start': 251, 'end': 264, 'tag'...",bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja,,,,{'place_id': '535f0c2de0121451'}
974,2023-10-17 08:22:19+00:00,Serdeczne dzięki za każdy głos. 🤝\nDla mnie to...,1714195119706890496,1182211615,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 15, 'reply_count': 28, 'like...",pl,[1714195119706890463],1714195119706890496,Original,{'media_keys': ['3_1714195114472431617']},"{'urls': [{'start': 275, 'end': 298, 'url': 'h...",bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja,,,,{'place_id': '535f0c2de0121451'}
1974,2024-05-25 09:27:54+00:00,@MKierwinski Za to Wy bronicie ambasadora kraj...,1794299341743829248,1420353350,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 104, 'reply_count': 26, 'lik...",pl,[1794299341743829316],1794294356570264064,Reply,,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",KonradBerkowicz_2024-04-16_2024-10-15_vol1.json,Konfederacja,308367619,,"[{'type': 'replied_to', 'id': '179429435657026...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25856,2023-10-17 20:07:37+00:00,Wszystkim Wam serdecznie dziekuję. Za wsparcie...,1714372612611060224,1201875318,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 10, 'reply_count': 6, 'like_...",pl,[1714372612611060222],1714372612611060224,Original,{'media_keys': ['7_1714372479076954113']},"{'urls': [{'start': 81, 'end': 104, 'url': 'ht...",StefanKrajewski_2023-10-16_2024-10-15.json,PSL,,,,{'place_id': '47c001064da7125c'}
26266,2023-10-16 12:39:29+00:00,.@TakJestTVN24 na antenie @tvn24📺. Zapraszam d...,1713897452430909952,1070635531447558144,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 6, 'reply_count': 1, 'like_c...",pl,[1713897452430909939],1713897452430909952,Original,{'media_keys': ['3_1713897440191885312']},"{'urls': [{'start': 83, 'end': 106, 'url': 'ht...",PZgorzelskiP_2023-10-16_2024-10-15.json,PSL,,,,
26267,2023-10-16 12:39:29+00:00,.@TakJestTVN24 na antenie @tvn24📺. Zapraszam d...,1713897452430909952,1070635531447558144,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 6, 'reply_count': 1, 'like_c...",pl,[1713897452430909939],1713897452430909952,Original,{'media_keys': ['3_1713897440191885312']},"{'urls': [{'start': 83, 'end': 106, 'url': 'ht...",PZgorzelskiP_2023-10-16_2024-10-15.json,PSL,,,,
26675,2023-10-17 19:59:04+00:00,"Mówiliśmy jasno: albo #TrzeciaDroga, albo trze...",1714370461507748096,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 748, 'reply_count': 2136, 'l...",pl,[1714370461507748145],1714370461507748096,Original,,"{'mentions': [{'start': 59, 'end': 68, 'userna...",KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,,,


In [444]:
duplicate_text_count = df['text'].duplicated().sum()
print(f"Number of duplicate Text Entries: {duplicate_text_count}")

Number of duplicate Text Entries: 139


In [445]:
duplicate_id_text_rows = df[df.duplicated(subset=['id', 'text'], keep=False)]
print(f"Rows where BOTH `id` and `text` are duplicated: {len(duplicate_id_text_rows)}")

Rows where BOTH `id` and `text` are duplicated: 227


In [446]:
# Count occurrences of each ID
id_counts = df['id'].value_counts()
print("Distribution of duplicate IDs:")
print(id_counts.value_counts().sort_index())

# Count occurrences of each text
text_counts = df['text'].value_counts()
print("\nDistribution of duplicate Text Entries:")
print(text_counts.value_counts().sort_index())

Distribution of duplicate IDs:
count
1     26434
2       112
3         1
16        1
Name: count, dtype: int64

Distribution of duplicate Text Entries:
count
1    26404
2      130
3        3
4        1
Name: count, dtype: int64


In [447]:
# Get all duplicate ID rows
duplicate_id_rows = df[df.duplicated(subset=['id'], keep=False)]

# Get all duplicate Text rows
duplicate_text_rows = df[df.duplicated(subset=['text'], keep=False)]

# Get rows where both ID and Text are duplicated
duplicate_id_text_rows = df[df.duplicated(subset=['id', 'text'], keep=False)]

# Compare overlaps
print(f"Rows where ID is duplicated: {len(duplicate_id_rows)}")
print(f"Rows where Text is duplicated: {len(duplicate_text_rows)}")
print(f"Rows where BOTH ID and Text are duplicated: {len(duplicate_id_text_rows)}")

# Find duplicate IDs that are NOT in the text duplicate set
id_not_in_text = duplicate_id_rows[~duplicate_id_rows['id'].isin(duplicate_text_rows['id'])]
print(f"\nDuplicate IDs NOT duplicated in Text: {len(id_not_in_text)}")

# Find duplicate Texts that are NOT in the ID duplicate set
text_not_in_id = duplicate_text_rows[~duplicate_text_rows['text'].isin(duplicate_id_rows['text'])]
print(f"Duplicate Texts NOT duplicated in ID: {len(text_not_in_id)}")


Rows where ID is duplicated: 243
Rows where Text is duplicated: 273
Rows where BOTH ID and Text are duplicated: 227

Duplicate IDs NOT duplicated in Text: 16
Duplicate Texts NOT duplicated in ID: 46


In [448]:
empty_id_rows = df[df['id'].isna()]
print(f"Rows where `id` is empty (NaN): {len(empty_id_rows)}")
#empty_id_rows

zero_id_rows = df[df['id'] == 0]
print(f"Rows where `id` is 0: {len(zero_id_rows)}")
zero_id_rows

Rows where `id` is empty (NaN): 0
Rows where `id` is 0: 16


Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
19617,2023-10-16 00:00:00+00:00,@tomekbit ✌️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19618,2023-10-16 00:00:00+00:00,"@MaciejGdynia Maćku, czekam na oficjalne wynik...",0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 2, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19619,2023-10-16 00:00:00+00:00,"@MCichonAlicja Alu, czekamy jeszcze na wynik?",0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19620,2023-10-16 00:00:00+00:00,@REL_76 🥰🥰🥰,0,61552404,,,everyone,"{'retweet_count': 1, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19621,2023-10-16 00:00:00+00:00,@Gidziela 🥰✌️,0,61552404,,,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19622,2023-10-16 00:00:00+00:00,@WHaptar Gratulacje👏🥂,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19623,2023-10-16 00:00:00+00:00,@KapenGenezyp Dziękuję❤️❤️❤️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19624,2023-10-16 21:57:00+00:00,@jasinska_e ❤️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19625,2023-10-16 22:27:00+00:00,@BMikolajewska odpowie💪,0,61552404,,,everyone,"{'retweet_count': 18, 'reply_count': 31, 'like...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19626,2023-10-16 22:41:00+00:00,@DorotaNiedziela ja Tobie też❣️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,


id 0 of a tweet may mean, that the tweet was, changed, deleted, or that it is not a tweet but something else and was wrongly categorized as one

We need to remove duplicate tweets + delete the tweets that were 0; because our custom downloading loop occasionally downloads the same tweet two times to ensure completeness.

In [449]:
import pandas as pd
import numpy as np

# 1) Copy the original DataFrame before cleaning
df_before = df.copy()

# 2) Get the initial size
initial_size = len(df_before)
print(f"Initial number of tweets: {initial_size}")

# 3) Check and report issues with the 'id' column
print("\n--- ID QUALITY CHECK ---")

# Convert id to string for consistent checking
df_before['id'] = df_before['id'].astype(str)

# Check for various problems
missing_ids = df_before['id'].isna()
empty_ids = df_before['id'] == ''
zero_ids = df_before['id'] == '0'
very_short_ids = df_before['id'].str.len() < 5  # Twitter IDs are typically longer

# Report on ID issues
print(f"Missing IDs (NaN): {missing_ids.sum()} ({missing_ids.mean():.2%})")
print(f"Empty IDs: {empty_ids.sum()} ({empty_ids.mean():.2%})")
print(f"Zero IDs ('0'): {zero_ids.sum()} ({zero_ids.mean():.2%})")
print(f"Very short IDs (< 5 chars): {very_short_ids.sum()} ({very_short_ids.mean():.2%})")

# Create a mask for all problematic IDs
problematic_ids_mask = missing_ids | empty_ids | zero_ids | very_short_ids

# Report total problematic IDs
print(f"Total problematic IDs: {problematic_ids_mask.sum()} ({problematic_ids_mask.mean():.2%})")

# 4) First filter out problematic IDs from the original dataset
df_no_problems = df_before[~problematic_ids_mask].copy()
problematic_removed = initial_size - len(df_no_problems)

# 5) Then remove duplicates from the dataset without problematic IDs
df_after = df_no_problems.drop_duplicates(subset=['id'])
duplicates_removed = len(df_no_problems) - len(df_after)

# 6) Calculate removed counts and percentages
remaining_final = len(df_after)
total_removed = initial_size - remaining_final

duplicate_percentage = (duplicates_removed / initial_size) * 100
problematic_percentage = (problematic_removed / initial_size) * 100
total_removed_percentage = (total_removed / initial_size) * 100
remaining_percentage = (remaining_final / initial_size) * 100

# 7) Print comprehensive results
print("\n--- CLEANING SUMMARY ---")
print(f"Initial tweets: {initial_size}")
print(f"Problematic ID tweets removed: {problematic_removed} ({problematic_percentage:.2f}%)")
print(f"Duplicate tweets removed: {duplicates_removed} ({duplicate_percentage:.2f}%)")
print(f"Total tweets removed: {total_removed} ({total_removed_percentage:.2f}%)")
print(f"Tweets remaining: {remaining_final} ({remaining_percentage:.2f}%)")

# 8) Show sample of problematic IDs
if problematic_ids_mask.sum() > 0:
    print("\nSample of problematic IDs:")
    sample_problematic = df_before[problematic_ids_mask].head(5)
    for i, (idx, row) in enumerate(sample_problematic.iterrows()):
        print(f"  {i+1}. ID: '{row['id']}', Text: '{row['text'][:50]}...'")

# 9) Identify the actual duplicate IDs from the data without problematic IDs
duplicate_ids = df_no_problems[df_no_problems.duplicated(subset=['id'], keep='first')]['id'].unique().tolist()
print(f"\nNumber of unique duplicate IDs: {len(duplicate_ids)}")
if duplicate_ids:
    print("Sample of duplicate IDs (first 5):")
    for i, dup_id in enumerate(duplicate_ids[:5]):
        print(f"  {i+1}. {dup_id}")
else:
    print("No duplicates found")

# 10) Keep df_after as the new df
df = df_after
print(f"\nFinal clean dataframe shape: {df.shape}")

# 11) Verify no problematic IDs remain
if (df['id'] == '0').sum() > 0 or df['id'].isna().sum() > 0 or (df['id'] == '').sum() > 0 or (df['id'].str.len() < 5).sum() > 0:
    print("WARNING: Some problematic IDs still remain in the cleaned dataframe")
else:
    print("SUCCESS: All problematic IDs have been removed")

Initial number of tweets: 26677

--- ID QUALITY CHECK ---
Missing IDs (NaN): 0 (0.00%)
Empty IDs: 0 (0.00%)
Zero IDs ('0'): 16 (0.06%)
Very short IDs (< 5 chars): 16 (0.06%)
Total problematic IDs: 16 (0.06%)

--- CLEANING SUMMARY ---
Initial tweets: 26677
Problematic ID tweets removed: 16 (0.06%)
Duplicate tweets removed: 114 (0.43%)
Total tweets removed: 130 (0.49%)
Tweets remaining: 26547 (99.51%)

Sample of problematic IDs:
  1. ID: '0', Text: '@tomekbit ✌️...'
  2. ID: '0', Text: '@MaciejGdynia Maćku, czekam na oficjalne wyniki, ż...'
  3. ID: '0', Text: '@MCichonAlicja Alu, czekamy jeszcze na wynik?...'
  4. ID: '0', Text: '@REL_76 🥰🥰🥰...'
  5. ID: '0', Text: '@Gidziela 🥰✌️...'

Number of unique duplicate IDs: 113
Sample of duplicate IDs (first 5):
  1. 1762109123800293376
  2. 1714195119706890496
  3. 1794299341743829248
  4. 1780108572161945856
  5. 1778400392470056960

Final clean dataframe shape: (26547, 20)
SUCCESS: All problematic IDs have been removed


In [450]:
# 1) How many total rows have a duplicate 'id' (including the first occurrence)?
total_dup_rows = df.duplicated(subset=['id'], keep=False).sum()
print(f"Total rows that share a duplicate ID (including the first occurrence): {total_dup_rows}")

# 2) How many rows are "extra" duplicates beyond the first?
extra_dup_rows = df.duplicated(subset=['id'], keep='first').sum()
print(f"Number of extra duplicates beyond the first occurrence: {extra_dup_rows}")

# 3) How many unique IDs appear more than once?
duplicate_ids = df[df.duplicated(subset=['id'], keep=False)]['id'].unique()
num_duplicate_ids = len(duplicate_ids)
print(f"Number of unique IDs that are duplicated: {num_duplicate_ids}")

Total rows that share a duplicate ID (including the first occurrence): 0
Number of extra duplicates beyond the first occurrence: 0
Number of unique IDs that are duplicated: 0


In [451]:
df.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],1765380709114409216,Original,,,Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],1764245727646044160,Original,{'media_keys': ['3_1764245720805040128']},"{'urls': [{'start': 57, 'end': 80, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],1763931255521857792,Original,{'media_keys': ['7_1763931103151202304']},"{'urls': [{'start': 135, 'end': 158, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],1763576337078313216,Original,{'media_keys': ['3_1763576329385967617']},"{'urls': [{'start': 64, 'end': 87, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],1763173358068281600,Original,{'media_keys': ['3_1763173351466401792']},"{'urls': [{'start': 128, 'end': 151, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,


In [452]:
# Get the value counts of the 'category' column
category_counts = df['category'].value_counts()

# Display the counts
print(category_counts)

# Get the number of unique categories
unique_category_count = category_counts.count()
print(f"Number of unique categories: {unique_category_count}")

category
Original    17595
Reply        5640
Quote        2826
Retweet       486
Name: count, dtype: int64
Number of unique categories: 4


We need to delete retweets because they are wrongly provided by the X API. We want to analyze only original tweets, replies, and quotes.

In [453]:
df = df[df['category'] != 'Retweet']

In [454]:
df

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],1765380709114409216,Original,,,Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],1764245727646044160,Original,{'media_keys': ['3_1764245720805040128']},"{'urls': [{'start': 57, 'end': 80, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],1763931255521857792,Original,{'media_keys': ['7_1763931103151202304']},"{'urls': [{'start': 135, 'end': 158, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],1763576337078313216,Original,{'media_keys': ['3_1763576329385967617']},"{'urls': [{'start': 64, 'end': 87, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],1763173358068281600,Original,{'media_keys': ['3_1763173351466401792']},"{'urls': [{'start': 128, 'end': 151, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26671,2023-10-24 10:46:22+00:00,Formacje demokratyczne będą tworzyć przyszły r...,1716768088052359424,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 344, 'reply_count': 2013, 'l...",pl,[1716768088052359454],1716768088052359424,Original,{'media_keys': ['3_1716768082402664449']},"{'urls': [{'start': 145, 'end': 168, 'url': 'h...",KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,,,
26672,2023-10-19 18:05:26+00:00,"Żadne podchody, kłamstwa czy medialne wrzutki ...",1715066644018381312,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 1149, 'reply_count': 2406, '...",pl,[1715066644018381294],1715066644018381312,Original,,,KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,,,
26673,2023-10-19 08:52:05+00:00,"Polska potrzebuje nowego, demokratycznego rząd...",1714927388637696256,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 185, 'reply_count': 1108, 'l...",pl,[1714927388637696291],1714927388637696256,Original,,,KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,,,
26674,2023-10-18 16:12:37+00:00,95 lat temu zmarł generał Tadeusz Jordan Rozwa...,1714675864237924864,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 110, 'reply_count': 354, 'li...",pl,[1714675864237924823],1714675864237924864,Original,{'media_keys': ['3_1714675857548009473']},"{'urls': [{'start': 264, 'end': 287, 'url': 'h...",KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",,


In [455]:
# Update the 'username' column to keep only the string until '_2' -> split to date range

#df['username'] = df['username'].str.split('_2').str[0].copy()
df.loc[:, 'username'] = df['username'].str.split('_2').str[0]

In [456]:
category_summary = df['category'].value_counts()
print(category_summary)
total_tweets = category_summary.sum()
print(f"Total tweets: {total_tweets}")

category
Original    17595
Reply        5640
Quote        2826
Name: count, dtype: int64
Total tweets: 26061


In [457]:
# Ensure the created_at column is in datetime format

#df['created_at'] = pd.to_datetime(df['created_at'])
df.loc[:, 'created_at'] = pd.to_datetime(df['created_at'])

In [458]:
df.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],1765380709114409216,Original,,,Iwaszkiewicz_RJ,Konfederacja,,,,
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],1764245727646044160,Original,{'media_keys': ['3_1764245720805040128']},"{'urls': [{'start': 57, 'end': 80, 'url': 'htt...",Iwaszkiewicz_RJ,Konfederacja,,,,
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],1763931255521857792,Original,{'media_keys': ['7_1763931103151202304']},"{'urls': [{'start': 135, 'end': 158, 'url': 'h...",Iwaszkiewicz_RJ,Konfederacja,,,,
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],1763576337078313216,Original,{'media_keys': ['3_1763576329385967617']},"{'urls': [{'start': 64, 'end': 87, 'url': 'htt...",Iwaszkiewicz_RJ,Konfederacja,,,,
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],1763173358068281600,Original,{'media_keys': ['3_1763173351466401792']},"{'urls': [{'start': 128, 'end': 151, 'url': 'h...",Iwaszkiewicz_RJ,Konfederacja,,,,


In [459]:
df.loc[1, 'text']

'Zwróćcie uwagę na punkt 11.\nTu nie ma z czego się śmiać… https://t.co/I5KaMsUDzo'

Emojis handler

In [460]:
def add_space_around_emojis(text):
    return ''.join(f' {char} ' if char in emoji.EMOJI_DATA or re.match(r'[\U0001F1E6-\U0001F1FF]', char) else char for char in text)

df['text'] = df['text'].apply(add_space_around_emojis)

def clean_text(text):
    mentions = re.findall(r'@\w+', text)
    text = re.sub(r'@\w+', '', text)
    links = re.findall(r'http\S+', text)
    text = re.sub(r'http\S+', '', text)
    hashtags = re.findall(r'#\w+', text)
    text = re.sub(r'(?<!\s)([\U0001F600-\U0001F64F])', r' \1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F])(?!\s)', r'\1 ', text)
    return [text, mentions, links, hashtags]

df[['text_clean', 'mentions', 'links', 'hashtags']] = pd.DataFrame(df['text'].apply(clean_text).tolist(), index=df.index)

In [461]:
import pandas as pd
pd.options.mode.chained_assignment = None  # Turn off the warning from lack of loc

In [462]:
df.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,...,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo,text_clean,mentions,links,hashtags
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],...,Iwaszkiewicz_RJ,Konfederacja,,,,,"Policja rzucająca kostką brukową w rolników, p...",[],[],[]
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],...,Iwaszkiewicz_RJ,Konfederacja,,,,,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,[],[https://t.co/I5KaMsUDzo],[]
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],...,Iwaszkiewicz_RJ,Konfederacja,,,,,"Równość, ale nie dla wszystkich… \nPani „minis...",[],[https://t.co/PyKYCMvSN3],[]
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],...,Iwaszkiewicz_RJ,Konfederacja,,,,,Pytanie za sto punktów! \nFirma jakiego państw...,[],[https://t.co/g61Cb5BsIx],[]
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],...,Iwaszkiewicz_RJ,Konfederacja,,,,,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,[],[https://t.co/b0Q4Tt1FJW],[]


In [463]:
df.drop(columns=['entities'], inplace=True)

In [464]:
# Some additioanl numerical data from tweets is extracted and added to the dataframe as new variables, then the original column is dropped
df['retweet_count'] = df['public_metrics'].apply(lambda x: x['retweet_count'])
df['reply_count'] = df['public_metrics'].apply(lambda x: x['reply_count'])
df['like_count'] = df['public_metrics'].apply(lambda x: x['like_count'])
df['quote_count'] = df['public_metrics'].apply(lambda x: x['quote_count'])
df['impression_count'] = df['public_metrics'].apply(lambda x: x['impression_count'])

df.drop(columns=['public_metrics'], inplace=True)

In [465]:
df

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,lang,edit_history_tweet_ids,conversation_id,...,geo,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1765380709114409272],1765380709114409216,...,,"Policja rzucająca kostką brukową w rolników, p...",[],[],[],17,4,203,0,12028
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1764245727646044196],1764245727646044160,...,,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,[],[https://t.co/I5KaMsUDzo],[],0,0,2,0,275
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763931255521857839],1763931255521857792,...,,"Równość, ale nie dla wszystkich… \nPani „minis...",[],[https://t.co/PyKYCMvSN3],[],0,0,0,0,188
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763576337078313201],1763576337078313216,...,,Pytanie za sto punktów! \nFirma jakiego państw...,[],[https://t.co/g61Cb5BsIx],[],0,0,1,0,170
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763173358068281524],1763173358068281600,...,,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,[],[https://t.co/b0Q4Tt1FJW],[],0,0,0,0,124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26671,2023-10-24 10:46:22+00:00,Formacje demokratyczne będą tworzyć przyszły r...,1716768088052359424,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,pl,[1716768088052359454],1716768088052359424,...,,Formacje demokratyczne będą tworzyć przyszły r...,[],[https://t.co/XFrtPBIMiv],[],344,2013,4637,100,330379
26672,2023-10-19 18:05:26+00:00,"Żadne podchody, kłamstwa czy medialne wrzutki ...",1715066644018381312,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,pl,[1715066644018381294],1715066644018381312,...,,"Żadne podchody, kłamstwa czy medialne wrzutki ...",[],[],[],1149,2406,12760,182,689059
26673,2023-10-19 08:52:05+00:00,"Polska potrzebuje nowego, demokratycznego rząd...",1714927388637696256,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,pl,[1714927388637696291],1714927388637696256,...,,"Polska potrzebuje nowego, demokratycznego rząd...",[],[],[],185,1108,3443,47,385751
26674,2023-10-18 16:12:37+00:00,95 lat temu zmarł generał Tadeusz Jordan Rozwa...,1714675864237924864,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,pl,[1714675864237924823],1714675864237924864,...,,95 lat temu zmarł generał Tadeusz Jordan Rozwa...,[],[https://t.co/TZaorUiXQ1],[],110,354,1568,21,205237


In [466]:
df.dtypes

created_at                datetime64[ns, UTC]
text                                   object
id                                     object
author_id                             float64
edit_controls                          object
possibly_sensitive                     object
reply_settings                         object
lang                                   object
edit_history_tweet_ids                 object
conversation_id                       float64
category                               object
attachments                            object
username                               object
party                                  object
in_reply_to_user_id                   float64
context_annotations                    object
referenced_tweets                      object
geo                                    object
text_clean                             object
mentions                               object
links                                  object
hashtags                          

In [468]:
df.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,lang,edit_history_tweet_ids,conversation_id,...,geo,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1765380709114409272],1765380709114409216,...,,"Policja rzucająca kostką brukową w rolników, p...",[],[],[],17,4,203,0,12028
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1764245727646044196],1764245727646044160,...,,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,[],[https://t.co/I5KaMsUDzo],[],0,0,2,0,275
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763931255521857839],1763931255521857792,...,,"Równość, ale nie dla wszystkich… \nPani „minis...",[],[https://t.co/PyKYCMvSN3],[],0,0,0,0,188
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763576337078313201],1763576337078313216,...,,Pytanie za sto punktów! \nFirma jakiego państw...,[],[https://t.co/g61Cb5BsIx],[],0,0,1,0,170
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763173358068281524],1763173358068281600,...,,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,[],[https://t.co/b0Q4Tt1FJW],[],0,0,0,0,124


In [469]:
df.dtypes

created_at                datetime64[ns, UTC]
text                                   object
id                                     object
author_id                             float64
edit_controls                          object
possibly_sensitive                     object
reply_settings                         object
lang                                   object
edit_history_tweet_ids                 object
conversation_id                       float64
category                               object
attachments                            object
username                               object
party                                  object
in_reply_to_user_id                   float64
context_annotations                    object
referenced_tweets                      object
geo                                    object
text_clean                             object
mentions                               object
links                                  object
hashtags                          

In [471]:
import pandas as pd

# Step 1: Check for duplicate columns and remove them
if df.columns.duplicated().any():
    print("Duplicate columns found! Removing them...")
    df_no_duplicates = df.loc[:, ~df.columns.duplicated()]  # Keep the first occurrence of each column
else:
    df_no_duplicates = df.copy()

# Step 2: Convert 'id' column to string (if needed)
df_no_duplicates['id'] = df_no_duplicates['id'].astype(str)

# Step 3: Check for missing or empty values in 'text' and 'text_clean'
empty_text = df_no_duplicates[df_no_duplicates['text'].isna() | (df_no_duplicates['text'].astype(str).str.strip() == '')]
empty_text_clean = df_no_duplicates[df_no_duplicates['text_clean'].isna() | (df_no_duplicates['text_clean'].astype(str).str.strip() == '')]

print(f"Rows where 'text' is empty or null: {empty_text.shape[0]}")
print(empty_text[['id', 'text', 'text_clean']].head())

print(f"\nRows where 'text_clean' is empty or null: {empty_text_clean.shape[0]}")
empty_text_clean[['id', 'text', 'text_clean']].head()


Rows where 'text' is empty or null: 0
Empty DataFrame
Columns: [id, text, text_clean]
Index: []

Rows where 'text_clean' is empty or null: 374


Unnamed: 0,id,text,text_clean
27,1844711276577964544,@Nowa_Nadzieja_ @KONFEDERACJA_,
103,1838621761090285568,@KONFEDERACJA_ @Nowa_Nadzieja_,
265,1821520992629305600,https://t.co/H9BQbYjylo,
606,1768213892272959744,@MPerspektywa @AdamAbramowicz1 https://t.co/bz...,
797,1733398115917402624,https://t.co/uXKjbwD1DQ,


saving data used for translation 

In [472]:

df_clean_text = df[['id', 'text', 'text_clean']]

df_clean_text.to_csv('data/02.processed/data_for_translation.csv', index=False)
df.to_csv('data/02.processed/whole_dataset_for_translation.csv', index=False)

In [473]:
df_clean_text.dtypes

id            object
text          object
text_clean    object
dtype: object

reading data used for translation

In [474]:
# Read CSV with ID column as string (text)
df_clean_text = pd.read_csv('Data/02.processed/data_for_translation.csv', dtype={'id': str})

# Verify the column type
print("ID column type:", df_clean_text['id'].dtype)
print("Sample ID:", df_clean_text['id'].iloc[0], "of type", type(df_clean_text['id'].iloc[0]))

ID column type: object
Sample ID: 1765380709114409216 of type <class 'str'>


In [475]:
df_clean_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26061 entries, 0 to 26060
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          26061 non-null  object
 1   text        26061 non-null  object
 2   text_clean  25848 non-null  object
dtypes: object(3)
memory usage: 610.9+ KB


In [476]:
df_clean_text

Unnamed: 0,id,text,text_clean
0,1765380709114409216,"Policja rzucająca kostką brukową w rolników, p...","Policja rzucająca kostką brukową w rolników, p..."
1,1764245727646044160,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...
2,1763931255521857792,"Równość, ale nie dla wszystkich… \nPani „minis...","Równość, ale nie dla wszystkich… \nPani „minis..."
3,1763576337078313216,Pytanie za sto punktów! \nFirma jakiego państw...,Pytanie za sto punktów! \nFirma jakiego państw...
4,1763173358068281600,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,Ukraina jeździ na polskim paliwie. \n\nPrzypom...
...,...,...,...
26056,1716768088052359424,Formacje demokratyczne będą tworzyć przyszły r...,Formacje demokratyczne będą tworzyć przyszły r...
26057,1715066644018381312,"Żadne podchody, kłamstwa czy medialne wrzutki ...","Żadne podchody, kłamstwa czy medialne wrzutki ..."
26058,1714927388637696256,"Polska potrzebuje nowego, demokratycznego rząd...","Polska potrzebuje nowego, demokratycznego rząd..."
26059,1714675864237924864,95 lat temu zmarł generał Tadeusz Jordan Rozwa...,95 lat temu zmarł generał Tadeusz Jordan Rozwa...


In [477]:
# Filter rows where 'text_clean' is null OR empty (after stripping whitespace)
null_or_empty_text_clean = df_clean_text[
    df_clean_text['text_clean'].isna() | 
    (df_clean_text['text_clean'].astype(str).str.strip() == '')
]

# Display the number of problematic rows
print(f"Rows where 'text_clean' is null or empty: {null_or_empty_text_clean.shape[0]}")

# Show the affected rows
null_or_empty_text_clean[['id', 'text', 'text_clean']]

Rows where 'text_clean' is null or empty: 374


Unnamed: 0,id,text,text_clean
26,1844711276577964544,@Nowa_Nadzieja_ @KONFEDERACJA_,
102,1838621761090285568,@KONFEDERACJA_ @Nowa_Nadzieja_,
264,1821520992629305600,https://t.co/H9BQbYjylo,
605,1768213892272959744,@MPerspektywa @AdamAbramowicz1 https://t.co/bz...,
796,1733398115917402624,https://t.co/uXKjbwD1DQ,
...,...,...,...
24206,1791516338030051840,https://t.co/63YfThCg0s,
24380,1725621854751047936,@bmarganiec @szymon_holownia https://t.co/5SI3...,
24381,1725621750354923776,@MartaWitecka1 @szymon_holownia https://t.co/o...,
24698,1846211673155162368,https://t.co/fYTVm6hzqS,


In [479]:
# 1. Print the total number of rows in df_clean_text
print("Total rows in df_clean_text:", len(df_clean_text))

# 2. Filter out rows where 'text_clean' is null or an empty string (after stripping whitespace)
valid_rows = df_clean_text[
    ~(
        df_clean_text['text_clean'].isna() 
        | (df_clean_text['text_clean'].astype(str).str.strip() == '')
    )
]

# 3. Print the number of those valid (non-empty) rows
print("Rows with non-empty 'text_clean':", len(valid_rows))

Total rows in df_clean_text: 26061
Rows with non-empty 'text_clean': 25687


reading translation dataset 

In [480]:
df_en_text = pd.read_csv('data/02.processed/tweets_translation/text_clean_en1.csv')
df_en_text


Unnamed: 0,id,text_clean,text_clean_en
0,1846086999964283136,❌ Rząd polski zamierza budować w Polsce 49 C...,❌ The Polish government intends to build 49 F...
1,1845748090461966592,❌ Szambo wybija i robi się coraz ciekawiej. ...,❌ The cesspool is breaking out and it's getti...
2,1845366606823657984,"❌ NIE ROZUMIEM, JAK MOŻNA KRZYWDZIĆ W TEN SP...",❌ I DON'T UNDERSTAND HOW YOU CAN HURT YOUR OW...
3,1845006197847360000,🆘 Firma farmaceutyczna GSK zapłaci ponad 2 ...,🆘 The pharmaceutical company GSK will pay ove...
4,1844633149784891648,❌ O CO TUTAJ CHODZI? W październiku 2024 r. ...,"❌ WHAT IS GOING ON HERE? In October 2024, her..."
...,...,...,...
25683,1717754912929435904,"Gdyby coś się zmieniło, jestem do dyspozycji ...","If anything changes, I am at your disposal 😄 👍"
25684,1717638951333245440,Zaświadczenie o wyborze na Posła na Sejm RP 🇵...,I also have a certificate of election as a Mem...
25685,1717063007988052224,Ceny paliw na stacjach Orlen rosną w szybkim t...,Fuel prices at Orlen stations are rising rapid...
25686,1716417809469538560,Taki mandat to ja rozumiem 😃 \nDziękuję za każ...,I understand this mandate 😃 \nThank you for ev...


In [552]:
# to correction later on - delete it after correction
df_en_text_v2 = pd.read_csv('data/02.processed/tweets_translation/data-for-translation_v2.csv')
df_en_text_v2

df_en_text_v2['id'] = df_en_text_v2['id'].astype(str)
df_en_text_v2 = df_en_text_v2[['id', 'text_clean', 'text_clean_en']].copy()

In [553]:
print(df_en_text.shape[1])

print(df.shape[1])

print(df_en_text_v2.shape[1])

3
27
3


In [554]:
df_en_text.dtypes

id               object
text_clean       object
text_clean_en    object
dtype: object

In [555]:
df_en_text_v2.dtypes

id               object
text_clean       object
text_clean_en    object
dtype: object

In [556]:
print(len(df_clean_text))
df_clean_text[df_clean_text["id"]=="1765380709114409216"]

26061


Unnamed: 0,id,text,text_clean
0,1765380709114409216,"Policja rzucająca kostką brukową w rolników, p...","Policja rzucająca kostką brukową w rolników, p..."


In [557]:
print(len(df))
df[df["id"]=="1765380709114409216"]

26061


Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,lang,edit_history_tweet_ids,conversation_id,...,geo,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1765380709114409272],1765380709114409216,...,,"Policja rzucająca kostką brukową w rolników, p...",[],[],[],17,4,203,0,12028


In [567]:
print(len(df_en_text_v2))
df_en_text_v2 #1765380709114409216 #1713706323672539392

30


Unnamed: 0,id,text_clean,text_clean_en
0,1807795860480160000,🇳 🇱 Holenderski klub NAC Breda przygotował ...,🇳 🇱 The Dutch club NAC Breda has prepared spe...
1,1729073656112800000,Serdecznie zapraszam na spotkanie z redaktorem...,I cordially invite you to a meeting with edito...
2,1722289952547840000,Zapraszam na spotkanie członków i sympatyków #...,I invite you to a meeting of members and suppo...
3,1820187117135520000,"Jak podaje NASK: około 15 tysięcy „trolli”, od...","According to NASK: about 15,000 ""trolls"" have ..."
4,1783776678776320000,Tak.,@DonBrajo @krzysztofbosak @szymon_holownia Yes.
5,1747645626219680000,"Jeśli się boisz, już jesteś niewolnikiem!","If you are afraid, you are already a slave! ht..."
6,1735658052001440000,Obrzydliwy list Episkopatu w sprawie krajowego...,A disgusting letter from the Episcopate regard...
7,1798344610043040000,Gdybym głosowała w okręgu łączącym województwo...,If I voted in the district connecting the West...
8,1757761149565600000,Zmiana definicji zgwałcenia wraca do razem z ...,Changing the definition of rape returns to @Se...
9,1788269097861440000,Powodzenia!!!!,@magdadropek @__Lewica @fraczekkk @g_garbolins...


merging second version of translated dataset with original one

In [564]:
# Step 1: Make sure IDs are strings
df['id'] = df['id'].astype(str)
df_en_text['id'] = df_en_text['id'].astype(str)
df_en_text_v2['id'] = df_en_text_v2['id'].astype(str)

# Step 2: Combine the two translation sets (v1 + v2)
df_en_combined = pd.concat([df_en_text, df_en_text_v2], ignore_index=True)

# Step 3: Drop duplicates by 'id' to keep only the latest (e.g., if v2 is more up-to-date)
df_en_combined = df_en_combined.drop_duplicates(subset='id', keep='last')

# Step 4: Merge back into the full dataset to get a unified view
df_merged = df.merge(df_en_combined[['id', 'text_clean_en']], on='id', how='left')

print(f"Total rows after merge: {len(df_merged)} (should be 26061)")


Total rows after merge: 26061 (should be 26061)


check wether the data went correctly

In [568]:
df_merged[df_merged["id"]=="1807795860480160000"]

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,lang,edit_history_tweet_ids,conversation_id,...,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en
420,2024-07-01 15:18:15+00:00,🇳 🇱 Holenderski klub NAC Breda przygotował ...,1807795860480160000,1182211615,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1807795860480159980],1807795860480160000,...,🇳 🇱 Holenderski klub NAC Breda przygotował ...,[],"[https://t.co/WI736ocWQ7, https://t.co/ANX3x4e...",[],5,1,24,0,603,🇳 🇱 The Dutch club NAC Breda has prepared spe...


In [569]:
# Check how many rows still have missing or empty translations
missing_translation_mask = df_merged['text_clean_en'].isna() | (df_merged['text_clean_en'].str.strip() == '')

# Show some of them
df_missing_translation = df_merged[missing_translation_mask]
print(f"Rows without translation: {df_missing_translation.shape[0]}")
display(df_missing_translation[['id', 'text', 'text_clean', 'text_clean_en']].head())


Rows without translation: 374


Unnamed: 0,id,text,text_clean,text_clean_en
26,1844711276577964544,@Nowa_Nadzieja_ @KONFEDERACJA_,,
102,1838621761090285568,@KONFEDERACJA_ @Nowa_Nadzieja_,,
264,1821520992629305600,https://t.co/H9BQbYjylo,,
605,1768213892272959744,@MPerspektywa @AdamAbramowicz1 https://t.co/bz...,,
796,1733398115917402624,https://t.co/uXKjbwD1DQ,,


removing rows withtout translation due to possessing text that is not being analyzed by our research

In [None]:
# Remove them
df_clean_translated = df_merged[~missing_translation_mask].copy()

print(f"Remaining rows with proper translation: {df_clean_translated.shape[0]}")

Remaining rows with proper translation: 25687


In [None]:
df_clean_translated.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,lang,edit_history_tweet_ids,conversation_id,...,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1765380709114409272],1765380709114409216,...,"Policja rzucająca kostką brukową w rolników, p...",[],[],[],17,4,203,0,12028,"Police throwing paving stones at farmers, MP s..."
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1764245727646044196],1764245727646044160,...,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,[],[https://t.co/I5KaMsUDzo],[],0,0,2,0,275,Please pay attention to point 11.\nThere's not...
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763931255521857839],1763931255521857792,...,"Równość, ale nie dla wszystkich… \nPani „minis...",[],[https://t.co/PyKYCMvSN3],[],0,0,0,0,188,"Equality, but not for everyone... \nThe ""minis..."
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763576337078313201],1763576337078313216,...,Pytanie za sto punktów! \nFirma jakiego państw...,[],[https://t.co/g61Cb5BsIx],[],0,0,1,0,170,One hundred point question! \nWhich country's ...
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763173358068281524],1763173358068281600,...,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,[],[https://t.co/b0Q4Tt1FJW],[],0,0,0,0,124,Ukraine runs on Polish fuel. \n\nI would like ...


In [595]:
df_clean_translated.to_csv('data/02.processed/df_clean_translated_further_analalysis.csv', index=False)

In [603]:
def count_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & pictographs
        "\U0001F680-\U0001F6FF"  # Transport & map symbols
        "\U0001F700-\U0001F77F"  # Alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric shapes
        "\U0001F800-\U0001F8FF"  # Supplemental arrows
        "\U0001F900-\U0001F9FF"  # Supplemental symbols and pictographs
        "\U0001FA00-\U0001FA6F"  # Chess symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and pictographs extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "]+",
        flags=re.UNICODE,
    )
    return len(emoji_pattern.findall(text))


In [604]:
# Demojize text columns
df_clean_translated['text_clean_en_demojized'] = df_clean_translated['text_clean_en'].apply(
    lambda x: emoji.demojize(str(x)) if pd.notnull(x) else x
)
df_clean_translated['text_clean_demojized'] = df_clean_translated['text_clean'].apply(
    lambda x: emoji.demojize(str(x)) if pd.notnull(x) else x
)

# Count emojis in original text columns
df_clean_translated['emoji_count_en'] = df_clean_translated['text_clean_en'].apply(
    lambda x: count_emojis(str(x)) if pd.notnull(x) else 0
)
df_clean_translated['emoji_count'] = df_clean_translated['text_clean'].apply(
    lambda x: count_emojis(str(x)) if pd.notnull(x) else 0
)


In [605]:
# Total number of rows
total_rows = len(df_clean_translated)

# Rows with emojis in 'text_clean_en'
rows_with_emojis_en = df_clean_translated[df_clean_translated['emoji_count_en'] > 0].shape[0]

# Rows with emojis in 'text_clean'
rows_with_emojis = df_clean_translated[df_clean_translated['emoji_count'] > 0].shape[0]

# Display statistics
print(f"Total number of rows: {total_rows}")
print(f"Rows with emojis in 'text_clean_en': {rows_with_emojis_en} ({(rows_with_emojis_en/total_rows)*100:.2f}%)")
print(f"Rows with emojis in 'text_clean': {rows_with_emojis} ({(rows_with_emojis/total_rows)*100:.2f}%)")


Total number of rows: 25687
Rows with emojis in 'text_clean_en': 9631 (37.49%)
Rows with emojis in 'text_clean': 9817 (38.22%)


In [606]:
df_clean_translated[['text_clean_en', 'text_clean_en_demojized', 'emoji_count_en', 'text_clean', 'text_clean_demojized', 'emoji_count']].head()


Unnamed: 0,text_clean_en,text_clean_en_demojized,emoji_count_en,text_clean,text_clean_demojized,emoji_count
0,"Police throwing paving stones at farmers, MP s...","Police throwing paving stones at farmers, MP s...",0,"Policja rzucająca kostką brukową w rolników, p...","Policja rzucająca kostką brukową w rolników, p...",0
1,Please pay attention to point 11.\nThere's not...,Please pay attention to point 11.\nThere's not...,0,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,0
2,"Equality, but not for everyone... \nThe ""minis...","Equality, but not for everyone... \nThe ""minis...",0,"Równość, ale nie dla wszystkich… \nPani „minis...","Równość, ale nie dla wszystkich… \nPani „minis...",0
3,One hundred point question! \nWhich country's ...,One hundred point question! \nWhich country's ...,1,Pytanie za sto punktów! \nFirma jakiego państw...,Pytanie za sto punktów! \nFirma jakiego państw...,1
4,Ukraine runs on Polish fuel. \n\nI would like ...,Ukraine runs on Polish fuel. \n\nI would like ...,0,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,0


In [607]:
# Filter rows
rows_with_emojis_in_text_clean_only = df_clean_translated[
    (df_clean_translated['emoji_count'] > 0) & (df_clean_translated['emoji_count_en'] == 0)
]

# Display the number of such rows
print(f"Number of rows with emojis in 'text_clean' but not in 'text_clean_en': {len(rows_with_emojis_in_text_clean_only)}")

# Display the affected rows
rows_with_emojis_in_text_clean_only[['text_clean', 'text_clean_en']]


Number of rows with emojis in 'text_clean' but not in 'text_clean_en': 186


Unnamed: 0,text_clean,text_clean_en
466,"Mamy Państwo z dykty, a Kosiniak-Kamysz natych...","We are out of business, and Kosiniak-Kamysz sh..."
602,Dyrektywa budynkowa przyjęta! Przymusowe remon...,The Building Directive has been adopted! Compu...
700,"Gdzie tu sens, gdzie logika ⁉ ️","Where is the sense, where is the logic?"
2082,Tylko głosowała w całości przeciwko dalszemu...,Only she voted entirely against further social...
2455,Recepta na problemy na polskiej granicy jest b...,The solution to problems at the Polish border ...
...,...,...
25403,"Dziś, w Pałacu Prezydenckim odbyła się uroczys...","Today, the ceremony of awarding nominations to..."
25744,W przeddzień święta Wojska Polskiego składamy ...,"On the eve of the Polish Army Day, we pay trib..."
25873,20 lat temu Polska wstąpiła do Unii Europejski...,"20 years ago, Poland joined the European Union..."
25897,Możliwa eskalacja sytuacji na Bliskim Wschodzi...,The possible escalation of the situation in th...


In [602]:
df_clean_translated['text_clean_en_demojized'] = df_clean_translated['text_clean_en'].apply(lambda x: emoji.demojize(str(x)) if pd.notnull(x) else x)
df_clean_translated['text_clean_demojized'] = df_clean_translated['text_clean'].apply(lambda x: emoji.demojize(str(x)) if pd.notnull(x) else x)

df_clean_translated[['text_clean_en', 'text_clean_en_demojized', 'text_clean', 'text_clean_demojized']].head()

Unnamed: 0,text_clean_en,text_clean_en_demojized,text_clean,text_clean_demojized
0,"Police throwing paving stones at farmers, MP s...","Police throwing paving stones at farmers, MP s...","Policja rzucająca kostką brukową w rolników, p...","Policja rzucająca kostką brukową w rolników, p..."
1,Please pay attention to point 11.\nThere's not...,Please pay attention to point 11.\nThere's not...,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...
2,"Equality, but not for everyone... \nThe ""minis...","Equality, but not for everyone... \nThe ""minis...","Równość, ale nie dla wszystkich… \nPani „minis...","Równość, ale nie dla wszystkich… \nPani „minis..."
3,One hundred point question! \nWhich country's ...,One hundred point question! \nWhich country's ...,Pytanie za sto punktów! \nFirma jakiego państw...,Pytanie za sto punktów! \nFirma jakiego państw...
4,Ukraine runs on Polish fuel. \n\nI would like ...,Ukraine runs on Polish fuel. \n\nI would like ...,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,Ukraina jeździ na polskim paliwie. \n\nPrzypom...


In [608]:
df_clean_translated['possibly_sensitive'] = df_clean_translated['possibly_sensitive'].astype(bool)

In [609]:
username_to_realname = {
    'bartlomiejpejo': 'Bartłomiej Pejo',
    'RobertBiedron': 'Robert Biedroń',
    'PatrykJaki': 'Patryk Jaki',
    'Kpelczynska': 'Katarzyna Pelczyńska',
    'OklaDrewnowicz': 'Agnieszka Okła-Drewnowicz',
    'KosiniakKamysz': 'Władysław Kosiniak-Kamysz',
    'mwojcik_': 'Michał Wójcik',
    'MorawieckiM': 'Mateusz Morawiecki',
    'SlawomirMentzen': 'Sławomir Mentzen',
    'Wlodek_Skalik': 'Włodzimierz Skalik',
    'BeataSzydlo': 'Beata Szydło',
    'WTumanowicz': 'Witold Tumanowicz',
    'KGawkowski': 'Krzysztof Gawkowski',
    'wlodekczarzasty': 'Włodzimierz Czarzasty',
    'Kaminski_M_': 'Mariusz Kamiński',
    'Macierewicz_A': 'Antoni Macierewicz',
    'elzbietawitek': 'Elżbieta Witek',
    'aga_buczynska': 'Agnieszka Buczyńska',
    'szymon_holownia': 'Szymon Hołownia',
    'DorotaNiedziela': 'Dorota Niedziela',
    'EwaKopacz': 'Ewa Kopacz',
    'Leszczyna': 'Izabela Leszczyna',
    'M_K_Blonska': 'Małgorzata Kidawa-Błońska',
    'bbudka': 'Borys Budka',
    'donaldtusk': 'Donald Tusk',
    'DariuszKlimczak': 'Dariusz Klimczak',
    'GrzybAndrzej': 'Andrzej Grzyb',
    'Hetman_K': 'Krzysztof Hetman',
    'JarubasAdam': 'Adam Jarubas',
    'Paslawska': 'Urszula Pasławska',
    'TudujKrzysztof': 'Krzysztof Tuduj',
    'ZalewskiPawel': 'Paweł Zalewski'
}

# Add the 'name' column to the dataframe
df_clean_translated['name'] = df_clean_translated['username'].map(username_to_realname)

In [None]:
# Delete next line sign from the 'text_clean_en' column
df_clean_translated['text_clean_en'] = df_clean_translated['text_clean_en'].str.replace('\n', ' ')

In [611]:
os.getcwd()

'h:\\000_Projects\\01_GitHub\\Twitter'

In [613]:
# Save the DataFrame to a Parquet file
df_clean_translated.to_parquet('data/03.cleaned/df_combined.parquet', index=False)