

,=-# Data cleaning

In this file, we cleaned the downloaded data:
The main steps include:
1. Adding party affiliation to tweet rows
2. Deleting unnecessary downloaded Retweets.
3. Deleting links and mentions from the tweets text and saving them to separate columns
4. Expanding the column of public metrics
5. Encoding emojis in a unified format
6. Translating tweets using Google Translate in Google Sheets
7. Saving all downloaded tweets to one file

### 1.  Used libraries

In [1]:
import os
import pandas as pd
import re
import emoji

### 2. Reading JSON files and transforming them into party-specific pickle files

In [3]:
base_input_path = 'Data/tweets_data_final' # change folder if needed
subfolders = ['Konfederacja', 'NL', 'PiS', 'PO', 'PL2050', 'PSL']
output_folder = 'data/tweets_data_combined'

os.makedirs(output_folder, exist_ok=True)

for subfolder in subfolders:
    folder_path = os.path.join(base_input_path, subfolder)
    dataframes = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            politician = filename.split("_tweets.json")[0]
            try:
                df = pd.read_json(file_path)  
                df["username"] = politician  
                df["party"] = subfolder
                print(f"Read {len(df)} rows from {file_path}")  
                dataframes.append(df)
            except ValueError as e:
                print(f"Error reading {file_path}: {e}")
    
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        
        output_file_path = os.path.join(output_folder, f'{subfolder}_combined.pkl')
        combined_df.to_pickle(output_file_path) 
        
        print(f"Saved {subfolder} combined data to {output_file_path}")

print("Processing complete!")

Read 11 rows from Data/tweets_data_final\Konfederacja\Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json
Read 964 rows from Data/tweets_data_final\Konfederacja\bartlomiejpejo_2023-10-16_2024-10-15.json
Read 1318 rows from Data/tweets_data_final\Konfederacja\KonradBerkowicz_2024-04-16_2024-10-15_vol1.json
Read 597 rows from Data/tweets_data_final\Konfederacja\MichalWawer_2023-10-16_2024-10-15.json
Read 772 rows from Data/tweets_data_final\Konfederacja\MarSypniewski_2023-10-16_2024-10-15.json
Read 175 rows from Data/tweets_data_final\Konfederacja\TudujKrzysztof_2023-10-16_2024-10-15.json
Read 721 rows from Data/tweets_data_final\Konfederacja\SlawomirMentzen_2023-10-16_2024-10-15.json
Read 750 rows from Data/tweets_data_final\Konfederacja\WTumanowicz_2023-10-16_2024-10-15.json
Read 950 rows from Data/tweets_data_final\Konfederacja\Wlodek_Skalik_2023-10-16_2024-10-15.json
Read 289 rows from Data/tweets_data_final\Konfederacja\KonradBerkowicz_2023-10-15_2024-04-16_vol2.json
Read 421 rows from Data/

### 3. Data cleaning

In [4]:
df_konf = pd.read_pickle(os.path.join(output_folder, 'Konfederacja_combined.pkl'))
df_NL = pd.read_pickle(os.path.join(output_folder, 'NL_combined.pkl'))
df_PIS = pd.read_pickle(os.path.join(output_folder, 'PIS_combined.pkl'))
df_PO = pd.read_pickle(os.path.join(output_folder, 'PO_combined.pkl'))
df_PL2050 = pd.read_pickle(os.path.join(output_folder, 'PL2050_combined.pkl'))
df_PSL = pd.read_pickle(os.path.join(output_folder, 'PSL_combined.pkl'))

In [5]:
df_konf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8177 entries, 0 to 8176
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   created_at              8177 non-null   datetime64[ns, UTC]
 1   text                    8177 non-null   object             
 2   id                      8177 non-null   int64              
 3   author_id               8177 non-null   int64              
 4   edit_controls           8177 non-null   object             
 5   possibly_sensitive      8177 non-null   bool               
 6   reply_settings          8177 non-null   object             
 7   public_metrics          8177 non-null   object             
 8   lang                    8177 non-null   object             
 9   edit_history_tweet_ids  8177 non-null   object             
 10  conversation_id         8177 non-null   int64              
 11  category                8177 non-null   obj

In [6]:
df_konf.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],1765380709114409216,Original,,,Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],1764245727646044160,Original,{'media_keys': ['3_1764245720805040128']},"{'urls': [{'start': 57, 'end': 80, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],1763931255521857792,Original,{'media_keys': ['7_1763931103151202304']},"{'urls': [{'start': 135, 'end': 158, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],1763576337078313216,Original,{'media_keys': ['3_1763576329385967617']},"{'urls': [{'start': 64, 'end': 87, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],1763173358068281600,Original,{'media_keys': ['3_1763173351466401792']},"{'urls': [{'start': 128, 'end': 151, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,


In [7]:
# Merge all dataframes into one
df = pd.concat([df_konf, df_NL, df_PIS, df_PO, df_PL2050, df_PSL], ignore_index=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26677 entries, 0 to 26676
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   created_at              26677 non-null  datetime64[ns, UTC]
 1   text                    26677 non-null  object             
 2   id                      26661 non-null  float64            
 3   author_id               26677 non-null  float64            
 4   edit_controls           26661 non-null  object             
 5   possibly_sensitive      26661 non-null  object             
 6   reply_settings          26677 non-null  object             
 7   public_metrics          26677 non-null  object             
 8   lang                    26677 non-null  object             
 9   edit_history_tweet_ids  26661 non-null  object             
 10  conversation_id         26661 non-null  float64            
 11  category                26677 non-null  o

In [9]:
len(df)

26677

In [10]:
pd.options.display.float_format = '{:.0f}'.format
df['id'] = df['id'].fillna(0).astype('int64')
df['id']

0        1765380709114409216
1        1764245727646044160
2        1763931255521857792
3        1763576337078313216
4        1763173358068281600
                ...         
26672    1715066644018381312
26673    1714927388637696256
26674    1714675864237924864
26675    1714370461507748096
26676    1714370461507748096
Name: id, Length: 26677, dtype: int64

In [11]:
non_duplicate_counts = df['id'].nunique()
print(f"Number of duplicate IDs: {non_duplicate_counts}")

duplicate_counts = df['id'].duplicated().sum()
print(f"Number of duplicate IDs: {duplicate_counts}")


Number of duplicate IDs: 26548
Number of duplicate IDs: 129


brief insight into how do these duplicates look like

In [12]:
duplicates = df[df.duplicated(subset=['id'], keep=False)]
duplicates

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
9,2024-02-26 13:35:28+00:00,"Nie ma takiej obietnicy, której polityk nie ob...",1762109123800293376,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1762109123800293457],1762109123800293376,Original,{'media_keys': ['3_1762109117865304064']},"{'urls': [{'start': 80, 'end': 103, 'url': 'ht...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
10,2024-02-26 13:35:28+00:00,"Nie ma takiej obietnicy, której polityk nie ob...",1762109123800293376,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1762109123800293457],1762109123800293376,Original,{'media_keys': ['3_1762109117865304064']},"{'urls': [{'start': 80, 'end': 103, 'url': 'ht...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
973,2023-10-17 08:22:19+00:00,Serdeczne dzięki za każdy głos. 🤝\nDla mnie to...,1714195119706890496,1182211615,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 15, 'reply_count': 28, 'like...",pl,[1714195119706890463],1714195119706890496,Original,{'media_keys': ['3_1714195114472431617']},"{'hashtags': [{'start': 251, 'end': 264, 'tag'...",bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja,,,,{'place_id': '535f0c2de0121451'}
974,2023-10-17 08:22:19+00:00,Serdeczne dzięki za każdy głos. 🤝\nDla mnie to...,1714195119706890496,1182211615,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 15, 'reply_count': 28, 'like...",pl,[1714195119706890463],1714195119706890496,Original,{'media_keys': ['3_1714195114472431617']},"{'urls': [{'start': 275, 'end': 298, 'url': 'h...",bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja,,,,{'place_id': '535f0c2de0121451'}
1974,2024-05-25 09:27:54+00:00,@MKierwinski Za to Wy bronicie ambasadora kraj...,1794299341743829248,1420353350,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 104, 'reply_count': 26, 'lik...",pl,[1794299341743829316],1794294356570264064,Reply,,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",KonradBerkowicz_2024-04-16_2024-10-15_vol1.json,Konfederacja,308367619,,"[{'type': 'replied_to', 'id': '179429435657026...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25856,2023-10-17 20:07:37+00:00,Wszystkim Wam serdecznie dziekuję. Za wsparcie...,1714372612611060224,1201875318,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 10, 'reply_count': 6, 'like_...",pl,[1714372612611060222],1714372612611060224,Original,{'media_keys': ['7_1714372479076954113']},"{'urls': [{'start': 81, 'end': 104, 'url': 'ht...",StefanKrajewski_2023-10-16_2024-10-15.json,PSL,,,,{'place_id': '47c001064da7125c'}
26266,2023-10-16 12:39:29+00:00,.@TakJestTVN24 na antenie @tvn24📺. Zapraszam d...,1713897452430909952,1070635531447558144,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 6, 'reply_count': 1, 'like_c...",pl,[1713897452430909939],1713897452430909952,Original,{'media_keys': ['3_1713897440191885312']},"{'urls': [{'start': 83, 'end': 106, 'url': 'ht...",PZgorzelskiP_2023-10-16_2024-10-15.json,PSL,,,,
26267,2023-10-16 12:39:29+00:00,.@TakJestTVN24 na antenie @tvn24📺. Zapraszam d...,1713897452430909952,1070635531447558144,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 6, 'reply_count': 1, 'like_c...",pl,[1713897452430909939],1713897452430909952,Original,{'media_keys': ['3_1713897440191885312']},"{'urls': [{'start': 83, 'end': 106, 'url': 'ht...",PZgorzelskiP_2023-10-16_2024-10-15.json,PSL,,,,
26675,2023-10-17 19:59:04+00:00,"Mówiliśmy jasno: albo #TrzeciaDroga, albo trze...",1714370461507748096,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 748, 'reply_count': 2136, 'l...",pl,[1714370461507748145],1714370461507748096,Original,,"{'mentions': [{'start': 59, 'end': 68, 'userna...",KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,,,


In [13]:
duplicate_text_count = df['text'].duplicated().sum()
print(f"Number of duplicate Text Entries: {duplicate_text_count}")


Number of duplicate Text Entries: 139


In [14]:
duplicate_id_text_rows = df[df.duplicated(subset=['id', 'text'], keep=False)]
print(f"Rows where BOTH `id` and `text` are duplicated: {len(duplicate_id_text_rows)}")


Rows where BOTH `id` and `text` are duplicated: 227


In [15]:
# Count occurrences of each ID
id_counts = df['id'].value_counts()
print("Distribution of duplicate IDs:")
print(id_counts.value_counts().sort_index())

# Count occurrences of each text
text_counts = df['text'].value_counts()
print("\nDistribution of duplicate Text Entries:")
print(text_counts.value_counts().sort_index())


Distribution of duplicate IDs:
count
1     26434
2       112
3         1
16        1
Name: count, dtype: int64

Distribution of duplicate Text Entries:
count
1    26404
2      130
3        3
4        1
Name: count, dtype: int64


In [16]:
# Get all duplicate ID rows
duplicate_id_rows = df[df.duplicated(subset=['id'], keep=False)]

# Get all duplicate Text rows
duplicate_text_rows = df[df.duplicated(subset=['text'], keep=False)]

# Get rows where both ID and Text are duplicated
duplicate_id_text_rows = df[df.duplicated(subset=['id', 'text'], keep=False)]

# Compare overlaps
print(f"Rows where ID is duplicated: {len(duplicate_id_rows)}")
print(f"Rows where Text is duplicated: {len(duplicate_text_rows)}")
print(f"Rows where BOTH ID and Text are duplicated: {len(duplicate_id_text_rows)}")

# Find duplicate IDs that are NOT in the text duplicate set
id_not_in_text = duplicate_id_rows[~duplicate_id_rows['id'].isin(duplicate_text_rows['id'])]
print(f"\nDuplicate IDs NOT duplicated in Text: {len(id_not_in_text)}")

# Find duplicate Texts that are NOT in the ID duplicate set
text_not_in_id = duplicate_text_rows[~duplicate_text_rows['text'].isin(duplicate_id_rows['text'])]
print(f"Duplicate Texts NOT duplicated in ID: {len(text_not_in_id)}")


Rows where ID is duplicated: 243
Rows where Text is duplicated: 273
Rows where BOTH ID and Text are duplicated: 227

Duplicate IDs NOT duplicated in Text: 16
Duplicate Texts NOT duplicated in ID: 46


In [20]:
id_duplicate_but_different_text = duplicate_id_rows.drop_duplicates(subset=['text'], keep=False)
print(f"Rows where the same `id` has different `text`: {len(id_duplicate_but_different_text)}")
id_duplicate_but_different_text  


Rows where the same `id` has different `text`: 16


Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
19617,2023-10-16 00:00:00+00:00,@tomekbit ✌️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19618,2023-10-16 00:00:00+00:00,"@MaciejGdynia Maćku, czekam na oficjalne wynik...",0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 2, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19619,2023-10-16 00:00:00+00:00,"@MCichonAlicja Alu, czekamy jeszcze na wynik?",0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19620,2023-10-16 00:00:00+00:00,@REL_76 🥰🥰🥰,0,61552404,,,everyone,"{'retweet_count': 1, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19621,2023-10-16 00:00:00+00:00,@Gidziela 🥰✌️,0,61552404,,,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19622,2023-10-16 00:00:00+00:00,@WHaptar Gratulacje👏🥂,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19623,2023-10-16 00:00:00+00:00,@KapenGenezyp Dziękuję❤️❤️❤️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19624,2023-10-16 21:57:00+00:00,@jasinska_e ❤️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19625,2023-10-16 22:27:00+00:00,@BMikolajewska odpowie💪,0,61552404,,,everyone,"{'retweet_count': 18, 'reply_count': 31, 'like...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19626,2023-10-16 22:41:00+00:00,@DorotaNiedziela ja Tobie też❣️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,


In [22]:
text_duplicate_but_different_id = duplicate_text_rows.drop_duplicates(subset=['id'], keep=False)
print(f"Rows where the same `text` has different `id`: {len(text_duplicate_but_different_id)}")
text_duplicate_but_different_id 


Rows where the same `text` has different `id`: 46


Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
103,2024-09-24 16:49:22+00:00,@KONFEDERACJA_ @Nowa_Nadzieja_,1838621761090285568,1182211615,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 1, 'reply_count': 1, 'like_c...",qam,[1838621761090285637],1838621418218561536,Reply,,"{'mentions': [{'start': 0, 'end': 14, 'usernam...",bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja,1182211615.0,,"[{'type': 'replied_to', 'id': '183862141821856...",
1802,2024-07-02 13:04:45+00:00,@Bundeskanzler @donaldtusk Jesteście ostatnim ...,1808124655041236992,1420353350,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 45, 'reply_count': 34, 'like...",pl,[1808124655041236995],1808110506722394368,Reply,,"{'mentions': [{'start': 0, 'end': 14, 'usernam...",KonradBerkowicz_2024-04-16_2024-10-15_vol1.json,Konfederacja,1.4891647116524667e+18,"[{'domain': {'id': '10', 'name': 'Person', 'de...","[{'type': 'replied_to', 'id': '180811050672239...",
1803,2024-07-02 13:02:57+00:00,@Bundeskanzler @donaldtusk Jesteście ostatnim ...,1808124200399356416,1420353350,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 20, 'reply_count': 20, 'like...",pl,[1808124200399356382],1808088070870233344,Reply,,"{'mentions': [{'start': 0, 'end': 14, 'usernam...",KonradBerkowicz_2024-04-16_2024-10-15_vol1.json,Konfederacja,1.4891647116524667e+18,"[{'domain': {'id': '10', 'name': 'Person', 'de...","[{'type': 'replied_to', 'id': '180808807087023...",
2058,2024-05-17 07:19:36+00:00,@KONFEDERACJA_ @Nowa_Nadzieja_,1791367951259406336,1420353350,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 0, 'reply_count': 2, 'like_c...",qam,[1791367951259406344],1791367605858550272,Reply,,"{'mentions': [{'start': 0, 'end': 14, 'usernam...",KonradBerkowicz_2024-04-16_2024-10-15_vol1.json,Konfederacja,1420353350.0,,"[{'type': 'replied_to', 'id': '179136760585855...",
2731,2024-01-21 20:27:16+00:00,https://t.co/a07zxVsZ7B,1749166794684653568,941710643853447168,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",zxx,[1749166794684653580],1749166791673168384,Reply,,"{'urls': [{'start': 0, 'end': 23, 'url': 'http...",MichalWawer_2023-10-16_2024-10-15.json,Konfederacja,9.417106438534472e+17,,"[{'type': 'replied_to', 'id': '174916679167316...",
2733,2024-01-20 18:19:48+00:00,https://t.co/a07zxVsZ7B,1748772328115155200,941710643853447168,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",zxx,[1748772328115155198],1748772325653139712,Reply,,"{'urls': [{'start': 0, 'end': 23, 'url': 'http...",MichalWawer_2023-10-16_2024-10-15.json,Konfederacja,9.417106438534472e+17,,"[{'type': 'replied_to', 'id': '174877232565313...",
2735,2024-01-19 17:29:22+00:00,https://t.co/a07zxVsZ7B,1748397249602699520,941710643853447168,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",zxx,[1748397249602699397],1748397247413305344,Reply,,"{'urls': [{'start': 0, 'end': 23, 'url': 'http...",MichalWawer_2023-10-16_2024-10-15.json,Konfederacja,9.417106438534472e+17,,"[{'type': 'replied_to', 'id': '174839724741330...",
2765,2024-01-05 19:33:21+00:00,https://t.co/osFnIRXwaB,1743355020580405760,941710643853447168,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",zxx,[1743355020580405656],1743355018181263616,Reply,,"{'urls': [{'start': 0, 'end': 23, 'url': 'http...",MichalWawer_2023-10-16_2024-10-15.json,Konfederacja,9.417106438534472e+17,,"[{'type': 'replied_to', 'id': '174335501818126...",
2767,2024-01-04 17:45:19+00:00,https://t.co/osFnIRXwaB,1742965447044076032,941710643853447168,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",zxx,[1742965447044076026],1742965444732965120,Reply,,"{'urls': [{'start': 0, 'end': 23, 'url': 'http...",MichalWawer_2023-10-16_2024-10-15.json,Konfederacja,9.417106438534472e+17,,"[{'type': 'replied_to', 'id': '174296544473296...",
2774,2023-12-30 12:59:30+00:00,https://t.co/plR4HvmYku,1741081578942427136,941710643853447168,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",False,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",zxx,[1741081578942427186],1741081576308453632,Reply,,"{'urls': [{'start': 0, 'end': 23, 'url': 'http...",MichalWawer_2023-10-16_2024-10-15.json,Konfederacja,9.417106438534472e+17,,"[{'type': 'replied_to', 'id': '174108157630845...",


In [25]:
id_duplicate_but_different_text[['id', 'text', 'created_at', 'author_id']]

Unnamed: 0,id,text,created_at,author_id
19617,0,@tomekbit ✌️,2023-10-16 00:00:00+00:00,61552404
19618,0,"@MaciejGdynia Maćku, czekam na oficjalne wynik...",2023-10-16 00:00:00+00:00,61552404
19619,0,"@MCichonAlicja Alu, czekamy jeszcze na wynik?",2023-10-16 00:00:00+00:00,61552404
19620,0,@REL_76 🥰🥰🥰,2023-10-16 00:00:00+00:00,61552404
19621,0,@Gidziela 🥰✌️,2023-10-16 00:00:00+00:00,61552404
19622,0,@WHaptar Gratulacje👏🥂,2023-10-16 00:00:00+00:00,61552404
19623,0,@KapenGenezyp Dziękuję❤️❤️❤️,2023-10-16 00:00:00+00:00,61552404
19624,0,@jasinska_e ❤️,2023-10-16 21:57:00+00:00,61552404
19625,0,@BMikolajewska odpowie💪,2023-10-16 22:27:00+00:00,61552404
19626,0,@DorotaNiedziela ja Tobie też❣️,2023-10-16 22:41:00+00:00,61552404


In [None]:
empty_id_rows = df[df['id'].isna()]
print(f"Rows where `id` is empty (NaN): {len(empty_id_rows)}")
#empty_id_rows


Rows where `id` is empty (NaN): 0


Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo


In [29]:
zero_id_rows = df[df['id'] == 0]
print(f"Rows where `id` is 0: {len(zero_id_rows)}")
zero_id_rows


Rows where `id` is 0: 16


Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
19617,2023-10-16 00:00:00+00:00,@tomekbit ✌️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19618,2023-10-16 00:00:00+00:00,"@MaciejGdynia Maćku, czekam na oficjalne wynik...",0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 2, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19619,2023-10-16 00:00:00+00:00,"@MCichonAlicja Alu, czekamy jeszcze na wynik?",0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19620,2023-10-16 00:00:00+00:00,@REL_76 🥰🥰🥰,0,61552404,,,everyone,"{'retweet_count': 1, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19621,2023-10-16 00:00:00+00:00,@Gidziela 🥰✌️,0,61552404,,,everyone,"{'retweet_count': 1, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19622,2023-10-16 00:00:00+00:00,@WHaptar Gratulacje👏🥂,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 1, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19623,2023-10-16 00:00:00+00:00,@KapenGenezyp Dziękuję❤️❤️❤️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19624,2023-10-16 21:57:00+00:00,@jasinska_e ❤️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19625,2023-10-16 22:27:00+00:00,@BMikolajewska odpowie💪,0,61552404,,,everyone,"{'retweet_count': 18, 'reply_count': 31, 'like...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,
19626,2023-10-16 22:41:00+00:00,@DorotaNiedziela ja Tobie też❣️,0,61552404,,,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,,,Reply,,,Leszczyna_2023-10-16_2023-12-31.json,PO,,,,


id 0 of a tweet may mean, that the tweet was, changed, deleted, or that it is not a tweet but something else and was wrongly categorized as one

We need to remove duplicate tweets because our custom downloading loop occasionally downloads the same tweet two times to ensure completeness.

In [30]:
# Remove duplicates from the dataframe based on specific columns
df.drop_duplicates(subset=['id'], inplace=True)

In [31]:
df.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],1765380709114409216,Original,,,Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],1764245727646044160,Original,{'media_keys': ['3_1764245720805040128']},"{'urls': [{'start': 57, 'end': 80, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],1763931255521857792,Original,{'media_keys': ['7_1763931103151202304']},"{'urls': [{'start': 135, 'end': 158, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],1763576337078313216,Original,{'media_keys': ['3_1763576329385967617']},"{'urls': [{'start': 64, 'end': 87, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],1763173358068281600,Original,{'media_keys': ['3_1763173351466401792']},"{'urls': [{'start': 128, 'end': 151, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,


We need to delete retweets because they are wrongly provided by the X API. We want to analyze only original tweets, replies, and quotes.

In [32]:
df = df[df['category'] != 'Retweet']

In [33]:
df

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],1765380709114409216,Original,,,Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],1764245727646044160,Original,{'media_keys': ['3_1764245720805040128']},"{'urls': [{'start': 57, 'end': 80, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],1763931255521857792,Original,{'media_keys': ['7_1763931103151202304']},"{'urls': [{'start': 135, 'end': 158, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],1763576337078313216,Original,{'media_keys': ['3_1763576329385967617']},"{'urls': [{'start': 64, 'end': 87, 'url': 'htt...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],1763173358068281600,Original,{'media_keys': ['3_1763173351466401792']},"{'urls': [{'start': 128, 'end': 151, 'url': 'h...",Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json,Konfederacja,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26671,2023-10-24 10:46:22+00:00,Formacje demokratyczne będą tworzyć przyszły r...,1716768088052359424,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 344, 'reply_count': 2013, 'l...",pl,[1716768088052359454],1716768088052359424,Original,{'media_keys': ['3_1716768082402664449']},"{'urls': [{'start': 145, 'end': 168, 'url': 'h...",KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,,,
26672,2023-10-19 18:05:26+00:00,"Żadne podchody, kłamstwa czy medialne wrzutki ...",1715066644018381312,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 1149, 'reply_count': 2406, '...",pl,[1715066644018381294],1715066644018381312,Original,,,KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,,,
26673,2023-10-19 08:52:05+00:00,"Polska potrzebuje nowego, demokratycznego rząd...",1714927388637696256,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 185, 'reply_count': 1108, 'l...",pl,[1714927388637696291],1714927388637696256,Original,,,KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,,,
26674,2023-10-18 16:12:37+00:00,95 lat temu zmarł generał Tadeusz Jordan Rozwa...,1714675864237924864,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,"{'retweet_count': 110, 'reply_count': 354, 'li...",pl,[1714675864237924823],1714675864237924864,Original,{'media_keys': ['3_1714675857548009473']},"{'urls': [{'start': 264, 'end': 287, 'url': 'h...",KosiniakKamysz_2023-10-16_2024-10-15.json,PSL,,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",,


In [38]:
# Update the 'username' column to keep only the string until '_2' -> split to date range

#df['username'] = df['username'].str.split('_2').str[0].copy()
df.loc[:, 'username'] = df['username'].str.split('_2').str[0]

In [39]:
category_summary = df['category'].value_counts()
print(category_summary)
total_tweets = category_summary.sum()
print(f"Total tweets: {total_tweets}")

category
Original    17595
Reply        5641
Quote        2826
Name: count, dtype: int64
Total tweets: 26062


In [41]:
# Ensure the created_at column is in datetime format

#df['created_at'] = pd.to_datetime(df['created_at'])
df.loc[:, 'created_at'] = pd.to_datetime(df['created_at'])

In [42]:
df.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,conversation_id,category,attachments,entities,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],1765380709114409216,Original,,,Iwaszkiewicz_RJ,Konfederacja,,,,
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],1764245727646044160,Original,{'media_keys': ['3_1764245720805040128']},"{'urls': [{'start': 57, 'end': 80, 'url': 'htt...",Iwaszkiewicz_RJ,Konfederacja,,,,
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],1763931255521857792,Original,{'media_keys': ['7_1763931103151202304']},"{'urls': [{'start': 135, 'end': 158, 'url': 'h...",Iwaszkiewicz_RJ,Konfederacja,,,,
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],1763576337078313216,Original,{'media_keys': ['3_1763576329385967617']},"{'urls': [{'start': 64, 'end': 87, 'url': 'htt...",Iwaszkiewicz_RJ,Konfederacja,,,,
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],1763173358068281600,Original,{'media_keys': ['3_1763173351466401792']},"{'urls': [{'start': 128, 'end': 151, 'url': 'h...",Iwaszkiewicz_RJ,Konfederacja,,,,


In [43]:
print(df.loc[1, 'text'])

Zwróćcie uwagę na punkt 11.
Tu nie ma z czego się śmiać… https://t.co/I5KaMsUDzo


In [18]:
def add_space_around_emojis(text):
    return ''.join(f' {char} ' if char in emoji.EMOJI_DATA or re.match(r'[\U0001F1E6-\U0001F1FF]', char) else char for char in text)

df['text'] = df['text'].apply(add_space_around_emojis)

def clean_text(text):
    mentions = re.findall(r'@\w+', text)
    text = re.sub(r'@\w+', '', text)
    links = re.findall(r'http\S+', text)
    text = re.sub(r'http\S+', '', text)
    hashtags = re.findall(r'#\w+', text)
    text = re.sub(r'(?<!\s)([\U0001F600-\U0001F64F])', r' \1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F])(?!\s)', r'\1 ', text)
    return [text, mentions, links, hashtags]

df[['text_clean', 'mentions', 'links', 'hashtags']] = pd.DataFrame(df['text'].apply(clean_text).tolist(), index=df.index)

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None  # Turn off the warning from lack of loc

In [None]:
def add_space_around_emojis(text):
    return ''.join(f' {char} ' if char in emoji.EMOJI_DATA or re.match(r'[\U0001F1E6-\U0001F1FF]', char) else char for char in text)

def clean_text(text):
    mentions = re.findall(r'@\w+', text)
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    
    links = re.findall(r'http\S+', text)
    text = re.sub(r'http\S+', '', text)  # Remove links
    
    hashtags = re.findall(r'#\w+', text)
    
    # Add spaces around emojis
    text = re.sub(r'(?<!\s)([\U0001F600-\U0001F64F])', r' \1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F])(?!\s)', r'\1 ', text)
    
    return [text.strip(), mentions, links, hashtags]

# Apply functions
df['text_clean'] = df['text'].apply(add_space_around_emojis)
#df.loc[:, 'text_clean'] = df['text'].apply(add_space_around_emojis)

df[['text_clean', 'mentions', 'links', 'hashtags']] = pd.DataFrame(df['text_clean'].apply(clean_text).tolist(), index=df.index)
#df.loc[:, ['text_clean', 'mentions', 'links', 'hashtags']] = pd.DataFrame(df['text_clean'].apply(clean_text).tolist(), index=df.index)

In [47]:
df.head()

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,public_metrics,lang,edit_history_tweet_ids,...,username,party,in_reply_to_user_id,context_annotations,referenced_tweets,geo,text_clean,mentions,links,hashtags
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 17, 'reply_count': 4, 'like_...",pl,[1765380709114409272],...,Iwaszkiewicz_RJ,Konfederacja,,,,,"Policja rzucająca kostką brukową w rolników, p...",[],[],[]
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1764245727646044196],...,Iwaszkiewicz_RJ,Konfederacja,,,,,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,[],[https://t.co/I5KaMsUDzo],[]
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763931255521857839],...,Iwaszkiewicz_RJ,Konfederacja,,,,,"Równość, ale nie dla wszystkich… \nPani „minis...",[],[https://t.co/PyKYCMvSN3],[]
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763576337078313201],...,Iwaszkiewicz_RJ,Konfederacja,,,,,Pytanie za sto punktów! \nFirma jakiego państw...,[],[https://t.co/g61Cb5BsIx],[]
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",pl,[1763173358068281524],...,Iwaszkiewicz_RJ,Konfederacja,,,,,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,[],[https://t.co/b0Q4Tt1FJW],[]


In [48]:
df.drop(columns=['entities'], inplace=True)

In [49]:
# Some additioanl numerical data from tweets is extracted and added to the dataframe as new variables, then the original column is dropped
df['retweet_count'] = df['public_metrics'].apply(lambda x: x['retweet_count'])
df['reply_count'] = df['public_metrics'].apply(lambda x: x['reply_count'])
df['like_count'] = df['public_metrics'].apply(lambda x: x['like_count'])
df['quote_count'] = df['public_metrics'].apply(lambda x: x['quote_count'])
df['impression_count'] = df['public_metrics'].apply(lambda x: x['impression_count'])

df.drop(columns=['public_metrics'], inplace=True)

In [51]:
df

Unnamed: 0,created_at,text,id,author_id,edit_controls,possibly_sensitive,reply_settings,lang,edit_history_tweet_ids,conversation_id,...,geo,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count
0,2024-03-06 14:15:34+00:00,"Policja rzucająca kostką brukową w rolników, p...",1765380709114409216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1765380709114409272],1765380709114409216,...,,"Policja rzucająca kostką brukową w rolników, p...",[],[],[],17,4,203,0,12028
1,2024-03-03 11:05:34+00:00,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,1764245727646044160,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1764245727646044196],1764245727646044160,...,,Zwróćcie uwagę na punkt 11.\nTu nie ma z czego...,[],[https://t.co/I5KaMsUDzo],[],0,0,2,0,275
2,2024-03-02 14:15:58+00:00,"Równość, ale nie dla wszystkich… \nPani „minis...",1763931255521857792,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763931255521857839],1763931255521857792,...,,"Równość, ale nie dla wszystkich… \nPani „minis...",[],[https://t.co/PyKYCMvSN3],[],0,0,0,0,188
3,2024-03-01 14:45:39+00:00,Pytanie za sto punktów! \nFirma jakiego państw...,1763576337078313216,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763576337078313201],1763576337078313216,...,,Pytanie za sto punktów! \nFirma jakiego państw...,[],[https://t.co/g61Cb5BsIx],[],0,0,1,0,170
4,2024-02-29 12:04:21+00:00,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,1763173358068281600,1554483991511703552,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",False,everyone,pl,[1763173358068281524],1763173358068281600,...,,Ukraina jeździ na polskim paliwie. \n\nPrzypom...,[],[https://t.co/b0Q4Tt1FJW],[],0,0,0,0,124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26671,2023-10-24 10:46:22+00:00,Formacje demokratyczne będą tworzyć przyszły r...,1716768088052359424,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,pl,[1716768088052359454],1716768088052359424,...,,Formacje demokratyczne będą tworzyć przyszły r...,[],[https://t.co/XFrtPBIMiv],[],344,2013,4637,100,330379
26672,2023-10-19 18:05:26+00:00,"Żadne podchody, kłamstwa czy medialne wrzutki ...",1715066644018381312,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,pl,[1715066644018381294],1715066644018381312,...,,"Żadne podchody, kłamstwa czy medialne wrzutki ...",[],[],[],1149,2406,12760,182,689059
26673,2023-10-19 08:52:05+00:00,"Polska potrzebuje nowego, demokratycznego rząd...",1714927388637696256,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,pl,[1714927388637696291],1714927388637696256,...,,"Polska potrzebuje nowego, demokratycznego rząd...",[],[],[],185,1108,3443,47,385751
26674,2023-10-18 16:12:37+00:00,95 lat temu zmarł generał Tadeusz Jordan Rozwa...,1714675864237924864,955239446,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",0,everyone,pl,[1714675864237924823],1714675864237924864,...,,95 lat temu zmarł generał Tadeusz Jordan Rozwa...,[],[https://t.co/TZaorUiXQ1],[],110,354,1568,21,205237


In [53]:
# changing id from float to int
df['id'] = df['id'].astype('int64')

In [54]:
df_clean_text = df[['id', 'text_clean']]

df_clean_text.to_csv('data_for_translation.csv', index=False)

# TO JEST DO OGARNIĘCIA -> OCZEKIWUJE NA DANE

In [55]:
df_en_text = pd.read_csv('Data/tweets_translation/translated_tweets.csv')
df_en_text.head()


FileNotFoundError: [Errno 2] No such file or directory: 'Data/tweets_translation/translated_tweets.csv'

In [26]:
df_en_text["id"] = df_en_text["id"].apply(lambda x: int(float(x.replace(',', ''))))

In [27]:
if 'text_clean_en' in df_en_text.columns:
    df = df.merge(df_en_text[['id', 'text_clean_en']], on='id', how='left')

    display(df.head())
else:
    print("Column 'text_clean_en' does not exist in df_clean_text")


Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,text,edit_history_tweet_ids,reply_settings,author_id,...,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en
0,1846267743022330112,False,509272614.0,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w przód ...",[1846267743022330183],everyone,1187748790863839232,...,"""Ani kroku wstecz!"" - w przód też żadnego jak...",[@Dariusz_Jonski],[],[],0,1,13,0,219,"""Not a step back!"" - you haven't put any forw..."
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",Właśnie zadzwonił do mnie nieznany numer. Już ...,[1846264777347117471],everyone,1187748790863839232,...,Właśnie zadzwonił do mnie nieznany numer. Już ...,[],[],[],28,9,418,0,6449,An unknown number just called me. I thought it...
2,1846262693394588160,False,955239446.0,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@KosiniakKamysz Czy ""jeszcze cięższa praca"" pr...",[1846262693394588154],everyone,1187748790863839232,...,"Czy ""jeszcze cięższa praca"" przełoży się na r...",[@KosiniakKamysz],[],[],4,1,38,0,443,"Will ""even harder work"" translate into the fu..."
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",Polecam się zapoznać z treścią konferencji 👇 ...,[1846261341327446163],everyone,1187748790863839232,...,Polecam się zapoznać z treścią konferencji 👇,[],[https://t.co/U7LAeL2cqP],[],9,2,48,0,687,I recommend reading the content of the confere...
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",❌ Kilka dni temu zagłosowałem za odwołaniem K...,[1846104865829015639],everyone,1187748790863839232,...,❌ Kilka dni temu zagłosowałem za odwołaniem K...,[],"[https://t.co/JQAQMEKjEs, https://t.co/4qtpUJO...",[],30,0,141,0,3354,"❌ A few days ago, I voted to dismiss Krzyszto..."


In [28]:
# Replace '#VALUE!' with NaN in 'text_clean_en' column
df['text_clean_en'] = df['text_clean_en'].replace('#VALUE!', pd.NA)

In [29]:
df['text_clean_en_demojized'] = df['text_clean_en'].apply(lambda x: emoji.demojize(str(x)) if pd.notnull(x) else x)

df[['text_clean_en', 'text_clean_en_demojized']].head()

Unnamed: 0,text_clean_en,text_clean_en_demojized
0,"""Not a step back!"" - you haven't put any forw...","""Not a step back!"" - you haven't put any forw..."
1,An unknown number just called me. I thought it...,An unknown number just called me. I thought it...
2,"Will ""even harder work"" translate into the fu...","Will ""even harder work"" translate into the fu..."
3,I recommend reading the content of the confere...,I recommend reading the content of the confere...
4,"❌ A few days ago, I voted to dismiss Krzyszto...",":cross_mark: A few days ago, I voted to dismi..."


In [30]:
df

Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,text,edit_history_tweet_ids,reply_settings,author_id,...,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en,text_clean_en_demojized
0,1846267743022330112,False,509272614,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w przód ...",[1846267743022330183],everyone,1187748790863839232,...,[@Dariusz_Jonski],[],[],0,1,13,0,219,"""Not a step back!"" - you haven't put any forw...","""Not a step back!"" - you haven't put any forw..."
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",Właśnie zadzwonił do mnie nieznany numer. Już ...,[1846264777347117471],everyone,1187748790863839232,...,[],[],[],28,9,418,0,6449,An unknown number just called me. I thought it...,An unknown number just called me. I thought it...
2,1846262693394588160,False,955239446,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@KosiniakKamysz Czy ""jeszcze cięższa praca"" pr...",[1846262693394588154],everyone,1187748790863839232,...,[@KosiniakKamysz],[],[],4,1,38,0,443,"Will ""even harder work"" translate into the fu...","Will ""even harder work"" translate into the fu..."
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",Polecam się zapoznać z treścią konferencji 👇 ...,[1846261341327446163],everyone,1187748790863839232,...,[],[https://t.co/U7LAeL2cqP],[],9,2,48,0,687,I recommend reading the content of the confere...,I recommend reading the content of the confere...
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",❌ Kilka dni temu zagłosowałem za odwołaniem K...,[1846104865829015639],everyone,1187748790863839232,...,[],"[https://t.co/JQAQMEKjEs, https://t.co/4qtpUJO...",[],30,0,141,0,3354,"❌ A few days ago, I voted to dismiss Krzyszto...",":cross_mark: A few days ago, I voted to dismi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11452,1721611648257921024,0,,2023-11-06 19:32:57+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",Prezydent RP jest gwarantem ciągłości władzy p...,[1721611648257921084],everyone,964017524,...,[],[],[],17,6,53,1,4871,The President of the Republic of Poland is the...,The President of the Republic of Poland is the...
11453,1720163035048706304,0,,2023-11-02 19:36:41+00:00,"[{'type': 'quoted', 'id': '1720122294914187659'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",W państwach o ustabilizowanej demokracji siła ...,[1720163035048706306],everyone,964017524,...,[],[https://t.co/u1vzB6ImaB],[],18,2,41,2,3152,"In countries with stable democracy, the streng...","In countries with stable democracy, the streng..."
11454,1716897815400792320,0,,2023-10-24 19:21:52+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",Piękny epilog kampanii wyborczej:\n ✅ ️ @Rober...,[1716897815400792400],everyone,964017524,...,"[@RobertTelus, @KosiniakKamysz]",[https://t.co/MGuhesegb2],[#TrzeciaDroga],26,7,97,1,5674,A beautiful epilogue of the election campaign:...,A beautiful epilogue of the election campaign:...
11455,1714387831425052928,0,3370515933,2023-10-17 21:08:05+00:00,"[{'type': 'replied_to', 'id': '171438631727710...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",@motykamilosz Jak widać nie tylko w piosenkach...,[1714387831425052856],everyone,964017524,...,[@motykamilosz],[],[],1,2,50,0,1301,"As you can see, not only in Eleni's songs ""Lo...","As you can see, not only in Eleni's songs ""Lo..."


In [31]:
df['possibly_sensitive'] = df['possibly_sensitive'].astype(bool)

In [32]:
username_to_realname = {
    'bartlomiejpejo': 'Bartłomiej Pejo',
    'RobertBiedron': 'Robert Biedroń',
    'PatrykJaki': 'Patryk Jaki',
    'Kpelczynska': 'Katarzyna Pelczyńska',
    'OklaDrewnowicz': 'Agnieszka Okła-Drewnowicz',
    'KosiniakKamysz': 'Władysław Kosiniak-Kamysz',
    'mwojcik_': 'Michał Wójcik',
    'MorawieckiM': 'Mateusz Morawiecki',
    'SlawomirMentzen': 'Sławomir Mentzen',
    'Wlodek_Skalik': 'Włodzimierz Skalik',
    'BeataSzydlo': 'Beata Szydło',
    'WTumanowicz': 'Witold Tumanowicz',
    'KGawkowski': 'Krzysztof Gawkowski',
    'wlodekczarzasty': 'Włodzimierz Czarzasty',
    'Kaminski_M_': 'Mariusz Kamiński',
    'Macierewicz_A': 'Antoni Macierewicz',
    'elzbietawitek': 'Elżbieta Witek',
    'aga_buczynska': 'Agnieszka Buczyńska',
    'szymon_holownia': 'Szymon Hołownia',
    'DorotaNiedziela': 'Dorota Niedziela',
    'EwaKopacz': 'Ewa Kopacz',
    'Leszczyna': 'Izabela Leszczyna',
    'M_K_Blonska': 'Małgorzata Kidawa-Błońska',
    'bbudka': 'Borys Budka',
    'donaldtusk': 'Donald Tusk',
    'DariuszKlimczak': 'Dariusz Klimczak',
    'GrzybAndrzej': 'Andrzej Grzyb',
    'Hetman_K': 'Krzysztof Hetman',
    'JarubasAdam': 'Adam Jarubas',
    'Paslawska': 'Urszula Pasławska',
    'TudujKrzysztof': 'Krzysztof Tuduj',
    'ZalewskiPawel': 'Paweł Zalewski'
}

# Add the 'name' column to the dataframe
df['name'] = df['username'].map(username_to_realname)

In [33]:
# Delete next line sign from the 'text_clean_en' column
df['text_clean_en'] = df['text_clean_en'].str.replace('\n', ' ')

In [34]:
# Save the DataFrame to a Parquet file
df.to_parquet('cleaned_data/df_combined.parquet', index=False)