# Data cleaning

In this file, we cleaned the downloaded data:
The main steps include:
1. Adding party affiliation to tweet rows
2. Deleting unnecessary downloaded Retweets.
3. Deleting links and mentions from the tweets text and saving them to separate columns
4. Expanding the column of public metrics
5. Encoding emojis in a unified format
6. Translating tweets using Google Translate in Google Sheets
7. Saving all downloaded tweets to one file

### 1.  Used libraries

In [1]:
import os
import pandas as pd
import re
import emoji

### 2. Reading JSON files and transforming them into party-specific pickle files

In [3]:
base_input_path = 'data/tweets_data_final'
subfolders = ['Konfederacja', 'NL', 'PIS', 'PO', 'PL2050', 'PSL']
output_folder = 'data/tweets_data_combined'

os.makedirs(output_folder, exist_ok=True)

for subfolder in subfolders:
    folder_path = os.path.join(base_input_path, subfolder)
    dataframes = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            politician = filename.split("_tweets.json")[0]
            try:
                df = pd.read_json(file_path)  
                df["username"] = politician  
                df["party"] = subfolder
                print(f"Read {len(df)} rows from {file_path}")  
                dataframes.append(df)
            except ValueError as e:
                print(f"Error reading {file_path}: {e}")
    
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        
        output_file_path = os.path.join(output_folder, f'{subfolder}_combined.pkl')
        combined_df.to_pickle(output_file_path) 
        
        print(f"Saved {subfolder} combined data to {output_file_path}")

print("Processing complete!")

Read 950 rows from data/tweets_data_final/Konfederacja/Wlodek_Skalik_2023-10-16_2024-10-15.json
Read 721 rows from data/tweets_data_final/Konfederacja/SlawomirMentzen_2023-10-16_2024-10-15.json
Read 175 rows from data/tweets_data_final/Konfederacja/TudujKrzysztof_2023-10-16_2024-10-15.json
Read 964 rows from data/tweets_data_final/Konfederacja/bartlomiejpejo_2023-10-16_2024-10-15.json
Read 750 rows from data/tweets_data_final/Konfederacja/WTumanowicz_2023-10-16_2024-10-15.json
Saved Konfederacja combined data to data/tweets_data_combined/Konfederacja_combined.pkl
Read 457 rows from data/tweets_data_final/NL/RobertBiedron_2023-10-16_2024-10-15.json
Read 178 rows from data/tweets_data_final/NL/KGawkowski_2023-10-16_2024-10-15.json
Read 73 rows from data/tweets_data_final/NL/wlodekczarzasty_2023-10-16_2024-10-15.json
Read 0 rows from data/tweets_data_final/NL/DyduchMarek_2023-10-16_2024-10-15.json
Saved NL combined data to data/tweets_data_combined/NL_combined.pkl
Read 647 rows from data/

### 3. Data cleaning

In [4]:
df_konf = pd.read_pickle(os.path.join(output_folder, 'Konfederacja_combined.pkl'))
df_NL = pd.read_pickle(os.path.join(output_folder, 'NL_combined.pkl'))
df_PIS = pd.read_pickle(os.path.join(output_folder, 'PIS_combined.pkl'))
df_PO = pd.read_pickle(os.path.join(output_folder, 'PO_combined.pkl'))
df_PL2050 = pd.read_pickle(os.path.join(output_folder, 'PL2050_combined.pkl'))
df_PSL = pd.read_pickle(os.path.join(output_folder, 'PSL_combined.pkl'))

In [5]:
df_konf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3560 entries, 0 to 3559
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   id                      3560 non-null   int64              
 1   possibly_sensitive      3560 non-null   bool               
 2   in_reply_to_user_id     726 non-null    float64            
 3   created_at              3560 non-null   datetime64[ns, UTC]
 4   referenced_tweets       1055 non-null   object             
 5   edit_controls           3560 non-null   object             
 6   entities                3281 non-null   object             
 7   text                    3560 non-null   object             
 8   public_metrics          3560 non-null   object             
 9   edit_history_tweet_ids  3560 non-null   object             
 10  reply_settings          3560 non-null   object             
 11  author_id               3560 non-null   int

In [6]:
# Merge all dataframes into one
df = pd.concat([df_konf, df_NL, df_PIS, df_PO, df_PL2050, df_PSL], ignore_index=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11523 entries, 0 to 11522
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   id                      11507 non-null  float64            
 1   possibly_sensitive      11507 non-null  object             
 2   in_reply_to_user_id     1888 non-null   float64            
 3   created_at              11523 non-null  datetime64[ns, UTC]
 4   referenced_tweets       3210 non-null   object             
 5   edit_controls           11507 non-null  object             
 6   entities                9861 non-null   object             
 7   text                    11523 non-null  object             
 8   public_metrics          11523 non-null  object             
 9   edit_history_tweet_ids  11507 non-null  object             
 10  reply_settings          11523 non-null  object             
 11  author_id               11523 non-null  f

In [8]:
pd.options.display.float_format = '{:.0f}'.format
df['id'] = df['id'].fillna(0).astype('int64')
df['id']

0        1846267743022330112
1        1846264777347117568
2        1846262693394588160
3        1846261341327446272
4        1846104865829015552
                ...         
11518    1720163035048706304
11519    1716897815400792320
11520    1714387831425052928
11521    1714364905619431680
11522    1714364905619431680
Name: id, Length: 11523, dtype: int64

In [9]:
df['id'].nunique()

11461

We need to remove duplicate tweets because our custom downloading loop occasionally downloads the same tweet two times to ensure completeness.

In [10]:
# Remove duplicates from the dataframe based on specific columns
df.drop_duplicates(subset=['id'], inplace=True)

In [11]:
df.head()

Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,entities,text,public_metrics,edit_history_tweet_ids,reply_settings,author_id,lang,conversation_id,category,context_annotations,attachments,geo,username,party
0,1846267743022330112,False,509272614.0,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","{'mentions': [{'start': 0, 'end': 15, 'usernam...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w prz√≥d ...","{'retweet_count': 0, 'reply_count': 1, 'like_c...",[1846267743022330183],everyone,1187748790863839232,pl,1846155881169182720,Reply,,,,Wlodek_Skalik_2023-10-16_2024-10-15.json,Konfederacja
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º ...,"{'retweet_count': 28, 'reply_count': 9, 'like_...",[1846264777347117471],everyone,1187748790863839232,pl,1846264777347117568,Original,,,,Wlodek_Skalik_2023-10-16_2024-10-15.json,Konfederacja
2,1846262693394588160,False,955239446.0,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","{'mentions': [{'start': 0, 'end': 15, 'usernam...","@KosiniakKamysz Czy ""jeszcze ciƒô≈ºsza praca"" pr...","{'retweet_count': 4, 'reply_count': 1, 'like_c...",[1846262693394588154],everyone,1187748790863839232,pl,1846127632112369664,Reply,,,,Wlodek_Skalik_2023-10-16_2024-10-15.json,Konfederacja
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...","{'urls': [{'start': 45, 'end': 68, 'url': 'htt...",Polecam siƒô zapoznaƒá z tre≈õciƒÖ konferencji üëá h...,"{'retweet_count': 9, 'reply_count': 2, 'like_c...",[1846261341327446163],everyone,1187748790863839232,pl,1846261341327446272,Quote,"[{'domain': {'id': '10', 'name': 'Person', 'de...",,,Wlodek_Skalik_2023-10-16_2024-10-15.json,Konfederacja
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...","{'urls': [{'start': 277, 'end': 300, 'url': 'h...",‚ùåKilka dni temu zag≈Çosowa≈Çem za odwo≈Çaniem Krz...,"{'retweet_count': 30, 'reply_count': 0, 'like_...",[1846104865829015639],everyone,1187748790863839232,pl,1846104865829015552,Original,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",{'media_keys': ['3_1846104848942432256']},,Wlodek_Skalik_2023-10-16_2024-10-15.json,Konfederacja


We need to delete retweets because they are wrongly provided by the X API. We want to analyze only original tweets, replies, and quotes.

In [12]:
df = df[df['category'] != 'Retweet']

In [13]:
# Update the 'username' column to keep only the string until '_2'
df['username'] = df['username'].str.split('_2').str[0]

In [14]:
category_summary = df['category'].value_counts()
print(category_summary)
total_tweets = category_summary.sum()
print(f"Total tweets: {total_tweets}")

category
Original    8235
Reply       1852
Quote       1370
Name: count, dtype: int64
Total tweets: 11457


In [15]:
# Ensure the created_at column is in datetime format
df['created_at'] = pd.to_datetime(df['created_at'])

In [16]:
df.head()

Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,entities,text,public_metrics,edit_history_tweet_ids,reply_settings,author_id,lang,conversation_id,category,context_annotations,attachments,geo,username,party
0,1846267743022330112,False,509272614.0,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","{'mentions': [{'start': 0, 'end': 15, 'usernam...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w prz√≥d ...","{'retweet_count': 0, 'reply_count': 1, 'like_c...",[1846267743022330183],everyone,1187748790863839232,pl,1846155881169182720,Reply,,,,Wlodek_Skalik,Konfederacja
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º ...,"{'retweet_count': 28, 'reply_count': 9, 'like_...",[1846264777347117471],everyone,1187748790863839232,pl,1846264777347117568,Original,,,,Wlodek_Skalik,Konfederacja
2,1846262693394588160,False,955239446.0,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","{'mentions': [{'start': 0, 'end': 15, 'usernam...","@KosiniakKamysz Czy ""jeszcze ciƒô≈ºsza praca"" pr...","{'retweet_count': 4, 'reply_count': 1, 'like_c...",[1846262693394588154],everyone,1187748790863839232,pl,1846127632112369664,Reply,,,,Wlodek_Skalik,Konfederacja
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...","{'urls': [{'start': 45, 'end': 68, 'url': 'htt...",Polecam siƒô zapoznaƒá z tre≈õciƒÖ konferencji üëá h...,"{'retweet_count': 9, 'reply_count': 2, 'like_c...",[1846261341327446163],everyone,1187748790863839232,pl,1846261341327446272,Quote,"[{'domain': {'id': '10', 'name': 'Person', 'de...",,,Wlodek_Skalik,Konfederacja
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...","{'urls': [{'start': 277, 'end': 300, 'url': 'h...",‚ùåKilka dni temu zag≈Çosowa≈Çem za odwo≈Çaniem Krz...,"{'retweet_count': 30, 'reply_count': 0, 'like_...",[1846104865829015639],everyone,1187748790863839232,pl,1846104865829015552,Original,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",{'media_keys': ['3_1846104848942432256']},,Wlodek_Skalik,Konfederacja


In [17]:
print(df.loc[1, 'text'])

W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º my≈õla≈Çem, ≈ºe to lekarz chce siƒô zapytaƒá o moje samopoczucie.

Okaza≈Ço siƒô, ≈ºe to jednak fotowoltaika.


In [18]:
def add_space_around_emojis(text):
    return ''.join(f' {char} ' if char in emoji.EMOJI_DATA or re.match(r'[\U0001F1E6-\U0001F1FF]', char) else char for char in text)

df['text'] = df['text'].apply(add_space_around_emojis)

def clean_text(text):
    mentions = re.findall(r'@\w+', text)
    text = re.sub(r'@\w+', '', text)
    links = re.findall(r'http\S+', text)
    text = re.sub(r'http\S+', '', text)
    hashtags = re.findall(r'#\w+', text)
    text = re.sub(r'(?<!\s)([\U0001F600-\U0001F64F])', r' \1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F])(?!\s)', r'\1 ', text)
    return [text, mentions, links, hashtags]

df[['text_clean', 'mentions', 'links', 'hashtags']] = pd.DataFrame(df['text'].apply(clean_text).tolist(), index=df.index)

In [19]:
df.head()

Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,entities,text,public_metrics,edit_history_tweet_ids,...,category,context_annotations,attachments,geo,username,party,text_clean,mentions,links,hashtags
0,1846267743022330112,False,509272614.0,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","{'mentions': [{'start': 0, 'end': 15, 'usernam...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w prz√≥d ...","{'retweet_count': 0, 'reply_count': 1, 'like_c...",[1846267743022330183],...,Reply,,,,Wlodek_Skalik,Konfederacja,"""Ani kroku wstecz!"" - w prz√≥d te≈º ≈ºadnego jak...",[@Dariusz_Jonski],[],[]
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º ...,"{'retweet_count': 28, 'reply_count': 9, 'like_...",[1846264777347117471],...,Original,,,,Wlodek_Skalik,Konfederacja,W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º ...,[],[],[]
2,1846262693394588160,False,955239446.0,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","{'mentions': [{'start': 0, 'end': 15, 'usernam...","@KosiniakKamysz Czy ""jeszcze ciƒô≈ºsza praca"" pr...","{'retweet_count': 4, 'reply_count': 1, 'like_c...",[1846262693394588154],...,Reply,,,,Wlodek_Skalik,Konfederacja,"Czy ""jeszcze ciƒô≈ºsza praca"" prze≈Ço≈ºy siƒô na r...",[@KosiniakKamysz],[],[]
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...","{'urls': [{'start': 45, 'end': 68, 'url': 'htt...",Polecam siƒô zapoznaƒá z tre≈õciƒÖ konferencji üëá ...,"{'retweet_count': 9, 'reply_count': 2, 'like_c...",[1846261341327446163],...,Quote,"[{'domain': {'id': '10', 'name': 'Person', 'de...",,,Wlodek_Skalik,Konfederacja,Polecam siƒô zapoznaƒá z tre≈õciƒÖ konferencji üëá,[],[https://t.co/U7LAeL2cqP],[]
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...","{'urls': [{'start': 277, 'end': 300, 'url': 'h...",‚ùå Kilka dni temu zag≈Çosowa≈Çem za odwo≈Çaniem K...,"{'retweet_count': 30, 'reply_count': 0, 'like_...",[1846104865829015639],...,Original,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",{'media_keys': ['3_1846104848942432256']},,Wlodek_Skalik,Konfederacja,‚ùå Kilka dni temu zag≈Çosowa≈Çem za odwo≈Çaniem K...,[],"[https://t.co/JQAQMEKjEs, https://t.co/4qtpUJO...",[]


In [20]:
df.drop(columns=['entities'], inplace=True)

In [21]:
df['retweet_count'] = df['public_metrics'].apply(lambda x: x['retweet_count'])
df['reply_count'] = df['public_metrics'].apply(lambda x: x['reply_count'])
df['like_count'] = df['public_metrics'].apply(lambda x: x['like_count'])
df['quote_count'] = df['public_metrics'].apply(lambda x: x['quote_count'])
df['impression_count'] = df['public_metrics'].apply(lambda x: x['impression_count'])

df.drop(columns=['public_metrics'], inplace=True)

In [22]:
df

Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,text,edit_history_tweet_ids,reply_settings,author_id,...,party,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count
0,1846267743022330112,False,509272614,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w prz√≥d ...",[1846267743022330183],everyone,1187748790863839232,...,Konfederacja,"""Ani kroku wstecz!"" - w prz√≥d te≈º ≈ºadnego jak...",[@Dariusz_Jonski],[],[],0,1,13,0,219
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º ...,[1846264777347117471],everyone,1187748790863839232,...,Konfederacja,W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º ...,[],[],[],28,9,418,0,6449
2,1846262693394588160,False,955239446,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@KosiniakKamysz Czy ""jeszcze ciƒô≈ºsza praca"" pr...",[1846262693394588154],everyone,1187748790863839232,...,Konfederacja,"Czy ""jeszcze ciƒô≈ºsza praca"" prze≈Ço≈ºy siƒô na r...",[@KosiniakKamysz],[],[],4,1,38,0,443
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",Polecam siƒô zapoznaƒá z tre≈õciƒÖ konferencji üëá ...,[1846261341327446163],everyone,1187748790863839232,...,Konfederacja,Polecam siƒô zapoznaƒá z tre≈õciƒÖ konferencji üëá,[],[https://t.co/U7LAeL2cqP],[],9,2,48,0,687
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",‚ùå Kilka dni temu zag≈Çosowa≈Çem za odwo≈Çaniem K...,[1846104865829015639],everyone,1187748790863839232,...,Konfederacja,‚ùå Kilka dni temu zag≈Çosowa≈Çem za odwo≈Çaniem K...,[],"[https://t.co/JQAQMEKjEs, https://t.co/4qtpUJO...",[],30,0,141,0,3354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11517,1721611648257921024,0,,2023-11-06 19:32:57+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",Prezydent RP jest gwarantem ciƒÖg≈Ço≈õci w≈Çadzy p...,[1721611648257921084],everyone,964017524,...,PSL,Prezydent RP jest gwarantem ciƒÖg≈Ço≈õci w≈Çadzy p...,[],[],[],17,6,53,1,4871
11518,1720163035048706304,0,,2023-11-02 19:36:41+00:00,"[{'type': 'quoted', 'id': '1720122294914187659'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",W pa≈Ñstwach o ustabilizowanej demokracji si≈Ça ...,[1720163035048706306],everyone,964017524,...,PSL,W pa≈Ñstwach o ustabilizowanej demokracji si≈Ça ...,[],[https://t.co/u1vzB6ImaB],[],18,2,41,2,3152
11519,1716897815400792320,0,,2023-10-24 19:21:52+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",Piƒôkny epilog kampanii wyborczej:\n ‚úÖ Ô∏è @Rober...,[1716897815400792400],everyone,964017524,...,PSL,Piƒôkny epilog kampanii wyborczej:\n ‚úÖ Ô∏è musi ...,"[@RobertTelus, @KosiniakKamysz]",[https://t.co/MGuhesegb2],[#TrzeciaDroga],26,7,97,1,5674
11520,1714387831425052928,0,3370515933,2023-10-17 21:08:05+00:00,"[{'type': 'replied_to', 'id': '171438631727710...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",@motykamilosz Jak widaƒá nie tylko w piosenkach...,[1714387831425052856],everyone,964017524,...,PSL,"Jak widaƒá nie tylko w piosenkach Eleni ""Mi≈Ço≈õ...",[@motykamilosz],[],[],1,2,50,0,1301


In [23]:
df['id'] = df['id'].astype('int64')

In [24]:
df_clean_text = df[['id', 'text_clean']]

df_clean_text.to_csv('data_for_translation.csv', index=False)

In [25]:
df_en_text = pd.read_csv('tweets_translation/translated_tweets.csv')
df_en_text.head()


Unnamed: 0,id,text_clean,text_clean_en
0,1846277256509116672,"Niezrealizowanie wiƒôkszo≈õci ze ""100 konkret√≥w...","Failure to implement most of the ""100 specifi..."
1,1846222583898784000,Rok po wyborach trzeba powiedzieƒá jedno - nie ...,"A year after the elections, one thing must be ..."
2,1846161400328028160,"‚ùå Mamy rok po wyborach, a Polska pogrƒÖ≈ºa siƒô ...","‚ùå We are a year after the elections, and Pola..."
3,1846091824101769472,Mija rok od wybor√≥w parlamentarnych. W kampani...,A year has passed since the parliamentary elec...
4,1846075343188144128,#Idƒô11 üáµ üá±,#I'm going11 üáµ üá±


In [26]:
df_en_text["id"] = df_en_text["id"].apply(lambda x: int(float(x.replace(',', ''))))

In [27]:
if 'text_clean_en' in df_en_text.columns:
    df = df.merge(df_en_text[['id', 'text_clean_en']], on='id', how='left')

    display(df.head())
else:
    print("Column 'text_clean_en' does not exist in df_clean_text")


Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,text,edit_history_tweet_ids,reply_settings,author_id,...,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en
0,1846267743022330112,False,509272614.0,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w prz√≥d ...",[1846267743022330183],everyone,1187748790863839232,...,"""Ani kroku wstecz!"" - w prz√≥d te≈º ≈ºadnego jak...",[@Dariusz_Jonski],[],[],0,1,13,0,219,"""Not a step back!"" - you haven't put any forw..."
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º ...,[1846264777347117471],everyone,1187748790863839232,...,W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º ...,[],[],[],28,9,418,0,6449,An unknown number just called me. I thought it...
2,1846262693394588160,False,955239446.0,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@KosiniakKamysz Czy ""jeszcze ciƒô≈ºsza praca"" pr...",[1846262693394588154],everyone,1187748790863839232,...,"Czy ""jeszcze ciƒô≈ºsza praca"" prze≈Ço≈ºy siƒô na r...",[@KosiniakKamysz],[],[],4,1,38,0,443,"Will ""even harder work"" translate into the fu..."
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",Polecam siƒô zapoznaƒá z tre≈õciƒÖ konferencji üëá ...,[1846261341327446163],everyone,1187748790863839232,...,Polecam siƒô zapoznaƒá z tre≈õciƒÖ konferencji üëá,[],[https://t.co/U7LAeL2cqP],[],9,2,48,0,687,I recommend reading the content of the confere...
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",‚ùå Kilka dni temu zag≈Çosowa≈Çem za odwo≈Çaniem K...,[1846104865829015639],everyone,1187748790863839232,...,‚ùå Kilka dni temu zag≈Çosowa≈Çem za odwo≈Çaniem K...,[],"[https://t.co/JQAQMEKjEs, https://t.co/4qtpUJO...",[],30,0,141,0,3354,"‚ùå A few days ago, I voted to dismiss Krzyszto..."


In [28]:
# Replace '#VALUE!' with NaN in 'text_clean_en' column
df['text_clean_en'] = df['text_clean_en'].replace('#VALUE!', pd.NA)

In [29]:
df['text_clean_en_demojized'] = df['text_clean_en'].apply(lambda x: emoji.demojize(str(x)) if pd.notnull(x) else x)

df[['text_clean_en', 'text_clean_en_demojized']].head()

Unnamed: 0,text_clean_en,text_clean_en_demojized
0,"""Not a step back!"" - you haven't put any forw...","""Not a step back!"" - you haven't put any forw..."
1,An unknown number just called me. I thought it...,An unknown number just called me. I thought it...
2,"Will ""even harder work"" translate into the fu...","Will ""even harder work"" translate into the fu..."
3,I recommend reading the content of the confere...,I recommend reading the content of the confere...
4,"‚ùå A few days ago, I voted to dismiss Krzyszto...",":cross_mark: A few days ago, I voted to dismi..."


In [30]:
df

Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,text,edit_history_tweet_ids,reply_settings,author_id,...,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en,text_clean_en_demojized
0,1846267743022330112,False,509272614,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w prz√≥d ...",[1846267743022330183],everyone,1187748790863839232,...,[@Dariusz_Jonski],[],[],0,1,13,0,219,"""Not a step back!"" - you haven't put any forw...","""Not a step back!"" - you haven't put any forw..."
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",W≈Ça≈õnie zadzwoni≈Ç do mnie nieznany numer. Ju≈º ...,[1846264777347117471],everyone,1187748790863839232,...,[],[],[],28,9,418,0,6449,An unknown number just called me. I thought it...,An unknown number just called me. I thought it...
2,1846262693394588160,False,955239446,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@KosiniakKamysz Czy ""jeszcze ciƒô≈ºsza praca"" pr...",[1846262693394588154],everyone,1187748790863839232,...,[@KosiniakKamysz],[],[],4,1,38,0,443,"Will ""even harder work"" translate into the fu...","Will ""even harder work"" translate into the fu..."
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",Polecam siƒô zapoznaƒá z tre≈õciƒÖ konferencji üëá ...,[1846261341327446163],everyone,1187748790863839232,...,[],[https://t.co/U7LAeL2cqP],[],9,2,48,0,687,I recommend reading the content of the confere...,I recommend reading the content of the confere...
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",‚ùå Kilka dni temu zag≈Çosowa≈Çem za odwo≈Çaniem K...,[1846104865829015639],everyone,1187748790863839232,...,[],"[https://t.co/JQAQMEKjEs, https://t.co/4qtpUJO...",[],30,0,141,0,3354,"‚ùå A few days ago, I voted to dismiss Krzyszto...",":cross_mark: A few days ago, I voted to dismi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11452,1721611648257921024,0,,2023-11-06 19:32:57+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",Prezydent RP jest gwarantem ciƒÖg≈Ço≈õci w≈Çadzy p...,[1721611648257921084],everyone,964017524,...,[],[],[],17,6,53,1,4871,The President of the Republic of Poland is the...,The President of the Republic of Poland is the...
11453,1720163035048706304,0,,2023-11-02 19:36:41+00:00,"[{'type': 'quoted', 'id': '1720122294914187659'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",W pa≈Ñstwach o ustabilizowanej demokracji si≈Ça ...,[1720163035048706306],everyone,964017524,...,[],[https://t.co/u1vzB6ImaB],[],18,2,41,2,3152,"In countries with stable democracy, the streng...","In countries with stable democracy, the streng..."
11454,1716897815400792320,0,,2023-10-24 19:21:52+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",Piƒôkny epilog kampanii wyborczej:\n ‚úÖ Ô∏è @Rober...,[1716897815400792400],everyone,964017524,...,"[@RobertTelus, @KosiniakKamysz]",[https://t.co/MGuhesegb2],[#TrzeciaDroga],26,7,97,1,5674,A beautiful epilogue of the election campaign:...,A beautiful epilogue of the election campaign:...
11455,1714387831425052928,0,3370515933,2023-10-17 21:08:05+00:00,"[{'type': 'replied_to', 'id': '171438631727710...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",@motykamilosz Jak widaƒá nie tylko w piosenkach...,[1714387831425052856],everyone,964017524,...,[@motykamilosz],[],[],1,2,50,0,1301,"As you can see, not only in Eleni's songs ""Lo...","As you can see, not only in Eleni's songs ""Lo..."


In [31]:
df['possibly_sensitive'] = df['possibly_sensitive'].astype(bool)

In [32]:
username_to_realname = {
    'bartlomiejpejo': 'Bart≈Çomiej Pejo',
    'RobertBiedron': 'Robert Biedro≈Ñ',
    'PatrykJaki': 'Patryk Jaki',
    'Kpelczynska': 'Katarzyna Pelczy≈Ñska',
    'OklaDrewnowicz': 'Agnieszka Ok≈Ça-Drewnowicz',
    'KosiniakKamysz': 'W≈Çadys≈Çaw Kosiniak-Kamysz',
    'mwojcik_': 'Micha≈Ç W√≥jcik',
    'MorawieckiM': 'Mateusz Morawiecki',
    'SlawomirMentzen': 'S≈Çawomir Mentzen',
    'Wlodek_Skalik': 'W≈Çodzimierz Skalik',
    'BeataSzydlo': 'Beata Szyd≈Ço',
    'WTumanowicz': 'Witold Tumanowicz',
    'KGawkowski': 'Krzysztof Gawkowski',
    'wlodekczarzasty': 'W≈Çodzimierz Czarzasty',
    'Kaminski_M_': 'Mariusz Kami≈Ñski',
    'Macierewicz_A': 'Antoni Macierewicz',
    'elzbietawitek': 'El≈ºbieta Witek',
    'aga_buczynska': 'Agnieszka Buczy≈Ñska',
    'szymon_holownia': 'Szymon Ho≈Çownia',
    'DorotaNiedziela': 'Dorota Niedziela',
    'EwaKopacz': 'Ewa Kopacz',
    'Leszczyna': 'Izabela Leszczyna',
    'M_K_Blonska': 'Ma≈Çgorzata Kidawa-B≈Ço≈Ñska',
    'bbudka': 'Borys Budka',
    'donaldtusk': 'Donald Tusk',
    'DariuszKlimczak': 'Dariusz Klimczak',
    'GrzybAndrzej': 'Andrzej Grzyb',
    'Hetman_K': 'Krzysztof Hetman',
    'JarubasAdam': 'Adam Jarubas',
    'Paslawska': 'Urszula Pas≈Çawska',
    'TudujKrzysztof': 'Krzysztof Tuduj',
    'ZalewskiPawel': 'Pawe≈Ç Zalewski'
}

# Add the 'name' column to the dataframe
df['name'] = df['username'].map(username_to_realname)

In [33]:
# Delete next line sign from the 'text_clean_en' column
df['text_clean_en'] = df['text_clean_en'].str.replace('\n', ' ')

In [34]:
# Save the DataFrame to a Parquet file
df.to_parquet('cleaned_data/df_combined.parquet', index=False)