# Data cleaning

In this file, we cleaned the downloaded data:
The main steps include:
1. Adding party affiliation to tweet rows
2. Deleting unnecessary downloaded Retweets.
3. Deleting links and mentions from the tweets text and saving them to separate columns
4. Expanding the column of public metrics
5. Encoding emojis in a unified format
6. Translating tweets using Google Translate in Google Sheets
7. Saving all downloaded tweets to one file

### 1.  Used libraries

In [35]:
import os
import pandas as pd
import re
import emoji

### 2. Reading JSON files and transforming them into party-specific pickle files

In [37]:
base_input_path = 'data/tweets_data_final'
subfolders = ['Konfederacja', 'NL', 'PIS', 'PO', 'PL2050', 'PSL']
output_folder = 'data/tweets_data_combined'

os.makedirs(output_folder, exist_ok=True)

for subfolder in subfolders:
    folder_path = os.path.join(base_input_path, subfolder)
    dataframes = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            politician = filename.split("_tweets.json")[0]
            try:
                df = pd.read_json(file_path)  
                df["username"] = politician  
                df["party"] = subfolder
                print(f"Read {len(df)} rows from {file_path}")  
                dataframes.append(df)
            except ValueError as e:
                print(f"Error reading {file_path}: {e}")
    
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        
        output_file_path = os.path.join(output_folder, f'{subfolder}_combined.pkl')
        combined_df.to_pickle(output_file_path) 
        
        print(f"Saved {subfolder} combined data to {output_file_path}")

print("Processing complete!")

Read 320 rows from data/tweets_data_final/Konfederacja/placzekgrzegorz_2024-04-16_2024-10-15.json
Read 597 rows from data/tweets_data_final/Konfederacja/MichalWawer_2023-10-16_2024-10-15.json
Read 1318 rows from data/tweets_data_final/Konfederacja/KonradBerkowicz_2024-04-16_2024-10-15_vol1 (1).json
Read 950 rows from data/tweets_data_final/Konfederacja/Wlodek_Skalik_2023-10-16_2024-10-15.json
Read 721 rows from data/tweets_data_final/Konfederacja/SlawomirMentzen_2023-10-16_2024-10-15.json
Read 889 rows from data/tweets_data_final/Konfederacja/GrzegorzBraun__2023-10-16_2024-10-15.json
Read 175 rows from data/tweets_data_final/Konfederacja/TudujKrzysztof_2023-10-16_2024-10-15.json
Read 964 rows from data/tweets_data_final/Konfederacja/bartlomiejpejo_2023-10-16_2024-10-15.json
Read 421 rows from data/tweets_data_final/Konfederacja/placzekgrzegorz_2023-10-16_2024-04-15.json
Read 772 rows from data/tweets_data_final/Konfederacja/MarSypniewski_2023-10-16_2024-10-15.json
Read 289 rows from da

### 3. Data cleaning

In [38]:
df_konf = pd.read_pickle(os.path.join(output_folder, 'Konfederacja_combined.pkl'))
df_NL = pd.read_pickle(os.path.join(output_folder, 'NL_combined.pkl'))
df_PIS = pd.read_pickle(os.path.join(output_folder, 'PIS_combined.pkl'))
df_PO = pd.read_pickle(os.path.join(output_folder, 'PO_combined.pkl'))
df_PL2050 = pd.read_pickle(os.path.join(output_folder, 'PL2050_combined.pkl'))
df_PSL = pd.read_pickle(os.path.join(output_folder, 'PSL_combined.pkl'))

In [39]:
df_konf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8177 entries, 0 to 8176
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   public_metrics          8177 non-null   object             
 1   reply_settings          8177 non-null   object             
 2   entities                7489 non-null   object             
 3   created_at              8177 non-null   datetime64[ns, UTC]
 4   attachments             4073 non-null   object             
 5   edit_controls           8177 non-null   object             
 6   author_id               8177 non-null   int64              
 7   edit_history_tweet_ids  8177 non-null   object             
 8   lang                    8177 non-null   object             
 9   possibly_sensitive      8177 non-null   bool               
 10  id                      8177 non-null   int64              
 11  conversation_id         8177 non-null   int

In [40]:
# Merge all dataframes into one
df = pd.concat([df_konf, df_NL, df_PIS, df_PO, df_PL2050, df_PSL], ignore_index=True)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26677 entries, 0 to 26676
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   public_metrics          26677 non-null  object             
 1   reply_settings          26677 non-null  object             
 2   entities                23437 non-null  object             
 3   created_at              26677 non-null  datetime64[ns, UTC]
 4   attachments             11931 non-null  object             
 5   edit_controls           26661 non-null  object             
 6   author_id               26677 non-null  float64            
 7   edit_history_tweet_ids  26661 non-null  object             
 8   lang                    26677 non-null  object             
 9   possibly_sensitive      26661 non-null  object             
 10  id                      26661 non-null  float64            
 11  conversation_id         26661 non-null  f

In [42]:
pd.options.display.float_format = '{:.0f}'.format
df['id'] = df['id'].fillna(0).astype('int64')
df['id']

0        1846086999964283136
1        1845748090461966592
2        1845366606823657984
3        1845006197847360000
4        1844633149784891648
                ...         
26672    1717638951333245440
26673    1717063007988052224
26674    1716417809469538560
26675    1714372612611060224
26676    1714372612611060224
Name: id, Length: 26677, dtype: int64

In [43]:
df['id'].nunique()

26548

We need to remove duplicate tweets because our custom downloading loop occasionally downloads the same tweet two times to ensure completeness.

In [44]:
# Remove duplicates from the dataframe based on specific columns
df.drop_duplicates(subset=['id'], inplace=True)

In [45]:
df.head()

Unnamed: 0,public_metrics,reply_settings,entities,created_at,attachments,edit_controls,author_id,edit_history_tweet_ids,lang,possibly_sensitive,id,conversation_id,text,category,context_annotations,in_reply_to_user_id,referenced_tweets,username,party,geo
0,"{'retweet_count': 230, 'reply_count': 61, 'lik...",everyone,"{'urls': [{'start': 277, 'end': 300, 'url': 'h...",2024-10-15 07:13:34+00:00,"{'media_keys': ['3_1846083966849159168', '3_18...","{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1846086770229694583, 1846086999964283214]",pl,False,1846086999964283136,1846086999964283136,❌ Rząd polski zamierza budować w Polsce 49 Cen...,Original,,,,placzekgrzegorz_2024-04-16_2024-10-15.json,Konfederacja,
1,"{'retweet_count': 1301, 'reply_count': 169, 'l...",everyone,"{'urls': [{'start': 276, 'end': 299, 'url': 'h...",2024-10-14 08:46:51+00:00,,"{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1845747336862961872, 1845748090461966651]",pl,False,1845748090461966592,1845748090461966592,❌ Szambo wybija i robi się coraz ciekawiej. Na...,Original,,,,placzekgrzegorz_2024-04-16_2024-10-15.json,Konfederacja,
2,"{'retweet_count': 682, 'reply_count': 145, 'li...",everyone,"{'urls': [{'start': 279, 'end': 302, 'url': 'h...",2024-10-13 07:30:58+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845366606823657982],pl,False,1845366606823657984,1845366606823657984,"❌ NIE ROZUMIEM, JAK MOŻNA KRZYWDZIĆ W TEN SPOS...",Original,"[{'domain': {'id': '10', 'name': 'Person', 'de...",,,placzekgrzegorz_2024-04-16_2024-10-15.json,Konfederacja,
3,"{'retweet_count': 271, 'reply_count': 56, 'lik...",everyone,"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",2024-10-12 07:38:50+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845006197847359885],pl,False,1845006197847360000,1845006197847360000,🆘 Firma farmaceutyczna GSK zapłaci ponad 2 ml...,Original,,,,placzekgrzegorz_2024-04-16_2024-10-15.json,Konfederacja,
4,"{'retweet_count': 214, 'reply_count': 56, 'lik...",everyone,"{'urls': [{'start': 281, 'end': 304, 'url': 'h...",2024-10-11 06:56:29+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1844633149784891665],pl,False,1844633149784891648,1844633149784891648,❌ O CO TUTAJ CHODZI? W październiku 2024 r. sz...,Original,,,,placzekgrzegorz_2024-04-16_2024-10-15.json,Konfederacja,


We need to delete retweets because they are wrongly provided by the X API. We want to analyze only original tweets, replies, and quotes.

In [46]:
df = df[df['category'] != 'Retweet']

In [47]:
# Update the 'username' column to keep only the string until '_2'
df['username'] = df['username'].str.split('_2').str[0]

In [48]:
category_summary = df['category'].value_counts()
print(category_summary)
total_tweets = category_summary.sum()
print(f"Total tweets: {total_tweets}")

category
Original    17595
Reply        5641
Quote        2826
Name: count, dtype: int64
Total tweets: 26062


In [49]:
# Ensure the created_at column is in datetime format
df['created_at'] = pd.to_datetime(df['created_at'])

In [50]:
df.head()

Unnamed: 0,public_metrics,reply_settings,entities,created_at,attachments,edit_controls,author_id,edit_history_tweet_ids,lang,possibly_sensitive,id,conversation_id,text,category,context_annotations,in_reply_to_user_id,referenced_tweets,username,party,geo
0,"{'retweet_count': 230, 'reply_count': 61, 'lik...",everyone,"{'urls': [{'start': 277, 'end': 300, 'url': 'h...",2024-10-15 07:13:34+00:00,"{'media_keys': ['3_1846083966849159168', '3_18...","{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1846086770229694583, 1846086999964283214]",pl,False,1846086999964283136,1846086999964283136,❌ Rząd polski zamierza budować w Polsce 49 Cen...,Original,,,,placzekgrzegorz,Konfederacja,
1,"{'retweet_count': 1301, 'reply_count': 169, 'l...",everyone,"{'urls': [{'start': 276, 'end': 299, 'url': 'h...",2024-10-14 08:46:51+00:00,,"{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1845747336862961872, 1845748090461966651]",pl,False,1845748090461966592,1845748090461966592,❌ Szambo wybija i robi się coraz ciekawiej. Na...,Original,,,,placzekgrzegorz,Konfederacja,
2,"{'retweet_count': 682, 'reply_count': 145, 'li...",everyone,"{'urls': [{'start': 279, 'end': 302, 'url': 'h...",2024-10-13 07:30:58+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845366606823657982],pl,False,1845366606823657984,1845366606823657984,"❌ NIE ROZUMIEM, JAK MOŻNA KRZYWDZIĆ W TEN SPOS...",Original,"[{'domain': {'id': '10', 'name': 'Person', 'de...",,,placzekgrzegorz,Konfederacja,
3,"{'retweet_count': 271, 'reply_count': 56, 'lik...",everyone,"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",2024-10-12 07:38:50+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845006197847359885],pl,False,1845006197847360000,1845006197847360000,🆘 Firma farmaceutyczna GSK zapłaci ponad 2 ml...,Original,,,,placzekgrzegorz,Konfederacja,
4,"{'retweet_count': 214, 'reply_count': 56, 'lik...",everyone,"{'urls': [{'start': 281, 'end': 304, 'url': 'h...",2024-10-11 06:56:29+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1844633149784891665],pl,False,1844633149784891648,1844633149784891648,❌ O CO TUTAJ CHODZI? W październiku 2024 r. sz...,Original,,,,placzekgrzegorz,Konfederacja,


In [51]:
print(df.loc[1, 'text'])

❌ Szambo wybija i robi się coraz ciekawiej. Na światło dzienne wychodzą bowiem coraz to nowe fakty. Otóż wytyczne funkcjonowania 49 Centrów Integracji Cudzoziemców (CIC) przewidują dla cudzoziemców w całej Polsce między innymi… zatrudnianie OSOBISTYCH ASYSTENTÓW w urzędach,… https://t.co/OZyTSwcMAb


In [52]:
def add_space_around_emojis(text):
    return ''.join(f' {char} ' if char in emoji.EMOJI_DATA or re.match(r'[\U0001F1E6-\U0001F1FF]', char) else char for char in text)

df['text'] = df['text'].apply(add_space_around_emojis)

def clean_text(text):
    mentions = re.findall(r'@\w+', text)
    text = re.sub(r'@\w+', '', text)
    links = re.findall(r'http\S+', text)
    text = re.sub(r'http\S+', '', text)
    hashtags = re.findall(r'#\w+', text)
    text = re.sub(r'(?<!\s)([\U0001F600-\U0001F64F])', r' \1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F])(?!\s)', r'\1 ', text)
    return [text, mentions, links, hashtags]

df[['text_clean', 'mentions', 'links', 'hashtags']] = pd.DataFrame(df['text'].apply(clean_text).tolist(), index=df.index)

In [53]:
df.head()

Unnamed: 0,public_metrics,reply_settings,entities,created_at,attachments,edit_controls,author_id,edit_history_tweet_ids,lang,possibly_sensitive,...,context_annotations,in_reply_to_user_id,referenced_tweets,username,party,geo,text_clean,mentions,links,hashtags
0,"{'retweet_count': 230, 'reply_count': 61, 'lik...",everyone,"{'urls': [{'start': 277, 'end': 300, 'url': 'h...",2024-10-15 07:13:34+00:00,"{'media_keys': ['3_1846083966849159168', '3_18...","{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1846086770229694583, 1846086999964283214]",pl,False,...,,,,placzekgrzegorz,Konfederacja,,❌ Rząd polski zamierza budować w Polsce 49 C...,[],"[https://t.co/gL3O8F0ITB, https://t.co/cay37TX...",[]
1,"{'retweet_count': 1301, 'reply_count': 169, 'l...",everyone,"{'urls': [{'start': 276, 'end': 299, 'url': 'h...",2024-10-14 08:46:51+00:00,,"{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1845747336862961872, 1845748090461966651]",pl,False,...,,,,placzekgrzegorz,Konfederacja,,❌ Szambo wybija i robi się coraz ciekawiej. ...,[],[https://t.co/OZyTSwcMAb],[]
2,"{'retweet_count': 682, 'reply_count': 145, 'li...",everyone,"{'urls': [{'start': 279, 'end': 302, 'url': 'h...",2024-10-13 07:30:58+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845366606823657982],pl,False,...,"[{'domain': {'id': '10', 'name': 'Person', 'de...",,,placzekgrzegorz,Konfederacja,,"❌ NIE ROZUMIEM, JAK MOŻNA KRZYWDZIĆ W TEN SP...",[@donaldtusk],[https://t.co/rIGkIpR8sw],[#RadaKrajowaKO]
3,"{'retweet_count': 271, 'reply_count': 56, 'lik...",everyone,"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",2024-10-12 07:38:50+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845006197847359885],pl,False,...,,,,placzekgrzegorz,Konfederacja,,🆘 Firma farmaceutyczna GSK zapłaci ponad 2 ...,[],[https://t.co/cwusG1221F],[]
4,"{'retweet_count': 214, 'reply_count': 56, 'lik...",everyone,"{'urls': [{'start': 281, 'end': 304, 'url': 'h...",2024-10-11 06:56:29+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1844633149784891665],pl,False,...,,,,placzekgrzegorz,Konfederacja,,❌ O CO TUTAJ CHODZI? W październiku 2024 r. ...,"[@MZ_GOV_PL, @Leszczyna, @NFZ_GOV_PL]",[https://t.co/RkylDcUHbo],[]


In [54]:
df.drop(columns=['entities'], inplace=True)

In [55]:
df['retweet_count'] = df['public_metrics'].apply(lambda x: x['retweet_count'])
df['reply_count'] = df['public_metrics'].apply(lambda x: x['reply_count'])
df['like_count'] = df['public_metrics'].apply(lambda x: x['like_count'])
df['quote_count'] = df['public_metrics'].apply(lambda x: x['quote_count'])
df['impression_count'] = df['public_metrics'].apply(lambda x: x['impression_count'])

df.drop(columns=['public_metrics'], inplace=True)

In [56]:
df

Unnamed: 0,reply_settings,created_at,attachments,edit_controls,author_id,edit_history_tweet_ids,lang,possibly_sensitive,id,conversation_id,...,geo,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count
0,everyone,2024-10-15 07:13:34+00:00,"{'media_keys': ['3_1846083966849159168', '3_18...","{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1846086770229694583, 1846086999964283214]",pl,False,1846086999964283136,1846086999964283136,...,,❌ Rząd polski zamierza budować w Polsce 49 C...,[],"[https://t.co/gL3O8F0ITB, https://t.co/cay37TX...",[],230,61,644,7,11648
1,everyone,2024-10-14 08:46:51+00:00,,"{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1845747336862961872, 1845748090461966651]",pl,False,1845748090461966592,1845748090461966592,...,,❌ Szambo wybija i robi się coraz ciekawiej. ...,[],[https://t.co/OZyTSwcMAb],[],1301,169,3845,57,146584
2,everyone,2024-10-13 07:30:58+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845366606823657982],pl,False,1845366606823657984,1845366606823657984,...,,"❌ NIE ROZUMIEM, JAK MOŻNA KRZYWDZIĆ W TEN SP...",[@donaldtusk],[https://t.co/rIGkIpR8sw],[#RadaKrajowaKO],682,145,2061,28,100757
3,everyone,2024-10-12 07:38:50+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845006197847359885],pl,False,1845006197847360000,1845006197847360000,...,,🆘 Firma farmaceutyczna GSK zapłaci ponad 2 ...,[],[https://t.co/cwusG1221F],[],271,56,989,8,30769
4,everyone,2024-10-11 06:56:29+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1844633149784891665],pl,False,1844633149784891648,1844633149784891648,...,,❌ O CO TUTAJ CHODZI? W październiku 2024 r. ...,"[@MZ_GOV_PL, @Leszczyna, @NFZ_GOV_PL]",[https://t.co/RkylDcUHbo],[],214,56,678,9,17432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26671,everyone,2023-10-27 04:07:40+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",1201875318,[1717754912929435789],pl,0,1717754912929435904,1717638951333245440,...,{'place_id': '47c001064da7125c'},"Gdyby coś się zmieniło, jestem do dyspozycji ...",[@Speranza7Andrea],[],[],0,1,1,0,53
26672,everyone,2023-10-26 20:26:52+00:00,{'media_keys': ['3_1717638945809252352']},"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1201875318,[1717638951333245364],pl,0,1717638951333245440,1717638951333245440,...,,Zaświadczenie o wyborze na Posła na Sejm RP 🇵...,[],[https://t.co/Dl8oj3iPCW],[],9,13,91,0,3191
26673,everyone,2023-10-25 06:18:17+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1201875318,[1717063007988052299],pl,0,1717063007988052224,1717063007988052224,...,{'place_id': '47c001064da7125c'},Ceny paliw na stacjach Orlen rosną w szybkim t...,[],[],[],17,11,39,2,1977
26674,everyone,2023-10-23 11:34:29+00:00,{'media_keys': ['7_1716417715710156800']},"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1201875318,[1716417809469538514],pl,0,1716417809469538560,1716417809469538560,...,{'place_id': '535f0c2de0121451'},Taki mandat to ja rozumiem 😃 \nDziękuję za każ...,[],[https://t.co/de5flsSkCd],[],38,14,177,6,28476


In [57]:
df['id'] = df['id'].astype('int64')

In [None]:
df_clean_text = df[['id', 'text_clean']]

In [60]:
df_clean_text = df_clean_text[df_clean_text['text_clean'].notna() & df_clean_text['text_clean'].str.strip().ne('')]

# Reset index if needed
df_clean_text = df_clean_text.reset_index(drop=True)

In [61]:
df_clean_text.to_csv('data_for_translation.csv', index=False)

In [69]:
df_en_text = pd.read_csv('tweets_translation/text_clean_en1.csv')
df_en_text.head()


Unnamed: 0,id,text_clean,text_clean_en
0,1846086999964283136,❌ Rząd polski zamierza budować w Polsce 49 C...,❌ The Polish government intends to build 49 F...
1,1845748090461966592,❌ Szambo wybija i robi się coraz ciekawiej. ...,❌ The cesspool is breaking out and it's getti...
2,1845366606823657984,"❌ NIE ROZUMIEM, JAK MOŻNA KRZYWDZIĆ W TEN SP...",❌ I DON'T UNDERSTAND HOW YOU CAN HURT YOUR OW...
3,1845006197847360000,🆘 Firma farmaceutyczna GSK zapłaci ponad 2 ...,🆘 The pharmaceutical company GSK will pay ove...
4,1844633149784891648,❌ O CO TUTAJ CHODZI? W październiku 2024 r. ...,"❌ WHAT IS GOING ON HERE? In October 2024, her..."


In [64]:
#df_en_text["id"] = df_en_text["id"].apply(lambda x: int(float(x.replace(',', ''))))

In [74]:
if 'text_clean_en' in df_en_text.columns:
    # Ensure 'id' is the same type in both DataFrames
    df['id'] = df['id'].astype(str)
    df_en_text['id'] = df_en_text['id'].astype(str)

    # Perform the merge
    df = df.merge(df_en_text[['id', 'text_clean_en']], on='id', how='left')

    display(df.head())
else:
    print("Column 'text_clean_en' does not exist in df_en_text")

Unnamed: 0,reply_settings,created_at,attachments,edit_controls,author_id,edit_history_tweet_ids,lang,possibly_sensitive,id,conversation_id,...,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en
0,everyone,2024-10-15 07:13:34+00:00,"{'media_keys': ['3_1846083966849159168', '3_18...","{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1846086770229694583, 1846086999964283214]",pl,False,1846086999964283136,1846086999964283136,...,❌ Rząd polski zamierza budować w Polsce 49 C...,[],"[https://t.co/gL3O8F0ITB, https://t.co/cay37TX...",[],230,61,644,7,11648,❌ The Polish government intends to build 49 F...
1,everyone,2024-10-14 08:46:51+00:00,,"{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1845747336862961872, 1845748090461966651]",pl,False,1845748090461966592,1845748090461966592,...,❌ Szambo wybija i robi się coraz ciekawiej. ...,[],[https://t.co/OZyTSwcMAb],[],1301,169,3845,57,146584,❌ The cesspool is breaking out and it's getti...
2,everyone,2024-10-13 07:30:58+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845366606823657982],pl,False,1845366606823657984,1845366606823657984,...,"❌ NIE ROZUMIEM, JAK MOŻNA KRZYWDZIĆ W TEN SP...",[@donaldtusk],[https://t.co/rIGkIpR8sw],[#RadaKrajowaKO],682,145,2061,28,100757,❌ I DON'T UNDERSTAND HOW YOU CAN HURT YOUR OW...
3,everyone,2024-10-12 07:38:50+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845006197847359885],pl,False,1845006197847360000,1845006197847360000,...,🆘 Firma farmaceutyczna GSK zapłaci ponad 2 ...,[],[https://t.co/cwusG1221F],[],271,56,989,8,30769,🆘 The pharmaceutical company GSK will pay ove...
4,everyone,2024-10-11 06:56:29+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1844633149784891665],pl,False,1844633149784891648,1844633149784891648,...,❌ O CO TUTAJ CHODZI? W październiku 2024 r. ...,"[@MZ_GOV_PL, @Leszczyna, @NFZ_GOV_PL]",[https://t.co/RkylDcUHbo],[],214,56,678,9,17432,"❌ WHAT IS GOING ON HERE? In October 2024, her..."


In [28]:
# Replace '#VALUE!' with NaN in 'text_clean_en' column
#df['text_clean_en'] = df['text_clean_en'].replace('#VALUE!', pd.NA)

In [75]:
df['text_clean_en_demojized'] = df['text_clean_en'].apply(lambda x: emoji.demojize(str(x)) if pd.notnull(x) else x)

df[['text_clean_en', 'text_clean_en_demojized']].head()

Unnamed: 0,text_clean_en,text_clean_en_demojized
0,❌ The Polish government intends to build 49 F...,:cross_mark: The Polish government intends to...
1,❌ The cesspool is breaking out and it's getti...,:cross_mark: The cesspool is breaking out and...
2,❌ I DON'T UNDERSTAND HOW YOU CAN HURT YOUR OW...,:cross_mark: I DON'T UNDERSTAND HOW YOU CAN H...
3,🆘 The pharmaceutical company GSK will pay ove...,:SOS_button: The pharmaceutical company GSK w...
4,"❌ WHAT IS GOING ON HERE? In October 2024, her...",:cross_mark: WHAT IS GOING ON HERE? In Octobe...


In [76]:
df

Unnamed: 0,reply_settings,created_at,attachments,edit_controls,author_id,edit_history_tweet_ids,lang,possibly_sensitive,id,conversation_id,...,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en,text_clean_en_demojized
0,everyone,2024-10-15 07:13:34+00:00,"{'media_keys': ['3_1846083966849159168', '3_18...","{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1846086770229694583, 1846086999964283214]",pl,False,1846086999964283136,1846086999964283136,...,[],"[https://t.co/gL3O8F0ITB, https://t.co/cay37TX...",[],230,61,644,7,11648,❌ The Polish government intends to build 49 F...,:cross_mark: The Polish government intends to...
1,everyone,2024-10-14 08:46:51+00:00,,"{'edits_remaining': 4, 'is_edit_eligible': Tru...",1284852220593414144,"[1845747336862961872, 1845748090461966651]",pl,False,1845748090461966592,1845748090461966592,...,[],[https://t.co/OZyTSwcMAb],[],1301,169,3845,57,146584,❌ The cesspool is breaking out and it's getti...,:cross_mark: The cesspool is breaking out and...
2,everyone,2024-10-13 07:30:58+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845366606823657982],pl,False,1845366606823657984,1845366606823657984,...,[@donaldtusk],[https://t.co/rIGkIpR8sw],[#RadaKrajowaKO],682,145,2061,28,100757,❌ I DON'T UNDERSTAND HOW YOU CAN HURT YOUR OW...,:cross_mark: I DON'T UNDERSTAND HOW YOU CAN H...
3,everyone,2024-10-12 07:38:50+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1845006197847359885],pl,False,1845006197847360000,1845006197847360000,...,[],[https://t.co/cwusG1221F],[],271,56,989,8,30769,🆘 The pharmaceutical company GSK will pay ove...,:SOS_button: The pharmaceutical company GSK w...
4,everyone,2024-10-11 06:56:29+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1284852220593414144,[1844633149784891665],pl,False,1844633149784891648,1844633149784891648,...,"[@MZ_GOV_PL, @Leszczyna, @NFZ_GOV_PL]",[https://t.co/RkylDcUHbo],[],214,56,678,9,17432,"❌ WHAT IS GOING ON HERE? In October 2024, her...",:cross_mark: WHAT IS GOING ON HERE? In Octobe...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26057,everyone,2023-10-27 04:07:40+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",1201875318,[1717754912929435789],pl,0,1717754912929435904,1717638951333245440,...,[@Speranza7Andrea],[],[],0,1,1,0,53,"If anything changes, I am at your disposal 😄 👍","If anything changes, I am at your disposal :g..."
26058,everyone,2023-10-26 20:26:52+00:00,{'media_keys': ['3_1717638945809252352']},"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1201875318,[1717638951333245364],pl,0,1717638951333245440,1717638951333245440,...,[],[https://t.co/Dl8oj3iPCW],[],9,13,91,0,3191,I also have a certificate of election as a Mem...,I also have a certificate of election as a Mem...
26059,everyone,2023-10-25 06:18:17+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1201875318,[1717063007988052299],pl,0,1717063007988052224,1717063007988052224,...,[],[],[],17,11,39,2,1977,Fuel prices at Orlen stations are rising rapid...,Fuel prices at Orlen stations are rising rapid...
26060,everyone,2023-10-23 11:34:29+00:00,{'media_keys': ['7_1716417715710156800']},"{'edits_remaining': 5, 'is_edit_eligible': Tru...",1201875318,[1716417809469538514],pl,0,1716417809469538560,1716417809469538560,...,[],[https://t.co/de5flsSkCd],[],38,14,177,6,28476,I understand this mandate 😃 \nThank you for ev...,I understand this mandate :grinning_face_with_...


In [77]:
df['possibly_sensitive'] = df['possibly_sensitive'].astype(bool)

In [78]:
unique_usernames = df['username'].unique()
print(unique_usernames)

['placzekgrzegorz' 'MichalWawer' 'KonradBerkowicz' 'Wlodek_Skalik'
 'SlawomirMentzen' 'GrzegorzBraun_' 'TudujKrzysztof' 'bartlomiejpejo'
 'MarSypniewski' 'Iwaszkiewicz_RJ' 'WTumanowicz' 'RobertBiedron'
 'MoskwaWodnicka' 'poselTTrela' 'KGawkowski' 'AnitaKDZG' 'WandaNowicka'
 'wieczorekdarek' 'PaulinaPW2024' 'AndrzejSzejna' 'wlodekczarzasty'
 'MarcinKulasek' 'K_Smiszek' 'JoankaSW' 'mblaszczak' 'MorawieckiM'
 'Kowalczyk_H' 'elzbietawitek' 'BeataSzydlo' 'PatrykJaki' 'mwojcik_'
 'Kaminski_M_' 'Macierewicz_A' 'trzaskowski_' 'EwaKopacz' 'Leszczyna'
 'M_K_Blonska' 'bbudka' 'Konwinski_PO' 'OklaDrewnowicz' 'CTomczyk'
 'MKierwinski' 'DorotaNiedziela' 'JanGrabiec' 'donaldtusk' 'SlizPawel'
 'Kpelczynska' 'LukaszOsmalak' 'AgaBaranowskaPL' 'hennigkloska'
 'ZalewskiPawel' 'ZywnoMaciej' 'joannamucha' 'szymon_holownia'
 'aga_buczynska' 'GrzybAndrzej' 'PZgorzelskiP' 'KosiniakKamysz'
 'Paslawska' 'Hetman_K' 'JarubasAdam' 'DariuszKlimczak' 'StefanKrajewski']


In [79]:
username_to_realname = {
    'bartlomiejpejo': 'Bartłomiej Pejo',
    'MichalWawer': 'Michał Wawer',
    'SlawomirMentzen': 'Sławomir Mentzen',
    'GrzegorzBraun_': 'Grzegorz Braun',
    'placzekgrzegorz': 'Grzegorz Placzek',
    'Wlodek_Skalik': 'Włodzimierz Skalik',  
    'KonradBerkowicz': 'Konrad Berkowicz',
    'RobertBiedron': 'Robert Biedroń',
    'PatrykJaki': 'Patryk Jaki',
    'Kpelczynska': 'Katarzyna Pełczyńska',
    'OklaDrewnowicz': 'Agnieszka Okła-Drewnowicz',
    'KosiniakKamysz': 'Władysław Kosiniak-Kamysz',
    'mwojcik_': 'Michał Wójcik',
    'MorawieckiM': 'Mateusz Morawiecki',
    'BeataSzydlo': 'Beata Szydło',
    'WTumanowicz': 'Witold Tumanowicz',
    'KGawkowski': 'Krzysztof Gawkowski',
    'wlodekczarzasty': 'Włodzimierz Czarzasty',
    'Kaminski_M_': 'Mariusz Kamiński',
    'Macierewicz_A': 'Antoni Macierewicz',
    'elzbietawitek': 'Elżbieta Witek',
    'aga_buczynska': 'Agnieszka Buczyńska',
    'szymon_holownia': 'Szymon Hołownia',
    'DorotaNiedziela': 'Dorota Niedziela',
    'EwaKopacz': 'Ewa Kopacz',
    'Leszczyna': 'Izabela Leszczyna',
    'M_K_Blonska': 'Małgorzata Kidawa-Błońska',
    'bbudka': 'Borys Budka',
    'donaldtusk': 'Donald Tusk',
    'DariuszKlimczak': 'Dariusz Klimczak',
    'GrzybAndrzej': 'Andrzej Grzyb',
    'Hetman_K': 'Krzysztof Hetman',
    'JarubasAdam': 'Adam Jarubas',
    'Paslawska': 'Urszula Pasławska',
    'TudujKrzysztof': 'Krzysztof Tuduj',
    'ZalewskiPawel': 'Paweł Zalewski',
    'Iwaszkiewicz_RJ': 'Ryszard Iwaszkiewicz',
    'MarSypniewski': 'Marcin Sypniewski',
    'PaulinaPW2024': 'Paulina Piechna-Więckiewicz',
    'AndrzejSzejna': 'Andrzej Szejna',
    'MarcinKulasek': 'Marcin Kulasek',
    'K_Smiszek': 'Krzysztof Śmiszek',
    'JoankaSW': 'Joanna Scheuring-Wielgus',
    'mblaszczak': 'Mariusz Błaszczak',
    'Kowalczyk_H': 'Henryk Kowalczyk',
    'trzaskowski_': 'Rafał Trzaskowski',
    'Konwinski_PO': 'Zbigniew Konwiński',
    'CTomczyk': 'Cezary Tomczyk',
    'MKierwinski': 'Marcin Kierwiński',
    'JanGrabiec': 'Jan Grabiec',
    'SlizPawel': 'Paweł Śliz',
    'LukaszOsmalak': 'Łukasz Osmalak',
    'AgaBaranowskaPL': 'Agnieszka Baranowska',
    'hennigkloska': 'Paulina Hennig-Kloska',
    'ZywnoMaciej': 'Maciej Żywno',
    'joannamucha': 'Joanna Mucha',
    'PZgorzelskiP': 'Piotr Zgorzelski',
    'StefanKrajewski': 'Stefan Krajewski',
    'wieczorekdarek': 'Dariusz Wieczorek',
    'WandaNowicka': 'Wanda Nowicka',
    'AnitaKDZG': 'Anita Kucharska-Dziedzic',
    'poselTTrela': 'Tomasz Trela',
    'MoskwaWodnicka': 'Hanna Zdanowska'
}

# Add the 'name' column to the DataFrame
df['name'] = df['username'].map(username_to_realname)

In [80]:
# Delete next line sign from the 'text_clean_en' column
df['text_clean_en'] = df['text_clean_en'].str.replace('\n', ' ')

In [81]:
# Save the DataFrame to a Parquet file
df.to_parquet('cleaned_data/df_combined.parquet', index=False)