# Data cleaning

In this file, we cleaned the downloaded data:
The main steps include:
1. Adding party affiliation to tweet rows
2. Deleting unnecessary downloaded Retweets.
3. Deleting links and mentions from the tweets text and saving them to separate columns
4. Expanding the column of public metrics
5. Encoding emojis in a unified format
6. Translating tweets using Google Translate in Google Sheets
7. Saving all downloaded tweets to one file

### 1.  Used libraries

In [1]:
import os
import pandas as pd
import re
import emoji

### 2. Reading JSON files and transforming them into party-specific pickle files

In [2]:
base_input_paths = ['data/PoWyborach', 'data/tweets_data_2022']
subfolders = ['Konfederacja', 'NL', 'PIS', 'PO', 'PL2050', 'PSL']
output_folder = 'data/tweets_data_combined'

for subfolder in subfolders:
    dataframes = []
    for base_input_path in base_input_paths:
        folder_path = os.path.join(base_input_path, subfolder)
        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(folder_path, filename)
                politician = filename.split("_tweets.json")[0]
                try:
                    df = pd.read_json(file_path)  
                    df["username"] = politician  
                    df["party"] = subfolder
                    print(f"Read {len(df)} rows from {file_path}")  
                    dataframes.append(df)
                except ValueError as e:
                    print(f"Error reading {file_path}: {e}")
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        
        output_file_path = os.path.join(output_folder, f'{subfolder}_combined.pkl')
        combined_df.to_pickle(output_file_path) 
        
        print(f"Saved {subfolder} combined data to {output_file_path}")

print("Processing complete!")

Read 964 rows from data/PoWyborach\Konfederacja\bartlomiejpejo_2023-10-16_2024-10-15.json
Read 889 rows from data/PoWyborach\Konfederacja\GrzegorzBraun__2023-10-16_2024-10-15.json
Read 11 rows from data/PoWyborach\Konfederacja\Iwaszkiewicz_RJ_2023-10-16_2024-10-15.json
Read 289 rows from data/PoWyborach\Konfederacja\KonradBerkowicz_2023-10-15_2024-04-16_vol2 (1).json
Read 1318 rows from data/PoWyborach\Konfederacja\KonradBerkowicz_2024-04-16_2024-10-15_vol1 (1).json
Read 772 rows from data/PoWyborach\Konfederacja\MarSypniewski_2023-10-16_2024-10-15.json
Read 597 rows from data/PoWyborach\Konfederacja\MichalWawer_2023-10-16_2024-10-15.json
Read 421 rows from data/PoWyborach\Konfederacja\placzekgrzegorz_2023-10-16_2024-04-15.json
Read 320 rows from data/PoWyborach\Konfederacja\placzekgrzegorz_2024-04-16_2024-10-15.json
Read 721 rows from data/PoWyborach\Konfederacja\SlawomirMentzen_2023-10-16_2024-10-15.json
Read 175 rows from data/PoWyborach\Konfederacja\TudujKrzysztof_2023-10-16_2024-1

### 3. Data cleaning

In [3]:
df_konfederacja = pd.read_pickle(os.path.join(output_folder, 'Konfederacja_combined.pkl'))
df_NL = pd.read_pickle(os.path.join(output_folder, 'NL_combined.pkl'))
df_PIS = pd.read_pickle(os.path.join(output_folder, 'PIS_combined.pkl'))
df_PO = pd.read_pickle(os.path.join(output_folder, 'PO_combined.pkl'))
df_PL2050 = pd.read_pickle(os.path.join(output_folder, 'PL2050_combined.pkl'))
df_PSL = pd.read_pickle(os.path.join(output_folder, 'PSL_combined.pkl'))

In [4]:
df_konfederacja.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12651 entries, 0 to 12650
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   public_metrics          12651 non-null  object             
 1   in_reply_to_user_id     3078 non-null   float64            
 2   reply_settings          12651 non-null  object             
 3   author_id               12651 non-null  float64            
 4   context_annotations     1334 non-null   object             
 5   id                      12651 non-null  float64            
 6   text                    12651 non-null  object             
 7   edit_controls           12651 non-null  object             
 8   referenced_tweets       4543 non-null   object             
 9   created_at              12651 non-null  datetime64[ns, UTC]
 10  edit_history_tweet_ids  12651 non-null  object             
 11  lang                    12651 non-null  o

In [5]:
# Merge all dataframes into one
df = pd.concat([df_konfederacja, df_NL, df_PIS, df_PO, df_PL2050, df_PSL], ignore_index=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52787 entries, 0 to 52786
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   public_metrics          52787 non-null  object             
 1   in_reply_to_user_id     10972 non-null  float64            
 2   reply_settings          52787 non-null  object             
 3   author_id               52787 non-null  float64            
 4   context_annotations     5948 non-null   object             
 5   id                      52771 non-null  float64            
 6   text                    52787 non-null  object             
 7   edit_controls           52771 non-null  object             
 8   referenced_tweets       19691 non-null  object             
 9   created_at              52787 non-null  datetime64[ns, UTC]
 10  edit_history_tweet_ids  52771 non-null  object             
 11  lang                    52787 non-null  o

In [7]:
pd.options.display.float_format = '{:.0f}'.format
df['id'] = df['id'].fillna(0).astype('int64')
df['id']

0        1846277256509116672
1        1846222583898784000
2        1846161400328028160
3        1846091824101769472
4        1846075343188144128
                ...         
52782    1701274354145780224
52783    1701273238263742720
52784    1701273238263742720
52785    1697128952131661824
52786    1697128952131661824
Name: id, Length: 52787, dtype: int64

In [8]:
df['id'].nunique()

52543

We need to remove duplicate tweets because our custom downloading loop occasionally downloads the same tweet two times to ensure completeness.

In [9]:
# Remove duplicates from the dataframe based on specific columns
df.drop_duplicates(subset=['id'], inplace=True)
# Remove all rows with id == 0
df = df[df['id'] != 0]

In [10]:
df.head()

Unnamed: 0,public_metrics,in_reply_to_user_id,reply_settings,author_id,context_annotations,id,text,edit_controls,referenced_tweets,created_at,edit_history_tweet_ids,lang,conversation_id,entities,possibly_sensitive,category,attachments,geo,username,party
0,"{'retweet_count': 3, 'reply_count': 1, 'like_c...",375146901.0,everyone,1182211615,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1846277256509116672,"@donaldtusk Niezrealizowanie większości ze ""10...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","[{'type': 'replied_to', 'id': '184609177626996...",2024-10-15 19:49:34+00:00,[1846277256509116623],pl,1846091776269963776,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",False,Reply,,,bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja
1,"{'retweet_count': 9, 'reply_count': 2, 'like_c...",,everyone,1182211615,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1846222583898784000,Rok po wyborach trzeba powiedzieć jedno - nie ...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 16:12:19+00:00,[1846222583898784025],pl,1846222583898784000,"{'urls': [{'start': 100, 'end': 123, 'url': 'h...",False,Original,{'media_keys': ['13_1846222491456282626']},,bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja
2,"{'retweet_count': 4, 'reply_count': 3, 'like_c...",,everyone,1182211615,,1846161400328028160,"❌Mamy rok po wyborach, a Polska pogrąża się w ...","{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 12:09:12+00:00,[1846161400328028272],pl,1846161400328028160,"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",False,Original,{'media_keys': ['3_1846148786910810112']},,bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja
3,"{'retweet_count': 6, 'reply_count': 2, 'like_c...",,everyone,1182211615,,1846091824101769472,Mija rok od wyborów parlamentarnych. W kampani...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 07:32:44+00:00,[1846091824101769490],pl,1846091824101769472,"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",False,Original,{'media_keys': ['3_1846091818959597568']},,bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja
4,"{'retweet_count': 45, 'reply_count': 18, 'like...",,everyone,1182211615,,1846075343188144128,#Idę11 🇵🇱 https://t.co/KiCe5ATOpX,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 06:27:14+00:00,[1846075343188144153],qme,1846075343188144128,"{'hashtags': [{'start': 0, 'end': 6, 'tag': 'I...",False,Original,{'media_keys': ['13_1846075276687478784']},,bartlomiejpejo_2023-10-16_2024-10-15.json,Konfederacja


We need to delete retweets because they are wrongly provided by the X API. We want to analyze only original tweets, replies, and quotes.

In [11]:
df = df[df['category'] != 'Retweet']

In [12]:
# Update the 'username' column to keep only the string until '_2'
df['username'] = df['username'].str.split('_2').str[0]

In [13]:
category_summary = df['category'].value_counts()
print(category_summary)
total_tweets = category_summary.sum()
print(f"Total tweets: {total_tweets}")

category
Original    32794
Reply       10790
Quote        5478
Name: count, dtype: int64
Total tweets: 49062


In [14]:
# Ensure the created_at column is in datetime format
df['created_at'] = pd.to_datetime(df['created_at'])

In [15]:
df.head()

Unnamed: 0,public_metrics,in_reply_to_user_id,reply_settings,author_id,context_annotations,id,text,edit_controls,referenced_tweets,created_at,edit_history_tweet_ids,lang,conversation_id,entities,possibly_sensitive,category,attachments,geo,username,party
0,"{'retweet_count': 3, 'reply_count': 1, 'like_c...",375146901.0,everyone,1182211615,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1846277256509116672,"@donaldtusk Niezrealizowanie większości ze ""10...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","[{'type': 'replied_to', 'id': '184609177626996...",2024-10-15 19:49:34+00:00,[1846277256509116623],pl,1846091776269963776,"{'mentions': [{'start': 0, 'end': 11, 'usernam...",False,Reply,,,bartlomiejpejo,Konfederacja
1,"{'retweet_count': 9, 'reply_count': 2, 'like_c...",,everyone,1182211615,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1846222583898784000,Rok po wyborach trzeba powiedzieć jedno - nie ...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 16:12:19+00:00,[1846222583898784025],pl,1846222583898784000,"{'urls': [{'start': 100, 'end': 123, 'url': 'h...",False,Original,{'media_keys': ['13_1846222491456282626']},,bartlomiejpejo,Konfederacja
2,"{'retweet_count': 4, 'reply_count': 3, 'like_c...",,everyone,1182211615,,1846161400328028160,"❌Mamy rok po wyborach, a Polska pogrąża się w ...","{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 12:09:12+00:00,[1846161400328028272],pl,1846161400328028160,"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",False,Original,{'media_keys': ['3_1846148786910810112']},,bartlomiejpejo,Konfederacja
3,"{'retweet_count': 6, 'reply_count': 2, 'like_c...",,everyone,1182211615,,1846091824101769472,Mija rok od wyborów parlamentarnych. W kampani...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 07:32:44+00:00,[1846091824101769490],pl,1846091824101769472,"{'urls': [{'start': 278, 'end': 301, 'url': 'h...",False,Original,{'media_keys': ['3_1846091818959597568']},,bartlomiejpejo,Konfederacja
4,"{'retweet_count': 45, 'reply_count': 18, 'like...",,everyone,1182211615,,1846075343188144128,#Idę11 🇵🇱 https://t.co/KiCe5ATOpX,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 06:27:14+00:00,[1846075343188144153],qme,1846075343188144128,"{'hashtags': [{'start': 0, 'end': 6, 'tag': 'I...",False,Original,{'media_keys': ['13_1846075276687478784']},,bartlomiejpejo,Konfederacja


In [16]:
print(df.loc[1, 'text'])

Rok po wyborach trzeba powiedzieć jedno - nie na takie państwo Donald Tusk umawiał się z wyborcami! https://t.co/4Jh5Ni6sgr


In [17]:
def add_space_around_emojis(text):
    return ''.join(f' {char} ' if char in emoji.EMOJI_DATA or re.match(r'[\U0001F1E6-\U0001F1FF]', char) else char for char in text)

df['text'] = df['text'].apply(add_space_around_emojis)

def clean_text(text):
    mentions = re.findall(r'@\w+', text)
    text = re.sub(r'@\w+', '', text)
    links = re.findall(r'http\S+', text)
    text = re.sub(r'http\S+', '', text)
    hashtags = re.findall(r'#\w+', text)
    text = re.sub(r'(?<!\s)([\U0001F600-\U0001F64F])', r' \1', text)
    text = re.sub(r'([\U0001F600-\U0001F64F])(?!\s)', r'\1 ', text)
    return [text, mentions, links, hashtags]

df[['text_clean', 'mentions', 'links', 'hashtags']] = pd.DataFrame(df['text'].apply(clean_text).tolist(), index=df.index)

In [18]:
df.head()

Unnamed: 0,public_metrics,in_reply_to_user_id,reply_settings,author_id,context_annotations,id,text,edit_controls,referenced_tweets,created_at,...,possibly_sensitive,category,attachments,geo,username,party,text_clean,mentions,links,hashtags
0,"{'retweet_count': 3, 'reply_count': 1, 'like_c...",375146901.0,everyone,1182211615,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1846277256509116672,"@donaldtusk Niezrealizowanie większości ze ""10...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","[{'type': 'replied_to', 'id': '184609177626996...",2024-10-15 19:49:34+00:00,...,False,Reply,,,bartlomiejpejo,Konfederacja,"Niezrealizowanie większości ze ""100 konkretów...",[@donaldtusk],[],[]
1,"{'retweet_count': 9, 'reply_count': 2, 'like_c...",,everyone,1182211615,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1846222583898784000,Rok po wyborach trzeba powiedzieć jedno - nie ...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 16:12:19+00:00,...,False,Original,{'media_keys': ['13_1846222491456282626']},,bartlomiejpejo,Konfederacja,Rok po wyborach trzeba powiedzieć jedno - nie ...,[],[https://t.co/4Jh5Ni6sgr],[]
2,"{'retweet_count': 4, 'reply_count': 3, 'like_c...",,everyone,1182211615,,1846161400328028160,"❌ Mamy rok po wyborach, a Polska pogrąża się ...","{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 12:09:12+00:00,...,False,Original,{'media_keys': ['3_1846148786910810112']},,bartlomiejpejo,Konfederacja,"❌ Mamy rok po wyborach, a Polska pogrąża się ...",[],"[https://t.co/zFk5QLd1em, https://t.co/bRV4y07...",[]
3,"{'retweet_count': 6, 'reply_count': 2, 'like_c...",,everyone,1182211615,,1846091824101769472,Mija rok od wyborów parlamentarnych. W kampani...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 07:32:44+00:00,...,False,Original,{'media_keys': ['3_1846091818959597568']},,bartlomiejpejo,Konfederacja,Mija rok od wyborów parlamentarnych. W kampani...,[],"[https://t.co/rtVu3Bh43G, https://t.co/8Q3LME6...",[]
4,"{'retweet_count': 45, 'reply_count': 18, 'like...",,everyone,1182211615,,1846075343188144128,#Idę11 🇵 🇱 https://t.co/KiCe5ATOpX,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 06:27:14+00:00,...,False,Original,{'media_keys': ['13_1846075276687478784']},,bartlomiejpejo,Konfederacja,#Idę11 🇵 🇱,[],[https://t.co/KiCe5ATOpX],[#Idę11]


In [19]:
df.drop(columns=['entities'], inplace=True)

In [20]:
df['retweet_count'] = df['public_metrics'].apply(lambda x: x['retweet_count'])
df['reply_count'] = df['public_metrics'].apply(lambda x: x['reply_count'])
df['like_count'] = df['public_metrics'].apply(lambda x: x['like_count'])
df['quote_count'] = df['public_metrics'].apply(lambda x: x['quote_count'])
df['impression_count'] = df['public_metrics'].apply(lambda x: x['impression_count'])

df.drop(columns=['public_metrics'], inplace=True)

In [21]:
df

Unnamed: 0,in_reply_to_user_id,reply_settings,author_id,context_annotations,id,text,edit_controls,referenced_tweets,created_at,edit_history_tweet_ids,...,party,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count
0,375146901,everyone,1182211615,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1846277256509116672,"@donaldtusk Niezrealizowanie większości ze ""10...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","[{'type': 'replied_to', 'id': '184609177626996...",2024-10-15 19:49:34+00:00,[1846277256509116623],...,Konfederacja,"Niezrealizowanie większości ze ""100 konkretów...",[@donaldtusk],[],[],3,1,33,0,1555
1,,everyone,1182211615,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1846222583898784000,Rok po wyborach trzeba powiedzieć jedno - nie ...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 16:12:19+00:00,[1846222583898784025],...,Konfederacja,Rok po wyborach trzeba powiedzieć jedno - nie ...,[],[https://t.co/4Jh5Ni6sgr],[],9,2,72,0,3031
2,,everyone,1182211615,,1846161400328028160,"❌ Mamy rok po wyborach, a Polska pogrąża się ...","{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 12:09:12+00:00,[1846161400328028272],...,Konfederacja,"❌ Mamy rok po wyborach, a Polska pogrąża się ...",[],"[https://t.co/zFk5QLd1em, https://t.co/bRV4y07...",[],4,3,33,2,8636
3,,everyone,1182211615,,1846091824101769472,Mija rok od wyborów parlamentarnych. W kampani...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 07:32:44+00:00,[1846091824101769490],...,Konfederacja,Mija rok od wyborów parlamentarnych. W kampani...,[],"[https://t.co/rtVu3Bh43G, https://t.co/8Q3LME6...",[],6,2,38,0,2441
4,,everyone,1182211615,,1846075343188144128,#Idę11 🇵 🇱 https://t.co/KiCe5ATOpX,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2024-10-15 06:27:14+00:00,[1846075343188144153],...,Konfederacja,#Idę11 🇵 🇱,[],[https://t.co/KiCe5ATOpX],[#Idę11],45,18,616,2,8634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52753,,everyone,961181894,,1707719554355380480,"Studiujesz na kierunku lekarskim, pielęgniarst...","{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2023-09-29 11:30:44+00:00,[1707719554355380484],...,PSL,"Studiujesz na kierunku lekarskim, pielęgniarst...",[@SejmikMaz],[https://t.co/6zats7JXbY],[],9,0,6,0,2154
52774,,everyone,961181894,,1704120323023454464,Za nami posiedzenie @SejmikMaz. I kolejne wspa...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2023-09-19 13:08:40+00:00,[1704120323023454339],...,PSL,Za nami posiedzenie . I kolejne wsparcie dla m...,[@SejmikMaz],[https://t.co/A7EG9Jzuv1],[#OSP],9,0,15,0,649
52777,,everyone,961181894,,1702668459576786944,Płockie Centrum Onkologii gotowe! Już na począ...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2023-09-15 12:59:29+00:00,[1702668459576787064],...,PSL,Płockie Centrum Onkologii gotowe! Już na począ...,[@SejmikMaz],[https://t.co/OALgj7gqxE],[],8,0,16,0,581
52778,,everyone,961181894,,1701960909369868544,To jedna z największych inwestycji drogowych @...,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",,2023-09-13 14:07:56+00:00,[1701960909369868437],...,PSL,To jedna z największych inwestycji drogowych ...,[@SejmikMaz],[https://t.co/9jWRcVZHXk],[#634],8,0,13,0,621


In [22]:
df['id'] = df['id'].astype('int64')

In [23]:
df = df[df['text_clean'].str.strip().astype(bool)]

In [24]:
df_clean_text = df[['id', 'text_clean']]

df_clean_text.to_csv('data/data_for_translation.csv', index=False)

In [26]:
len(df_clean_text)

48331

In [25]:
df_en_text = pd.read_csv('tweets_translation/translated_tweets.csv')
df_en_text.head()


Unnamed: 0,id,text_clean,text_clean_en
0,1846277256509116672,"Niezrealizowanie większości ze ""100 konkretów...","Failure to implement most of the ""100 specifi..."
1,1846222583898784000,Rok po wyborach trzeba powiedzieć jedno - nie ...,"A year after the elections, one thing must be ..."
2,1846161400328028160,"❌ Mamy rok po wyborach, a Polska pogrąża się ...","❌ We are a year after the elections, and Pola..."
3,1846091824101769472,Mija rok od wyborów parlamentarnych. W kampani...,A year has passed since the parliamentary elec...
4,1846075343188144128,#Idę11 🇵 🇱,#I'm going11 🇵 🇱


In [26]:
df_en_text["id"] = df_en_text["id"].apply(lambda x: int(float(x.replace(',', ''))))

In [27]:
if 'text_clean_en' in df_en_text.columns:
    df = df.merge(df_en_text[['id', 'text_clean_en']], on='id', how='left')

    display(df.head())
else:
    print("Column 'text_clean_en' does not exist in df_clean_text")


Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,text,edit_history_tweet_ids,reply_settings,author_id,...,text_clean,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en
0,1846267743022330112,False,509272614.0,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w przód ...",[1846267743022330183],everyone,1187748790863839232,...,"""Ani kroku wstecz!"" - w przód też żadnego jak...",[@Dariusz_Jonski],[],[],0,1,13,0,219,"""Not a step back!"" - you haven't put any forw..."
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",Właśnie zadzwonił do mnie nieznany numer. Już ...,[1846264777347117471],everyone,1187748790863839232,...,Właśnie zadzwonił do mnie nieznany numer. Już ...,[],[],[],28,9,418,0,6449,An unknown number just called me. I thought it...
2,1846262693394588160,False,955239446.0,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@KosiniakKamysz Czy ""jeszcze cięższa praca"" pr...",[1846262693394588154],everyone,1187748790863839232,...,"Czy ""jeszcze cięższa praca"" przełoży się na r...",[@KosiniakKamysz],[],[],4,1,38,0,443,"Will ""even harder work"" translate into the fu..."
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",Polecam się zapoznać z treścią konferencji 👇 ...,[1846261341327446163],everyone,1187748790863839232,...,Polecam się zapoznać z treścią konferencji 👇,[],[https://t.co/U7LAeL2cqP],[],9,2,48,0,687,I recommend reading the content of the confere...
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",❌ Kilka dni temu zagłosowałem za odwołaniem K...,[1846104865829015639],everyone,1187748790863839232,...,❌ Kilka dni temu zagłosowałem za odwołaniem K...,[],"[https://t.co/JQAQMEKjEs, https://t.co/4qtpUJO...",[],30,0,141,0,3354,"❌ A few days ago, I voted to dismiss Krzyszto..."


In [28]:
# Replace '#VALUE!' with NaN in 'text_clean_en' column
df['text_clean_en'] = df['text_clean_en'].replace('#VALUE!', pd.NA)

In [29]:
df['text_clean_en_demojized'] = df['text_clean_en'].apply(lambda x: emoji.demojize(str(x)) if pd.notnull(x) else x)

df[['text_clean_en', 'text_clean_en_demojized']].head()

Unnamed: 0,text_clean_en,text_clean_en_demojized
0,"""Not a step back!"" - you haven't put any forw...","""Not a step back!"" - you haven't put any forw..."
1,An unknown number just called me. I thought it...,An unknown number just called me. I thought it...
2,"Will ""even harder work"" translate into the fu...","Will ""even harder work"" translate into the fu..."
3,I recommend reading the content of the confere...,I recommend reading the content of the confere...
4,"❌ A few days ago, I voted to dismiss Krzyszto...",":cross_mark: A few days ago, I voted to dismi..."


In [30]:
df

Unnamed: 0,id,possibly_sensitive,in_reply_to_user_id,created_at,referenced_tweets,edit_controls,text,edit_history_tweet_ids,reply_settings,author_id,...,mentions,links,hashtags,retweet_count,reply_count,like_count,quote_count,impression_count,text_clean_en,text_clean_en_demojized
0,1846267743022330112,False,509272614,2024-10-15 19:11:46+00:00,"[{'type': 'replied_to', 'id': '184615588116918...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@Dariusz_Jonski ""Ani kroku wstecz!"" - w przód ...",[1846267743022330183],everyone,1187748790863839232,...,[@Dariusz_Jonski],[],[],0,1,13,0,219,"""Not a step back!"" - you haven't put any forw...","""Not a step back!"" - you haven't put any forw..."
1,1846264777347117568,False,,2024-10-15 18:59:59+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",Właśnie zadzwonił do mnie nieznany numer. Już ...,[1846264777347117471],everyone,1187748790863839232,...,[],[],[],28,9,418,0,6449,An unknown number just called me. I thought it...,An unknown number just called me. I thought it...
2,1846262693394588160,False,955239446,2024-10-15 18:51:42+00:00,"[{'type': 'replied_to', 'id': '184612763211236...","{'edits_remaining': 5, 'is_edit_eligible': Fal...","@KosiniakKamysz Czy ""jeszcze cięższa praca"" pr...",[1846262693394588154],everyone,1187748790863839232,...,[@KosiniakKamysz],[],[],4,1,38,0,443,"Will ""even harder work"" translate into the fu...","Will ""even harder work"" translate into the fu..."
3,1846261341327446272,False,,2024-10-15 18:46:20+00:00,"[{'type': 'quoted', 'id': '1846159669573689602'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",Polecam się zapoznać z treścią konferencji 👇 ...,[1846261341327446163],everyone,1187748790863839232,...,[],[https://t.co/U7LAeL2cqP],[],9,2,48,0,687,I recommend reading the content of the confere...,I recommend reading the content of the confere...
4,1846104865829015552,False,,2024-10-15 08:24:33+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",❌ Kilka dni temu zagłosowałem za odwołaniem K...,[1846104865829015639],everyone,1187748790863839232,...,[],"[https://t.co/JQAQMEKjEs, https://t.co/4qtpUJO...",[],30,0,141,0,3354,"❌ A few days ago, I voted to dismiss Krzyszto...",":cross_mark: A few days ago, I voted to dismi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11452,1721611648257921024,0,,2023-11-06 19:32:57+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Fal...",Prezydent RP jest gwarantem ciągłości władzy p...,[1721611648257921084],everyone,964017524,...,[],[],[],17,6,53,1,4871,The President of the Republic of Poland is the...,The President of the Republic of Poland is the...
11453,1720163035048706304,0,,2023-11-02 19:36:41+00:00,"[{'type': 'quoted', 'id': '1720122294914187659'}]","{'edits_remaining': 5, 'is_edit_eligible': Tru...",W państwach o ustabilizowanej demokracji siła ...,[1720163035048706306],everyone,964017524,...,[],[https://t.co/u1vzB6ImaB],[],18,2,41,2,3152,"In countries with stable democracy, the streng...","In countries with stable democracy, the streng..."
11454,1716897815400792320,0,,2023-10-24 19:21:52+00:00,,"{'edits_remaining': 5, 'is_edit_eligible': Tru...",Piękny epilog kampanii wyborczej:\n ✅ ️ @Rober...,[1716897815400792400],everyone,964017524,...,"[@RobertTelus, @KosiniakKamysz]",[https://t.co/MGuhesegb2],[#TrzeciaDroga],26,7,97,1,5674,A beautiful epilogue of the election campaign:...,A beautiful epilogue of the election campaign:...
11455,1714387831425052928,0,3370515933,2023-10-17 21:08:05+00:00,"[{'type': 'replied_to', 'id': '171438631727710...","{'edits_remaining': 5, 'is_edit_eligible': Fal...",@motykamilosz Jak widać nie tylko w piosenkach...,[1714387831425052856],everyone,964017524,...,[@motykamilosz],[],[],1,2,50,0,1301,"As you can see, not only in Eleni's songs ""Lo...","As you can see, not only in Eleni's songs ""Lo..."


In [31]:
df['possibly_sensitive'] = df['possibly_sensitive'].astype(bool)

In [32]:
username_to_realname = {
    'bartlomiejpejo': 'Bartłomiej Pejo',
    'RobertBiedron': 'Robert Biedroń',
    'PatrykJaki': 'Patryk Jaki',
    'Kpelczynska': 'Katarzyna Pelczyńska',
    'OklaDrewnowicz': 'Agnieszka Okła-Drewnowicz',
    'KosiniakKamysz': 'Władysław Kosiniak-Kamysz',
    'mwojcik_': 'Michał Wójcik',
    'MorawieckiM': 'Mateusz Morawiecki',
    'SlawomirMentzen': 'Sławomir Mentzen',
    'Wlodek_Skalik': 'Włodzimierz Skalik',
    'BeataSzydlo': 'Beata Szydło',
    'WTumanowicz': 'Witold Tumanowicz',
    'KGawkowski': 'Krzysztof Gawkowski',
    'wlodekczarzasty': 'Włodzimierz Czarzasty',
    'Kaminski_M_': 'Mariusz Kamiński',
    'Macierewicz_A': 'Antoni Macierewicz',
    'elzbietawitek': 'Elżbieta Witek',
    'aga_buczynska': 'Agnieszka Buczyńska',
    'szymon_holownia': 'Szymon Hołownia',
    'DorotaNiedziela': 'Dorota Niedziela',
    'EwaKopacz': 'Ewa Kopacz',
    'Leszczyna': 'Izabela Leszczyna',
    'M_K_Blonska': 'Małgorzata Kidawa-Błońska',
    'bbudka': 'Borys Budka',
    'donaldtusk': 'Donald Tusk',
    'DariuszKlimczak': 'Dariusz Klimczak',
    'GrzybAndrzej': 'Andrzej Grzyb',
    'Hetman_K': 'Krzysztof Hetman',
    'JarubasAdam': 'Adam Jarubas',
    'Paslawska': 'Urszula Pasławska',
    'TudujKrzysztof': 'Krzysztof Tuduj',
    'ZalewskiPawel': 'Paweł Zalewski'
}

# Add the 'name' column to the dataframe
df['name'] = df['username'].map(username_to_realname)

In [33]:
# Delete next line sign from the 'text_clean_en' column
df['text_clean_en'] = df['text_clean_en'].str.replace('\n', ' ')

In [34]:
# Save the DataFrame to a Parquet file
df.to_parquet('cleaned_data/df_combined.parquet', index=False)