In [1]:
import pandas as pd
import re
import unicodedata

In [2]:
df = pd.read_csv('../Data/filtered_tweets_engie.csv', sep=None, engine='python', encoding='utf-8-sig')
print(df.head())

            id      screen_name                name  \
0  1,72517E+18       gptournier  Guillaume Tournier   
1  1,72868E+18       jouanetwan      Jouan Etwan 💚💛   
2  1,73784E+18  vince_thouvenin           thouvenin   
3  1,74049E+18   BiduleAnatheme     Anathème Bidule   
4  1,74068E+18  vince_thouvenin           thouvenin   

                   created_at  \
0  2023-11-16 16:13:18 +01:00   
1  2023-11-26 08:34:34 +01:00   
2  2023-12-21 15:27:08 +01:00   
3  2023-12-28 22:32:58 +01:00   
4  2023-12-29 11:08:10 +01:00   

                                           full_text  
0  @ENGIEpartFR \n6 mois d’attente et tjs aucune ...  
1  Bonjour @ENGIEpartSAV , l’appli #monpilotageel...  
2  @ENGIEpartFR mon syndic de copropriété sergic ...  
3  @ENGIEpartSAV vous envisagez de vous occuper d...  
4  @ENGIEpartSAV retour de votre technicien "vous...  


In [3]:
def remove_emojis(text):
    emoji_pattern = re.compile("[\U00010000-\U0010ffff]", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [4]:
def clean_text(text):
    if pd.isna(text):
        return ""
    
    text = text.strip()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\n", " ", text)
    text = remove_emojis(text)
    text = re.sub(r"[^a-zA-ZÀ-ÿ0-9.,'!? ]", "", text)
    text = re.sub(r"\s+", " ", text)
    text = unicodedata.normalize("NFKD", text)
    
    return text.strip()

In [5]:
df["full_text"] = df["full_text"].astype(str).apply(clean_text)

df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True)

df['created_at'] = df['created_at'].dt.tz_convert(None)

df['hour'] = df['created_at'].dt.hour

df['text_length'] = df['full_text'].apply(len)

df['contains_engie'] = df['full_text'].apply(lambda x: 'ENGIE' in x)

print(df.head())

            id      screen_name                name          created_at  \
0  1,72517E+18       gptournier  Guillaume Tournier 2023-11-16 15:13:18   
1  1,72868E+18       jouanetwan      Jouan Etwan 💚💛 2023-11-26 07:34:34   
2  1,73784E+18  vince_thouvenin           thouvenin 2023-12-21 14:27:08   
3  1,74049E+18   BiduleAnatheme     Anathème Bidule 2023-12-28 21:32:58   
4  1,74068E+18  vince_thouvenin           thouvenin 2023-12-29 10:08:10   

                                           full_text  hour  text_length  \
0  ENGIEpartFR n6 mois dattente et tjs aucune ré...    15          282   
1  Bonjour ENGIEpartSAV , lappli monpilotageelec ...     7          155   
2  ENGIEpartFR mon syndic de copropriété sergic...    14          219   
3  ENGIEpartSAV vous envisagez de vous occuper de...    21          267   
4  ENGIEpartSAV retour de votre technicien vous n...    10          240   

   contains_engie  
0            True  
1            True  
2            True  
3            True 

In [6]:
df.to_csv('../Data/data_cleaned.csv', index=False)