#PREPROCESSING - PART 2

In this second part of the preprocessing, now that the data has been unified into a common structure, the text is cleaned and emojis and hashtags are processed. For each language, this notebook outputs two CSV files: one in which emojis and hashtags are kept, and another in which they have been removed.

*Note: When saving or loading data from Drive, the paths are specific to my personal Drive*

In [1]:
import pandas as pd
!pip install emoji
import emoji
import re
import numpy
import warnings
#In order to ignore warnings when overwriting row values in panda dataframes
warnings.filterwarnings('ignore')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.0.0.tar.gz (197 kB)
[K     |████████████████████████████████| 197 kB 5.3 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.0.0-py3-none-any.whl size=193022 sha256=95f231fc45a47fffbf5307e9b15beeac77300b69f5f9d0fb293c36cc98d34ecd
  Stored in directory: /root/.cache/pip/wheels/ec/29/4d/3cfe7452ac7d8d83b1930f8a6205c3c9649b24e80f9029fc38
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.0.0


In [2]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


####Imports & functions

In [3]:
#In some datasets emojis appear as character codes instead of the emoji itself.
#This table (hex code / emoji_code / emoji) is used to turn those codes back into emojis.
df_emoji_code_to_emoji = pd.read_csv(
    '/content/emoji_codes_csv.csv', encoding='utf8', engine='python'
)[["hex code", "emoji_code", "emoji"]]

In [4]:
#defining functions
def unique(list1):
    """Return the sorted distinct values of ``list1`` as a NumPy array."""
    return numpy.unique(numpy.array(list1))

#If a hashtag is followed by another hashtag or a special character (?!), this divides it
def divideHashtagtext(txt):
  """Split ``txt`` so that every hashtag or special character starts a new piece.

  A "$$" separator is inserted in front of each boundary character, commas are
  then turned into spaces, and the text is split on the separator. The first
  piece is whatever preceded the first boundary character (possibly "").
  """
  separator = "$$"
  boundary_chars = "#”.,@?¿¡!|:"
  marked = txt
  for boundary in boundary_chars:
    marked = marked.replace(boundary, separator + boundary)
  return marked.replace(',', ' ').split(separator)

#Deletes accents in a string
def deleteAccents(txt):
    """Return ``txt`` with acute-accented vowels replaced by the plain vowel.

    Only á, é, í, ó, ú and their upper-case forms are handled; every other
    character (ñ, ü, à, ...) is left untouched.
    """
    accent_table = str.maketrans("áéíóúÁÉÍÓÚ", "aeiouAEIOU")
    return txt.translate(accent_table)

#Returns all emojis in a string
def check_emojis(tweet : str):
  """Return the emojis found in ``tweet`` as one string.

  Characters are tested one at a time with ``emoji.is_emoji``; each match is
  appended preceded by a single space, so the result looks like " 😀 😂"
  (or "" when nothing matches).
  """
  return "".join(" " + symbol for symbol in tweet if emoji.is_emoji(symbol))

#Returns all codes with the structure &#____; (indicating emojis/special symbols)
def getAllEmojiHexCode(df):
  """Collect the distinct ``&#...;`` codes that occur in ``df['text']``.

  A code is "&#", followed by the shortest run of characters up to the next
  ";". Cells that are not strings (e.g. missing values) are skipped.

  Returns a sorted NumPy array of the unique codes; it is empty when no code
  is found.
  """
  # Raw string: the previous "\;" was an invalid escape sequence in a normal literal.
  code_pattern = re.compile(r"&#.*?;")
  emoji_codes = []
  for text in df["text"]:
    if isinstance(text, str):
      # No capture group, so findall yields the full "&#...;" matches directly.
      emoji_codes.extend(code_pattern.findall(text))
  return numpy.unique(numpy.array(emoji_codes))

#Replaces hex codes with the corresponding emoji/symbol in a dataframe
def replaceEmojiCodes(df):
  """Replace every known emoji code in ``df['text']`` with its emoji/symbol.

  The (code, emoji) pairs come from the module-level ``df_emoji_code_to_emoji``
  table ('hex code' and 'emoji' columns) and are applied in table order.
  ``df`` is modified in place and also returned. Cells that are not strings
  are left unchanged.
  """
  # Build the replacement pairs once instead of looking them up for every tweet.
  replacements = [
      (code, str(symbol))
      for code, symbol in zip(df_emoji_code_to_emoji['hex code'], df_emoji_code_to_emoji['emoji'])
  ]

  def _decode(text):
    if not isinstance(text, str):
      return text
    for code, symbol in replacements:
      text = text.replace(code, symbol)
    return text

  # One column assignment instead of chained df['text'][idx] = ..., which is not
  # guaranteed to write back to df and misbehaves with duplicate index labels.
  df['text'] = df['text'].map(_decode)
  return df

#Returns the entries of a dictionary whose value is greater than "num"
def important_values(dict_, num):
  """Return a new dict holding the items of ``dict_`` whose value is > ``num``."""
  return {key: value for key, value in dict_.items() if value > num}

#Replace all emojis in keys_rep with their text form, and delete all emojis in keys_del, in the same dataframe
def replace_emojis(keys_rep, keys_del, df):
  """Delete or textualise emojis in ``df['text']``.

  For every tweet, each string in ``keys_del`` is removed first; then each
  string in ``keys_rep`` is replaced by ``emoji.demojize(key)`` surrounded by
  single spaces. ``df`` is modified in place and also returned. Cells that are
  not strings are left unchanged.
  """
  deletions = list(keys_del)
  # demojize depends only on the key, so compute it once per key, not per tweet.
  substitutions = [(key, " " + emoji.demojize(key) + " ") for key in keys_rep]

  def _rewrite(text):
    if not isinstance(text, str):
      return text
    for key in deletions:
      text = text.replace(key, "")
    for key, textual_form in substitutions:
      text = text.replace(key, textual_form)
    return text

  # One column assignment instead of chained df['text'][idx] = ..., which is not
  # guaranteed to write back to df and misbehaves with duplicate index labels.
  df['text'] = df['text'].map(_rewrite)
  return df

In [5]:
#This function returns a cleaned dataframe, plus dictionaries with all the hashtags and emojis in them
def cleanTweets_getDicts(df):
  """Clean the tweets in ``df['text']`` and count their hashtags and emojis.

  For every tweet: surrounding double quotes and a leading 'RT ' marker are
  stripped, and every mention (word starting with '@') and URL (word starting
  with 'http://' or 'https://') is deleted from the text. Hashtags stay in the
  text; they are only counted. Cells that are not strings are left unchanged.

  Returns ``(df, hashtag_dict, emoji_dict)``:
    * ``df`` is the same dataframe, modified in place.
    * ``hashtag_dict`` maps accent-stripped hashtags to their number of occurrences.
    * ``emoji_dict`` maps each character of ``check_emojis(text)`` to its count;
      since check_emojis separates emojis with spaces, a " " key is counted too.
  """
  hashtag_dict = {}
  emoji_dict = {}
  cleaned_texts = []
  for text in df['text']:
    if not isinstance(text, str):
      cleaned_texts.append(text)
      continue

    #Remove surrounding ""
    text = text.strip('"')

    # Words are classified before anything is removed from the text.
    words = text.split()
    mentions = [word for word in words if word[0] == '@']
    urls = ([word for word in words if word[:8] == 'https://']
            + [word for word in words if word[:7] == 'http://'])
    hashtags = [word for word in words if word[0] == '#']

    #Delete the indication that the tweet is a Retweet
    if text.startswith('RT '):
      text = text[3:]

    for word in hashtags:
      pieces = divideHashtagtext(word)
      if len(pieces) <= 2:
        # A single hashtag with nothing attached: count the word as it is.
        found = [word]
      else:
        # Hashtags glued together or followed by punctuation: count only the
        # pieces that are hashtags themselves.
        found = [piece for piece in pieces if piece.startswith('#')]
      for hashtag in found:
        key = deleteAccents(hashtag)
        hashtag_dict[key] = hashtag_dict.get(key, 0) + 1

    # NOTE: str.replace deletes every occurrence of the mention/URL substring.
    for mention in mentions:
      text = text.replace(mention, "") #MENTION
    for url in urls:
      text = text.replace(url, "") #URL

    for symbol in check_emojis(text):
      emoji_dict[symbol] = emoji_dict.get(symbol, 0) + 1

    cleaned_texts.append(text)

  # One positional column assignment instead of chained df['text'][idx] = ...,
  # which is not guaranteed to write back to df.
  df['text'] = cleaned_texts
  return df, hashtag_dict, emoji_dict

#SPANISH DATA

In [6]:
df = pd.read_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_spanish_data.csv', encoding='utf8', engine='python')

####Emoji code extraction and replacement

In [None]:
emoji_codes = getAllEmojiHexCode(df)

In [None]:
emoji_codes

array([], dtype=float64)

In [None]:
df = replaceEmojiCodes(df)

####Hashtag and emoji filtering

In [7]:
df, hashtag_dict, emoji_dict = cleanTweets_getDicts(df)

In [8]:
#The " " key is the separator check_emojis puts between emojis, not an emoji.
#Filtering (instead of list.remove) also works when the key is absent, e.g. on a re-run.
emo_keys = [key for key in emoji_dict if key != " "]

In [9]:
#A bare "#" is not a real hashtag. Filtering (instead of list.remove) also works
#when the key is absent, e.g. on a re-run.
hash_keys = [key for key in hashtag_dict if key != "#"]

In [26]:
#Drop the placeholder keys from the counts; guarded so the cell can be re-run safely
if "#" in hashtag_dict:
  del hashtag_dict["#"]
if " " in emoji_dict:
  del emoji_dict[" "]

In [32]:
print("EMOJIS:")
print("unique emojis:", len(emo_keys))
print("emoji count:", sum(emoji_dict.values()))
print("HASHTAGS: ")
print("unique hashtags:", len(hash_keys))
print("hashtag count:", sum(hashtag_dict.values()))

EMOJIS:
unique emojis: 234
emoji count: 1691
HASHTAGS: 
unique hashtags: 1818
hashtag count: 2656


#####Removal of all emojis and hashtags

In [None]:
#Remove emojis
df = replace_emojis([], emo_keys, df)

In [None]:
#Make sure no hashtag is left in data by mistake
#This is done now and not in the cells above because a space is inserted before every #,
#which could cause problems with emojis/special characters
df['text'] = df['text'].map(lambda text: text.replace("#", " #") if isinstance(text, str) else text)
leftover_hashtags = [
    word
    for text in df['text'] if isinstance(text, str)
    for word in text.split() if word[0] == '#'
]
#dict.fromkeys removes duplicates while keeping first-seen order
hash_keys = list(dict.fromkeys(hash_keys + leftover_hashtags))
for c in [",","!",":",".","¿","|","?","¡","'"]:
  hash_keys = [s.strip(c) for s in hash_keys]
hash_keys = sorted(hash_keys, key=len, reverse=True)

In [None]:
#Remove hashtags
#Work on a plain list and assign the column once; a row-wise DataFrame.apply per hashtag is very slow
texts = df['text'].tolist()
for k in hash_keys:
  texts = [text.replace(k, "") for text in texts]
df['text'] = texts

In [None]:
df.to_csv('/content/drive/MyDrive/TFG/data/final_data/removed_spanish_data.csv', index=False)  

#####All emojis and hashtags kept

In [None]:
#Replace emojis with text
# NOTE(review): when the notebook is run top to bottom, df has already had its emojis
# deleted and its hashtags removed by the "Removal of all emojis and hashtags" cells above,
# so there is nothing left to replace here. Presumably df must be reloaded and passed
# through cleanTweets_getDicts again before running this cell - confirm.
df = replace_emojis(emo_keys, [], df)

In [None]:
df.to_csv('/content/drive/MyDrive/TFG/data/final_data/mantained_spanish_data.csv', index=False)  

#ITALIAN DATA

In [33]:
df = pd.read_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_italian_data.csv', encoding='utf8', engine='python')

####Emoji code extraction and replacement

In [34]:
emoji_codes = getAllEmojiHexCode(df)

In [35]:
emoji_codes

array(['&#x1f339;', '&#x1f346;', '&#x1f399;', '&#x1f3c1;', '&#x1f42e;',
       '&#x1f433;', '&#x1f437;', '&#x1f441;', '&#x1f444;', '&#x1f445;',
       '&#x1f447;', '&#x1f44d;', '&#x1f47f;', '&#x1f48b;', '&#x1f493;',
       '&#x1f496;', '&#x1f4a6;', '&#x1f4a9;', '&#x1f4aa;', '&#x1f4af;',
       '&#x1f50b;', '&#x1f51d;', '&#x1f600;', '&#x1f601;', '&#x1f602;',
       '&#x1f605;', '&#x1f606;', '&#x1f608;', '&#x1f609;', '&#x1f60b;',
       '&#x1f60d;', '&#x1f611;', '&#x1f614;', '&#x1f618;', '&#x1f61a;',
       '&#x1f61c;', '&#x1f621;', '&#x1f62d;', '&#x1f630;', '&#x1f643;',
       '&#x1f645;', '&#x1f64f;', '&#x1f914;', '&#x1f921;', '&#x1f923;',
       '&#x1f924;', '&#x1f926;', '&#x1f952;', '&#x25b6;', '&#x2615;',
       '&#x26a1;', '&#x270a;', '&#x270c;', '&#x2753;', '&#x2764;',
       '&#x27a1;'], dtype='<U9')

In [36]:
df = replaceEmojiCodes(df)

In [None]:
df.to_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_replacedemojis_italian_data.csv', index=False) 


In [None]:
df = pd.read_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_replacedemojis_italian_data.csv', encoding='utf8', engine='python')

####Hashtag and emoji filtering

In [37]:
df, hashtag_dict, emoji_dict = cleanTweets_getDicts(df)

In [38]:
#The " " key is the separator check_emojis puts between emojis, not an emoji.
#Filtering (instead of list.remove) also works when the key is absent, e.g. on a re-run.
emo_keys = [key for key in emoji_dict if key != " "]

In [39]:
#A bare "#" is not a real hashtag. Filtering (instead of list.remove) also works
#when the key is absent, e.g. on a re-run. Longest hashtags come first.
hash_keys = sorted((key for key in hashtag_dict if key != "#"), key=len, reverse=True)

In [40]:
#Drop the placeholder keys from the counts; guarded so the cell can be re-run safely
if "#" in hashtag_dict:
  del hashtag_dict["#"]
if " " in emoji_dict:
  del emoji_dict[" "]

In [41]:
print("EMOJIS:")
print("unique emojis:", len(emo_keys))
print("emoji count:", sum(emoji_dict.values()))
print("HASHTAGS: ")
print("unique hashtags:", len(hash_keys))
print("hashtag count:", sum(hashtag_dict.values()))

EMOJIS:
unique emojis: 181
emoji count: 1541
HASHTAGS: 
unique hashtags: 3149
hashtag count: 8819


#####Removal of all emojis and hashtags

In [None]:
#Remove emojis
df = replace_emojis([], emo_keys, df)

In [None]:
df

Unnamed: 0,text,hate speech
0,Fatti trovare te lo do volentieri e ti sbor...,1.0
1,Tu dovresti ricominciare dai semafori a fare ...,1.0
2,"Amore,sei presentabile? Xchè così via Skype ti...",1.0
3,"Salvo poi mandare la culona a Mosca, aummaumm...",1.0
4,Ti sborro io,1.0
...,...,...
17446,Gli stati nazionali devono essere pronti a rin...,0.0
17447,Il ministro dell'interno della Germania #Horst...,0.0
17448,#Salvini: In Italia troppi si sono montati la ...,0.0
17449,Chi giubila in buona fede non ha capito nien...,0.0


In [None]:
#Make sure no hashtag is left in data by mistake
#This is done now and not in the cells above because a space is inserted before every #,
#which could cause problems with emojis/special characters
df['text'] = df['text'].map(lambda text: text.replace("#", " #") if isinstance(text, str) else text)
leftover_hashtags = [
    word
    for text in df['text'] if isinstance(text, str)
    for word in text.split() if word[0] == '#'
]
#dict.fromkeys removes duplicates while keeping first-seen order
hash_keys = list(dict.fromkeys(hash_keys + leftover_hashtags))
for c in [",","!",":",".","¿","|","?","¡","'"]:
  hash_keys = [s.strip(c) for s in hash_keys]
hash_keys = sorted(hash_keys, key=len, reverse=True)

In [None]:
#Remove hashtags
#Work on a plain list and assign the column once; a row-wise DataFrame.apply per hashtag is very slow
texts = df['text'].tolist()
for k in hash_keys:
  texts = [text.replace(k, "") for text in texts]
df['text'] = texts

In [None]:
df.to_csv('/content/drive/MyDrive/TFG/data/final_data/removed_italian_data.csv', index=False)  

#####All emojis and hashtags kept

In [None]:
#Replace emojis with text
# NOTE(review): when the notebook is run top to bottom, df has already had its emojis
# deleted and its hashtags removed by the "Removal of all emojis and hashtags" cells above,
# so there is nothing left to replace here. Presumably df must be reloaded and passed
# through cleanTweets_getDicts again before running this cell - confirm.
df = replace_emojis(emo_keys, [], df)

In [None]:
df.to_csv('/content/drive/MyDrive/TFG/data/final_data/mantained_italian_data.csv', index=False)  

#PORTUGUESE DATA

In [42]:
df = pd.read_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_portuguese_data.csv', encoding='utf8', engine='python')

####Emoji code extraction and replacement

In [None]:
emoji_codes = getAllEmojiHexCode(df)

In [None]:
emoji_codes

array([], dtype=float64)

In [None]:
df = replaceEmojiCodes(df)

####Hashtag and emoji filtering

In [43]:
df, hashtag_dict, emoji_dict = cleanTweets_getDicts(df)

In [44]:
#The " " key is the separator check_emojis puts between emojis, not an emoji.
#Filtering (instead of list.remove) also works when the key is absent, e.g. on a re-run.
emo_keys = [key for key in emoji_dict if key != " "]

In [45]:
#A bare "#" is not a real hashtag. Filtering (instead of list.remove) also works
#when the key is absent, e.g. on a re-run. Longest hashtags come first.
hash_keys = sorted((key for key in hashtag_dict if key != "#"), key=len, reverse=True)

In [46]:
#Drop the placeholder keys from the counts; guarded so the cell can be re-run safely
if "#" in hashtag_dict:
  del hashtag_dict["#"]
if " " in emoji_dict:
  del emoji_dict[" "]

In [47]:
print("EMOJIS:")
print("unique emojis:", len(emo_keys))
print("emoji count:", sum(emoji_dict.values()))
print("HASHTAGS: ")
print("unique hashtags:", len(hash_keys))
print("hashtag count:", sum(hashtag_dict.values()))

EMOJIS:
unique emojis: 410
emoji count: 6194
HASHTAGS: 
unique hashtags: 1406
hashtag count: 2791


#####Removal of all emojis and hashtags

In [None]:
#Remove emojis
df = replace_emojis([], emo_keys, df)

In [None]:
#Make sure no hashtag is left in data by mistake
#This is done now and not in the cells above because a space is inserted before every #,
#which could cause problems with emojis/special characters
df['text'] = df['text'].map(lambda text: text.replace("#", " #") if isinstance(text, str) else text)
leftover_hashtags = [
    word
    for text in df['text'] if isinstance(text, str)
    for word in text.split() if word[0] == '#'
]
#dict.fromkeys removes duplicates while keeping first-seen order
hash_keys = list(dict.fromkeys(hash_keys + leftover_hashtags))
for c in [",","!",":",".","¿","|","?","¡","'"]:
  hash_keys = [s.strip(c) for s in hash_keys]
hash_keys = sorted(hash_keys, key=len, reverse=True)

In [None]:
#Remove hashtags
#Work on a plain list and assign the column once; a row-wise DataFrame.apply per hashtag is very slow
texts = df['text'].tolist()
for k in hash_keys:
  texts = [text.replace(k, "") for text in texts]
df['text'] = texts

In [None]:
df.to_csv('/content/drive/MyDrive/TFG/data/final_data/removed_portuguese_data.csv', index=False)  

#####All emojis and hashtags kept

In [None]:
#Replace emojis with text
# NOTE(review): when the notebook is run top to bottom, df has already had its emojis
# deleted and its hashtags removed by the "Removal of all emojis and hashtags" cells above,
# so there is nothing left to replace here. Presumably df must be reloaded and passed
# through cleanTweets_getDicts again before running this cell - confirm.
df = replace_emojis(emo_keys, [], df)

In [None]:
df.to_csv('/content/drive/MyDrive/TFG/data/final_data/mantained_portuguese_data.csv', index=False)  

#ENGLISH DATA

In [48]:
df = pd.read_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_english_data.csv', encoding='utf8', engine='python')

####Data splitting
The data is split into 5 equal parts, each part is preprocessed separately, and the results are then re-unified. This is done because some cells would take too long to run on the whole dataset at once.

In [None]:
#Split df into 5 random, non-overlapping parts of (almost) equal size.
#Each frac is relative to the rows still left in df: 1/5, 1/4, 1/3 and 1/2 of the
#remainder are each 20% of the original data (a frac of 0.3 in the third draw would
#give uneven parts: 20/20/18/21/21 %).
split_seed = 0  #fixed seed so that re-running the cell reproduces the same split
df1 = df.sample(frac = 1/5, random_state = split_seed)
df = df.drop(df1.index)
df1.to_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_english_data_1.csv', index=False)
#
df2 = df.sample(frac = 1/4, random_state = split_seed)
df = df.drop(df2.index)
df2.to_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_english_data_2.csv', index=False)
#
df3 = df.sample(frac = 1/3, random_state = split_seed)
df = df.drop(df3.index)
df3.to_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_english_data_3.csv', index=False)
#
df4 = df.sample(frac = 1/2, random_state = split_seed)
df5 = df.drop(df4.index)
df4.to_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_english_data_4.csv', index=False)
df5.to_csv('/content/drive/MyDrive/TFG/data/final_data/unfinished_english_data_5.csv', index=False)

In [None]:
#@title Choose subsection to run the preprocessing code on

Subdataframe = '5'  #@param ["1", "2", "3", "4", "5"]

# Every per-part file lives in the same Drive folder and is named
# <prefix>_english_data_<part>.csv; each map goes from part id to full path.
_final_data_dir = '/content/drive/MyDrive/TFG/data/final_data/'
_part_ids = ['1', '2', '3', '4', '5']

map_subsection_load_path = {
    part: _final_data_dir + 'unfinished_english_data_' + part + '.csv'
    for part in _part_ids
}

map_replaced_emoji_save_path = {
    part: _final_data_dir + 'unfinished_replacedemojis_english_data_' + part + '.csv'
    for part in _part_ids
}

map_removed_save_path = {
    part: _final_data_dir + 'removed_english_data_' + part + '.csv'
    for part in _part_ids
}

map_filtered_save_path = {
    part: _final_data_dir + 'filtered_english_data_' + part + '.csv'
    for part in _part_ids
}

map_mantained_save_path = {
    part: _final_data_dir + 'mantained_english_data_' + part + '.csv'
    for part in _part_ids
}

# Paths for the part selected in the form above
load_path = map_subsection_load_path[Subdataframe]
emoji_save_path = map_replaced_emoji_save_path[Subdataframe]
removed_path = map_removed_save_path[Subdataframe]
filtered_path = map_filtered_save_path[Subdataframe]
mantained_path = map_mantained_save_path[Subdataframe]

print(f'Data to load           : {load_path}')

Data to load           : /content/drive/MyDrive/TFG/data/final_data/unfinished_english_data_5.csv


In [None]:
df = pd.read_csv(load_path, encoding='utf8', engine='python')

####Emoji code extraction and replacement

In [49]:
emoji_codes = getAllEmojiHexCode(df)

In [50]:
emoji_codes

array(['&#10004;', '&#10006;', '&#10024;', '&#10052;', '&#10060;',
       '&#10067;', '&#10068;', '&#10069;', '&#10071;', '&#10084;',
       '&#1040784;', '&#1040788;', '&#1041184;', '&#1041185;',
       '&#1041190;', '&#1041191;', '&#1041196;', '&#1041204;',
       '&#1041205;', '&#1041210;', '&#1041216;', '&#1041219;',
       '&#1041222;', '&#1041223;', '&#1041233;', '&#1041237;',
       '&#1041240;', '&#1041243;', '&#1041629;', '&#1041636;',
       '&#1041653;', '&#1041681;', '&#1042390;', '&#1042469;',
       '&#1043214;', '&#1043215;', '&#1043291;', '&#1043292;',
       '&#1043293;', '&#1043323;', '&#1043350;', '&#1043351;',
       '&#1043353;', '&#1043357;', '&#1043358;', '&#1043359;',
       '&#1043360;', '&#1043473;', '&#11015;', '&#11088;', '&#11093;',
       '&#12288;', '&#12290;', '&#127344;', '&#127378;', '&#127379;',
       '&#127382;', '&#127383;', '&#127386;', '&#127463;', '&#127468;',
       '&#127470;', '&#127480;', '&#127481;', '&#127482;', '&#127744;',
       '&#1277

In [None]:
df = replaceEmojiCodes(df)

In [None]:
df.to_csv(emoji_save_path, index=False) 

In [None]:
df = pd.read_csv(emoji_save_path, encoding='utf8', engine='python')

####Hashtag and emoji filtering

In [53]:
df, hashtag_dict, emoji_dict = cleanTweets_getDicts(df)

In [54]:
#The " " key is the separator check_emojis puts between emojis, not an emoji.
#Filtering (instead of list.remove) also works when the key is absent, e.g. on a re-run.
emo_keys = [key for key in emoji_dict if key != " "]

In [55]:
#A bare "#" is not a real hashtag. Filtering (instead of list.remove) also works
#when the key is absent, e.g. on a re-run. Longest hashtags come first.
hash_keys = sorted((key for key in hashtag_dict if key != "#"), key=len, reverse=True)

In [56]:
#Drop the placeholder keys from the counts; guarded so the cell can be re-run safely
if "#" in hashtag_dict:
  del hashtag_dict["#"]
if " " in emoji_dict:
  del emoji_dict[" "]

In [57]:
print("EMOJIS:")
print("unique emojis:", len(emo_keys))
print("emoji count:", sum(emoji_dict.values()))
print("HASHTAGS: ")
print("unique hashtags:", len(hash_keys))
print("hashtag count:", sum(hashtag_dict.values()))

EMOJIS:
unique emojis: 510
emoji count: 14131
HASHTAGS: 
unique hashtags: 12893
hashtag count: 35400


#####Removal of all emojis and hashtags

In [None]:
#Remove emojis
df = replace_emojis([], emo_keys, df)

In [None]:
#Make sure no hashtag is left in data by mistake
#This is done now and not in the cells above because a space is inserted before every #,
#which could cause problems with emojis/special characters
df['text'] = df['text'].map(lambda text: text.replace("#", " #") if isinstance(text, str) else text)
leftover_hashtags = [
    word
    for text in df['text'] if isinstance(text, str)
    for word in text.split() if word[0] == '#'
]
#dict.fromkeys removes duplicates while keeping first-seen order
hash_keys = list(dict.fromkeys(hash_keys + leftover_hashtags))
for c in [",","!",":",".","¿","|","?","¡","'"]:
  hash_keys = [s.strip(c) for s in hash_keys]
hash_keys = sorted(hash_keys, key=len, reverse=True)

In [None]:
#Remove hashtags
#Work on a plain list and assign the column once; a row-wise DataFrame.apply per hashtag is very slow
texts = df['text'].tolist()
for k in hash_keys:
  texts = [text.replace(k, "") for text in texts]
df['text'] = texts

In [None]:
df.to_csv(removed_path, index=False)  

#####All emojis and hashtags kept

In [None]:
#Replace emojis with text
# NOTE(review): when the notebook is run top to bottom, df has already had its emojis
# deleted and its hashtags removed by the "Removal of all emojis and hashtags" cells above,
# so there is nothing left to replace here. Presumably df must be reloaded and passed
# through cleanTweets_getDicts again before running this cell - confirm.
df = replace_emojis(emo_keys, [], df)

In [None]:
df.to_csv(mantained_path, index=False)  

####Data unification

In [None]:
# Load the five "removed" parts, stack them in order and save the combined file
removed_part_paths = [
    f'/content/drive/MyDrive/TFG/data/final_data/removed_english_data_{part}.csv'
    for part in range(1, 6)
]
df1, df2, df3, df4, df5 = (
    pd.read_csv(path, encoding='utf8', engine='python') for path in removed_part_paths
)

df = pd.concat([df1, df2, df3, df4, df5])

df.to_csv('/content/drive/MyDrive/TFG/data/final_data/removed_english_data.csv', index=False)

In [None]:
# Load the five "filtered" parts, stack them in order and save the combined file
# NOTE(review): no cell shown in this notebook writes filtered_english_data_<n>.csv
# (filtered_path is computed but never used) - confirm where these files come from.
filtered_part_paths = [
    f'/content/drive/MyDrive/TFG/data/final_data/filtered_english_data_{part}.csv'
    for part in range(1, 6)
]
df1, df2, df3, df4, df5 = (
    pd.read_csv(path, encoding='utf8', engine='python') for path in filtered_part_paths
)

df = pd.concat([df1, df2, df3, df4, df5])

df.to_csv('/content/drive/MyDrive/TFG/data/final_data/filtered_english_data.csv', index=False)

In [None]:
# Load the five "mantained" parts, stack them in order and save the combined file
mantained_part_paths = [
    f'/content/drive/MyDrive/TFG/data/final_data/mantained_english_data_{part}.csv'
    for part in range(1, 6)
]
df1, df2, df3, df4, df5 = (
    pd.read_csv(path, encoding='utf8', engine='python') for path in mantained_part_paths
)

df = pd.concat([df1, df2, df3, df4, df5])

df.to_csv('/content/drive/MyDrive/TFG/data/final_data/mantained_english_data.csv', index=False)