# Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import re

# Files

## Dataset donwload

In [2]:
dataset_messi = pd.read_csv("/kaggle/input/twitter-sentiment-analysis-and-word-embeddings/messi_tweets.csv", encoding="utf-8")
dataset_ronaldo = pd.read_csv("/kaggle/input/twitter-sentiment-analysis-and-word-embeddings/ronaldo_tweets.csv", encoding="utf-8")

## Show some examples

## Messi

In [3]:
dataset_messi.head(5)

Unnamed: 0,tweet_id,author_id,content,lang,date,source,geo,retweet_count,like_count,quote_count
0,1442143079373365248,1302375645666848768,@moh19si_ Who needs Messi ?,en,2021-09-26T15:04:38.000Z,Twitter for iPhone,,0,0,0
1,1442143071995457542,116969561,RT @Forbes: Lionel Messi ranked No. 2 in this ...,en,2021-09-26T15:04:36.000Z,Twitter for Android,,11,0,0
2,1442143058594652160,1296470104486612997,RT @ESPNFC: Pele congratulates Messi for break...,en,2021-09-26T15:04:33.000Z,Twitter Web App,,637,0,0
3,1442143054836678656,4083107893,RT @CrewsMat10: PSG have lost 0 (ZERO) games s...,en,2021-09-26T15:04:32.000Z,Twitter for Android,,510,0,0
4,1442143052265664514,1248782697029406720,RT @ESPNFC: Pele congratulates Messi for break...,en,2021-09-26T15:04:32.000Z,Twitter for Android,,637,0,0


## Ronaldo

In [4]:
dataset_ronaldo.head(5)

Unnamed: 0,tweet_id,author_id,content,lang,date,source,geo,retweet_count,like_count,quote_count
0,1442216040629514241,1395557892779122694,@AdusLP ronaldo was offside,en,2021-09-26T19:54:33.000Z,Twitter Web App,,0,1,0
1,1442216031792074755,1176842712420630530,RT @Football__Tweet: 👽 The most effective trio...,en,2021-09-26T19:54:31.000Z,Twitter for Android,,18,0,0
2,1442216026456748036,2385664598,Ronaldo JR the only footballer to score a goal...,en,2021-09-26T19:54:30.000Z,Twitter for iPhone,,0,0,0
3,1442216002197016578,1142495555790299138,RT @EdmundOris: Perfect weekend for Barca fans...,en,2021-09-26T19:54:24.000Z,Twitter for iPhone,,571,0,0
4,1442215992344596488,793153149066481664,RT @EfoEtornam: No way Ansu Fati has more goal...,en,2021-09-26T19:54:22.000Z,Twitter for iPhone,,106,0,0


# Data cleaning

## Messi

### Check for null values

In [5]:
dataset_messi.isna().sum()

tweet_id             0
author_id            0
content              0
lang                 0
date                 0
source               0
geo              37003
retweet_count        0
like_count           0
quote_count          0
dtype: int64

#### Percentage of null geo values 

In [6]:
count = len(dataset_messi.index)
print(f"Number of values : {count}")

Number of values : 37097


In [7]:
count_null = dataset_messi.isna().sum()["geo"]
percentage = count_null*100/count
print(f"Percentage of null geo values {round(percentage,2)}%")

Percentage of null geo values 99.75%


### Assign the -1 id to null geo values

In [8]:
dataset_messi.geo = dataset_messi.geo.apply(lambda x: -1 if pd.isna(x) else x)

#### Display results

In [9]:
dataset_messi["geo"].head(5)

0    -1
1    -1
2    -1
3    -1
4    -1
Name: geo, dtype: object

In [10]:
dataset_messi["geo"].unique()[0:5]

array([-1, '1744381d2805a216', '5446838227498f51', '6a6d896ba1cb5dc4',
       '5ccdc4a6f900a93f'], dtype=object)

### Clean tweets : remove mentions, extra spaces and links

#### Remove emojis

In [11]:
def to_ascii(text):
    return text.encode("ascii","ignore").decode("ascii")

#### Regex

In [12]:
regex1 = r'@[A-Za-z0-9]+'
regex2 = r'https?://[A-Za-z0-9./]+'
regex = r'|'.join((regex1, regex2))

#### Applying the regex

In [13]:
dataset_messi.content = dataset_messi.content.apply(lambda x: re.sub(regex, ' ', str(x).lower()).strip())
dataset_messi.content = dataset_messi.content.apply(lambda x: x.replace("rt","",1)) # Remove the "RT" tag

In [14]:
dataset_messi.content = dataset_messi.content.apply(lambda x: x.replace("_","")) # Remove the "_" 
dataset_messi.content = dataset_messi.content.apply(lambda x: x.replace(":","")) # Remove the ":" 

In [15]:
dataset_messi.content = dataset_messi.content.apply(lambda x: to_ascii(x)) # Remove the emojis 

#### Display results

In [16]:
dataset_messi.head(5)

Unnamed: 0,tweet_id,author_id,content,lang,date,source,geo,retweet_count,like_count,quote_count
0,1442143079373365248,1302375645666848768,who needs messi ?,en,2021-09-26T15:04:38.000Z,Twitter for iPhone,-1,0,0,0
1,1442143071995457542,116969561,lionel messi ranked no. 2 in this year's li...,en,2021-09-26T15:04:36.000Z,Twitter for Android,-1,11,0,0
2,1442143058594652160,1296470104486612997,pele congratulates messi for breaking his r...,en,2021-09-26T15:04:33.000Z,Twitter Web App,-1,637,0,0
3,1442143054836678656,4083107893,psg have lost 0 (zero) games since they sig...,en,2021-09-26T15:04:32.000Z,Twitter for Android,-1,510,0,0
4,1442143052265664514,1248782697029406720,pele congratulates messi for breaking his r...,en,2021-09-26T15:04:32.000Z,Twitter for Android,-1,637,0,0


### Check for non-english content

In [17]:
assert dataset_messi["lang"].unique()=="en", "Non-english content detected"

> Test passed

## Ronaldo

### Check for null values

In [18]:
dataset_ronaldo.isna().sum()

tweet_id             0
author_id            0
content              0
lang                 0
date                 0
source               0
geo              36879
retweet_count        0
like_count           0
quote_count          0
dtype: int64

### Percentage of null geo values

In [19]:
count = len(dataset_ronaldo.index)
print(f"Number of values: {count}")

Number of values: 37084


In [20]:
count_null = dataset_ronaldo.isna().sum()["geo"]
percentage = count_null * 100/count
print(f"Percentage of null geo values: {round(percentage,2)}%")

Percentage of null geo values: 99.45%


### Assign the -1 id to null geo values

In [21]:
dataset_ronaldo.geo = dataset_ronaldo.geo.apply(lambda x: -1 if pd.isna(x) else x)

#### Display results

In [22]:
dataset_ronaldo["geo"].head(5)

0    -1
1    -1
2    -1
3    -1
4    -1
Name: geo, dtype: object

In [23]:
dataset_ronaldo["geo"].unique()[0:5]

array([-1, '00611f1548c7bdef', 'd91084e084c8fc53', '00817d75276a95cf',
       'ade6e91af3b145c7'], dtype=object)

### Clean tweets : remove mentions, links and extra spaces

#### Regex

#### Applying the regex

In [24]:
dataset_ronaldo.content = dataset_ronaldo.content.apply(lambda x: re.sub(regex, ' ', str(x).lower()).strip())
dataset_ronaldo.content = dataset_ronaldo.content.apply(lambda x: x.replace("rt", "", 1)) # Remove the "RT" tag

In [25]:
dataset_ronaldo.content = dataset_ronaldo.content.apply(lambda x: x.replace("_","")) # Remove the "_" 
dataset_ronaldo.content = dataset_ronaldo.content.apply(lambda x: x.replace(":","")) # Remove the ":" 

In [26]:
dataset_ronaldo.content = dataset_ronaldo.content.apply(lambda x: to_ascii(x)) # Remove the emojis 

#### Display results

In [27]:
dataset_ronaldo.head(5)

Unnamed: 0,tweet_id,author_id,content,lang,date,source,geo,retweet_count,like_count,quote_count
0,1442216040629514241,1395557892779122694,ronaldo was offside,en,2021-09-26T19:54:33.000Z,Twitter Web App,-1,0,1,0
1,1442216031792074755,1176842712420630530,tweet the most effective trios over a seaso...,en,2021-09-26T19:54:31.000Z,Twitter for Android,-1,18,0,0
2,1442216026456748036,2385664598,ronaldo jr the only footballer to score a goal...,en,2021-09-26T19:54:30.000Z,Twitter for iPhone,-1,0,0,0
3,1442216002197016578,1142495555790299138,perfect weekend for barca fans and for me p...,en,2021-09-26T19:54:24.000Z,Twitter for iPhone,-1,571,0,0
4,1442215992344596488,793153149066481664,no way ansu fati has more goals outside the...,en,2021-09-26T19:54:22.000Z,Twitter for iPhone,-1,106,0,0


### Check for non-english content

In [28]:
assert dataset_ronaldo["lang"].unique()=="en", "Non-english content detected"

```
Test passed
```

# Save the cleaned datasets

## Messi 

In [29]:
dataset_messi.to_csv("Cleaned_messi_tweets.csv", index=False)

## Ronaldo

In [30]:
dataset_ronaldo.to_csv("Cleaned_ronaldo_tweets.csv",index=False)