# Libraries

In [1]:
!pip install --user clean-text

Collecting clean-text
  Downloading clean_text-0.6.0-py3-none-any.whl (11 kB)
Collecting ftfy<7.0,>=6.0
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 550 kB/s 
Installing collected packages: ftfy, clean-text
Successfully installed clean-text-0.6.0 ftfy-6.1.1


In [2]:
import os
import re
import numpy as np
import pandas as pd
from cleantext import clean

# Files

## Dataset download

In [3]:
# Load the raw tweet exports for both players from the Kaggle input directory.
# NOTE(review): the recorded output of this cell looks like the tail of a pandas
# warning (possibly a mixed-dtype warning) - confirm, and consider passing an
# explicit `dtype=` mapping if so.
dataset_messi = pd.read_csv(
    filepath_or_buffer="/kaggle/input/twitter-sentiment-analysis-and-word-embeddings/messi_tweets.csv",
    encoding="utf-8",
)
dataset_ronaldo = pd.read_csv(
    filepath_or_buffer="/kaggle/input/twitter-sentiment-analysis-and-word-embeddings/ronaldo_tweets.csv",
    encoding="utf-8",
)

  exec(code_obj, self.user_global_ns, self.user_ns)


# Show some examples

## Messi

In [4]:
# Preview the first rows of the raw Messi tweets (head() defaults to 5 rows)
dataset_messi.head()

Unnamed: 0,tweet_id,author_id,content,lang,date,source,geo,retweet_count,like_count,quote_count
0,1514503439291543554,1504674335533187078,When did this happened????? #Messi https://t.c...,en,2022-04-14T07:18:52.000Z,Twitter for Android,,1,1,0
1,1514503399462432770,1437183590781964290,Highest Ranking Active Players on My All-Time ...,en,2022-04-14T07:18:43.000Z,Twitter for iPhone,,0,0,0
2,1514503259762958336,819679110,@st3vat @PederHL @sidlowe Don’t get me wrong m...,en,2022-04-14T07:18:09.000Z,Twitter for iPhone,,0,0,0
3,1514503228221702144,1012688452863909891,@DaddyArteta @RonaldoW7_ @goal @PSG_English @T...,en,2022-04-14T07:18:02.000Z,Twitter for Android,,0,0,0
4,1514503144478322688,1309496212345810944,Messi has to pay now 50m to play?? https://t.c...,en,2022-04-14T07:17:42.000Z,Twitter for Android,,0,1,0


## Ronaldo

In [5]:
# Preview the first rows of the raw Ronaldo tweets (head() defaults to 5 rows)
dataset_ronaldo.head()

Unnamed: 0,tweet_id,author_id,content,lang,date,source,geo,retweet_count,like_count,quote_count
0,1514503408820051969,1406215235829182470,"🗣 Dressing room source at Ajax: ""There's bee...",en,2022-04-14T07:18:45.000Z,Twitter Web App,,0.0,0.0,0.0
1,1514503399462432770,1437183590781964290,Highest Ranking Active Players on My All-Time ...,en,2022-04-14T07:18:43.000Z,Twitter for iPhone,,0.0,0.0,0.0
2,1514503356764418051,1215836950931705856,@RonaIdoProp @theutdjournal @lauriewhitwell @T...,en,2022-04-14T07:18:33.000Z,Twitter for Android,,0.0,0.0,0.0
3,1514503333112864771,1201535456141168640,Timber + @atchouameni + @Darwinn99 That's it ...,en,2022-04-14T07:18:27.000Z,Twitter for Android,,0.0,0.0,0.0
4,1514503191563579394,1080591523715129344,A quick reminder that Cristiano Ronaldo is the...,en,2022-04-14T07:17:53.000Z,Twitter for iPhone,,0.0,0.0,0.0


# Data cleaning

## Messi

### Check for null values

In [6]:
# Number of missing values in each column of the Messi dataset
dataset_messi.isna().sum(axis=0)

tweet_id             0
author_id            0
content              0
lang                 0
date                 0
source               0
geo              73511
retweet_count        0
like_count           0
quote_count          0
dtype: int64

### Drop duplicates
Based on tweet ID

In [7]:
# De-duplicate on tweet_id (first occurrence wins) and report the row count
# on either side of the operation.
rows_before = dataset_messi.shape[0]
print(f"Row count before deletion: {rows_before} ")
dataset_messi = dataset_messi.drop_duplicates(subset=["tweet_id"], keep="first")
rows_after = dataset_messi.shape[0]
print(f"Row count after deletion: {rows_after}")

Row count before deletion: 74049 
Row count after deletion: 74049


#### Percentage of null geo values 

In [8]:
# Total number of rows in the Messi dataset
count = dataset_messi.shape[0]
print(f"Number of values : {count}")

Number of values : 74049


In [9]:
# Share of rows whose geo field is missing, expressed as a percentage of `count`
count_null = dataset_messi["geo"].isna().sum()
percentage = 100 * count_null / count
print(f"Percentage of null geo values {round(percentage,2)}%")

Percentage of null geo values 99.27%


### Assign the -1 id to null geo values

In [10]:
# Replace missing geo ids with the placeholder -1. fillna is the vectorized
# built-in for this and avoids calling a Python lambda once per row.
dataset_messi.geo = dataset_messi.geo.fillna(-1)

#### Display results

In [11]:
# First rows of the geo column after the placeholder substitution
dataset_messi.geo.head()

0    -1
1    -1
2    -1
3    -1
4    -1
Name: geo, dtype: object

In [12]:
# Preview the first five distinct values found in the geo column
geo_id_preview = dataset_messi["geo"].unique()[:5]
print(f"Available locations ids: {geo_id_preview}")

Available locations ids: [-1 '01c1521c4381a164' '0e587c59401d0a27' '2c614b2aca285e79'
 '317fcc4b21a604d5']


### Clean tweets : remove mentions, extra spaces and links

#### Utility function

In [13]:
def clean_tweet(text):
    """
    Normalize a piece of text with ``cleantext.clean``.

    The call switches on lowercasing, punctuation removal, emoji removal and
    whitespace normalization; every other option of ``clean`` is left at the
    library's default.

    Parameters
    ----------
    text : the text to clean; it is passed to ``clean`` unchanged.

    Returns
    -------
    The value returned by ``clean`` for these options.
    """
    cleaning_options = {
        "lower": True,
        "no_punct": True,
        "no_emoji": True,
        "normalize_whitespace": True,
    }
    return clean(text, **cleaning_options)

#### Regex

In [14]:
# Pattern for @-mentions: "@" followed by one or more ASCII letters, digits
# or underscores.
regex_mentions = r"@[A-Za-z0-9_]+"
# Pattern for http/https links. Only ASCII letters, digits, "." and "/" are
# consumed after the scheme, so a URL containing other characters (e.g. "-",
# "?", "=") is matched only up to the first such character.
regex_links = r"https?://[A-Za-z0-9./]+"
# Pattern for any run of characters that are not ASCII letters or digits
# (punctuation, whitespace, emojis, accented letters, ...).
regex_special = r"[^A-Za-z0-9]+"

### Clean tweets : remove mentions, links and extra spaces

In [15]:
# Remove mentions
dataset_messi.content = dataset_messi.content.apply(lambda x: re.sub(regex_mentions, " ", str(x).strip()))
# Remove links 
dataset_messi.content = dataset_messi.content.apply(lambda x: re.sub(regex_links, " ", str(x).strip()))
# Remove special characters
dataset_messi.content = dataset_messi.content.apply(lambda x: re.sub(regex_special, " ", str(x).strip()))
# Clean tweets
dataset_messi.content = dataset_messi.content.apply(lambda x: clean_tweet(x)) 

#### Display results

In [16]:
# Preview the first rows after cleaning (head() defaults to 5 rows)
dataset_messi.head()

Unnamed: 0,tweet_id,author_id,content,lang,date,source,geo,retweet_count,like_count,quote_count
0,1514503439291543554,1504674335533187078,when did this happened messi,en,2022-04-14T07:18:52.000Z,Twitter for Android,-1,1,1,0
1,1514503399462432770,1437183590781964290,highest ranking active players on my all time ...,en,2022-04-14T07:18:43.000Z,Twitter for iPhone,-1,0,0,0
2,1514503259762958336,819679110,don t get me wrong mate i know he was good but...,en,2022-04-14T07:18:09.000Z,Twitter for iPhone,-1,0,0,0
3,1514503228221702144,1012688452863909891,messi fans have no shame,en,2022-04-14T07:18:02.000Z,Twitter for Android,-1,0,0,0
4,1514503144478322688,1309496212345810944,messi has to pay now 50m to play,en,2022-04-14T07:17:42.000Z,Twitter for Android,-1,0,1,0


### Check for non-english content

In [17]:
# Compare row by row and reduce with .all() so the assertion always receives a
# single boolean. Comparing the array returned by unique() against "en" yields
# a multi-element array as soon as more than one language is present, and
# asserting on that raises ValueError (ambiguous truth value) instead of an
# AssertionError carrying the message below.
assert (dataset_messi["lang"] == "en").all(), "Non-english content detected"

> Test passed

## Ronaldo

### Check for null values

In [18]:
# Number of missing values in each column of the Ronaldo dataset
dataset_ronaldo.isna().sum(axis=0)

tweet_id             0
author_id            0
content              0
lang                 1
date                 2
source               1
geo              73417
retweet_count        1
like_count           2
quote_count          2
dtype: int64

### Drop null values
For columns relevant to our analysis

In [19]:
# Drop rows that are missing any of the columns used in the analysis.
# Reassigning (rather than inplace=True) matches how drop_duplicates is applied
# elsewhere in this notebook and keeps the step chainable.
required_columns = ["lang", "date", "source", "retweet_count", "like_count", "quote_count"]
dataset_ronaldo = dataset_ronaldo.dropna(subset=required_columns)

In [20]:
# Re-check the per-column missing-value counts after dropping incomplete rows
dataset_ronaldo.isna().sum(axis=0)

tweet_id             0
author_id            0
content              0
lang                 0
date                 0
source               0
geo              73416
retweet_count        0
like_count           0
quote_count          0
dtype: int64

### Drop duplicates

In [21]:
# De-duplicate on tweet_id (first occurrence wins) and report the row count
# on either side of the operation.
rows_before = dataset_ronaldo.shape[0]
print(f"Row count before deletion: {rows_before} ")
dataset_ronaldo = dataset_ronaldo.drop_duplicates(subset=["tweet_id"], keep="first")
rows_after = dataset_ronaldo.shape[0]
print(f"Row count after deletion: {rows_after}")

Row count before deletion: 74064 
Row count after deletion: 74064


### Percentage of null geo values

In [22]:
# Total number of rows in the Ronaldo dataset
count = dataset_ronaldo.shape[0]
print(f"Number of values: {count}")

Number of values: 74064


In [23]:
# Share of rows whose geo field is missing, expressed as a percentage of `count`
count_null = dataset_ronaldo["geo"].isna().sum()
percentage = 100 * count_null / count
print(f"Percentage of null geo values: {round(percentage,2)}%")

Percentage of null geo values: 99.13%


### Assign the -1 id to null geo values

In [24]:
# Replace missing geo ids with the placeholder -1. fillna is the vectorized
# built-in for this and avoids calling a Python lambda once per row.
dataset_ronaldo.geo = dataset_ronaldo.geo.fillna(-1)

#### Display results

In [25]:
# First rows of the geo column after the placeholder substitution
dataset_ronaldo.geo.head()

0    -1
1    -1
2    -1
3    -1
4    -1
Name: geo, dtype: object

In [26]:
# Preview the first five distinct values found in the geo column
geo_id_preview = dataset_ronaldo["geo"].unique()[:5]
print(f"Avaialble location ids: {geo_id_preview}")

Avaialble location ids: [-1 '178a87b8e2eaa375' '00e1c334666793a2' '02b5b310e3979394'
 '11980e22bad50000']


### Clean tweets : remove mentions, links and extra spaces

In [27]:
# Remove mentions
dataset_ronaldo.content = dataset_ronaldo.content.apply(lambda x: re.sub(regex_mentions, " ", str(x).strip()))
# Remove links
dataset_ronaldo.content = dataset_ronaldo.content.apply(lambda x: re.sub(regex_links, " ", str(x).strip()))
# Remove special characters
dataset_ronaldo.content = dataset_ronaldo.content.apply(lambda x: re.sub(regex_special, " ", str(x).strip()))
# Clean tweets
dataset_ronaldo.content = dataset_ronaldo.content.apply(lambda x: clean_tweet(x)) 

#### Display results

In [28]:
# Preview the first rows after cleaning (head() defaults to 5 rows)
dataset_ronaldo.head()

Unnamed: 0,tweet_id,author_id,content,lang,date,source,geo,retweet_count,like_count,quote_count
0,1514503408820051969,1406215235829182470,dressing room source at ajax there s been crit...,en,2022-04-14T07:18:45.000Z,Twitter Web App,-1,0.0,0.0,0.0
1,1514503399462432770,1437183590781964290,highest ranking active players on my all time ...,en,2022-04-14T07:18:43.000Z,Twitter for iPhone,-1,0.0,0.0,0.0
2,1514503356764418051,1215836950931705856,i personally feel that cdm is the most importa...,en,2022-04-14T07:18:33.000Z,Twitter for Android,-1,0.0,0.0,0.0
3,1514503333112864771,1201535456141168640,timber that s it these are the three essential...,en,2022-04-14T07:18:27.000Z,Twitter for Android,-1,0.0,0.0,0.0
4,1514503191563579394,1080591523715129344,a quick reminder that cristiano ronaldo is the...,en,2022-04-14T07:17:53.000Z,Twitter for iPhone,-1,0.0,0.0,0.0


### Check for non-english content

In [29]:
# Compare row by row and reduce with .all() so the assertion always receives a
# single boolean. Comparing the array returned by unique() against "en" yields
# a multi-element array as soon as more than one language is present, and
# asserting on that raises ValueError (ambiguous truth value) instead of an
# AssertionError carrying the message below.
assert (dataset_ronaldo["lang"] == "en").all(), "Non-english content detected"

```
Test passed
```

# Save the cleaned datasets

## Messi 

In [30]:
# Write the cleaned Messi tweets to the working directory, without the index column
dataset_messi.to_csv(path_or_buf="Cleaned_messi_tweets.csv", index=False)

## Ronaldo

In [31]:
# Write the cleaned Ronaldo tweets to the working directory, without the index column
dataset_ronaldo.to_csv(path_or_buf="Cleaned_ronaldo_tweets.csv", index=False)